org.apache.beam.sdk.io.FileIO Java Examples
The following examples show how to use
org.apache.beam.sdk.io.FileIO.
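Before the examples, here is a minimal, self-contained sketch of the two FileIO patterns most of them share: expanding a filepattern into matches and reading the matched files, and writing a PCollection through a sink. It is illustrative only; the bucket paths, prefix, and class name are placeholders rather than values taken from any example below.

// Minimal sketch of the core FileIO read and write patterns (placeholder paths).
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.Compression;
import org.apache.beam.sdk.io.FileIO;
import org.apache.beam.sdk.io.TextIO;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.values.PCollection;

public class FileIOSketch {
  public static void main(String[] args) {
    Pipeline p = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());

    // Read: match a filepattern, open each matched file, then parse the files as text.
    PCollection<String> lines =
        p.apply(FileIO.match().filepattern("gs://my-bucket/input/*.txt"))
            .apply(FileIO.readMatches().withCompression(Compression.AUTO))
            .apply(TextIO.readFiles());

    // Write: hand each element to a sink and control the output location and file naming.
    lines.apply(
        FileIO.<String>write()
            .via(TextIO.sink())
            .to("gs://my-bucket/output/")
            .withPrefix("part")
            .withSuffix(".txt"));

    p.run().waitUntilFinish();
  }
}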
Example #1
Source File: FileBasedDeadLetterQueueReconsumerTest.java From DataflowTemplates with Apache License 2.0
@Test
public void testAllFilesAreConsumed() throws IOException {
  TestStream<String> inputFiles =
      TestStream.create(StringUtf8Coder.of())
          .addElements(
              createJsonFile("dlqFile1.json", JSON_FILE_CONTENTS_1),
              createJsonFile("dlqFile2.json", JSON_FILE_CONTENTS_1))
          .addElements(createJsonFile("dlqFile3.json", JSON_FILE_CONTENTS_1))
          .advanceWatermarkToInfinity();

  PCollection<String> jsonData =
      p.apply(inputFiles)
          .apply(FileIO.matchAll())
          .apply(FileBasedDeadLetterQueueReconsumer.moveAndConsumeMatches());

  PAssert.that(jsonData)
      .containsInAnyOrder(
          Stream.of(JSON_FILE_CONTENTS_1)
              .flatMap(line -> Stream.of(line, line, line))
              .collect(Collectors.toList()));

  p.run().waitUntilFinish();
}
Example #2
Source File: TestExpansionService.java From beam with Apache License 2.0
@Override
public PTransform<PCollection<GenericRecord>, PCollection<String>> buildExternal(
    StringConfiguration configuration) {
  return new PTransform<PCollection<GenericRecord>, PCollection<String>>() {
    @Override
    public PCollection<String> expand(PCollection<GenericRecord> input) {
      return input
          .apply(
              FileIO.<GenericRecord>write()
                  .via(ParquetIO.sink(schema))
                  .to(configuration.data))
          .getPerDestinationOutputFilenames()
          .apply(Values.create());
    }
  };
}
Example #3
Source File: MyBeamJob.java From hazelcast-jet-demos with Apache License 2.0
public static Pipeline build(PipelineOptions pipelineOptions) {
  Pipeline pipeline = Pipeline.create(pipelineOptions);

  pipeline
      .apply("unbounded-source", Read.from(new MyUnboundedSource("beam-input")))
      .apply("reformat-and-timestamp", ParDo.of(new MyEnrichAndReformatFn()))
      .apply("window",
          Window.<String>into(FixedWindows.of(ONE_SECOND))
              .triggering(Repeatedly.forever(AfterProcessingTime.pastFirstElementInPane()))
              .discardingFiredPanes()
              .withAllowedLateness(ONE_SECOND))
      .apply("sink",
          FileIO.<String>write()
              .via(TextIO.sink())
              .to(".")
              .withPrefix("beam-output")
              .withNumShards(1));

  return pipeline;
}
Example #4
Source File: AvroTableFileAsMutationsTest.java From DataflowTemplates with Apache License 2.0
private PCollection<FileShard> runFileShardingPipeline(Metadata fileMetadata, int splitSize) {
  PCollectionView<Map<String, String>> filenamesToTableNamesMapView =
      p.apply(
              "Create File/Table names Map",
              Create.of(
                  ImmutableMap.<String, String>of(
                      fileMetadata.resourceId().toString(), "testtable")))
          .apply(View.asMap());

  return p.apply("Create Metadata", Create.of(fileMetadata))
      .apply(FileIO.readMatches()) // PCollection<FileIO.ReadableFile>
      .apply(
          "Split into ranges",
          ParDo.of(new SplitIntoRangesFn(splitSize, filenamesToTableNamesMapView))
              .withSideInputs(filenamesToTableNamesMapView))
      .setCoder(FileShard.Coder.of());
}
Example #5
Source File: XmlIO.java From beam with Apache License 2.0
@Override
public PDone expand(PCollection<T> input) {
  checkArgument(getRecordClass() != null, "withRecordClass() is required");
  checkArgument(getRootElement() != null, "withRootElement() is required");
  checkArgument(getFilenamePrefix() != null, "to() is required");
  checkArgument(getCharset() != null, "withCharset() is required");
  try {
    JAXBContext.newInstance(getRecordClass());
  } catch (JAXBException e) {
    throw new RuntimeException("Error binding classes to a JAXB Context.", e);
  }

  ResourceId prefix =
      FileSystems.matchNewResource(getFilenamePrefix(), false /* isDirectory */);
  input.apply(
      FileIO.<T>write()
          .via(
              sink(getRecordClass())
                  .withCharset(Charset.forName(getCharset()))
                  .withRootElement(getRootElement()))
          .to(prefix.getCurrentDirectory().toString())
          .withPrefix(prefix.getFilename())
          .withSuffix(".xml")
          .withIgnoreWindowing());
  return PDone.in(input.getPipeline());
}
Example #6
Source File: TikaIOTest.java From beam with Apache License 2.0
@Test
public void testParseAndParseFiles() throws IOException {
  Path root =
      Paths.get(getClass().getResource("/valid/apache-beam-tika.odt").getPath()).getParent();

  List<ParseResult> expected =
      Arrays.asList(
          ParseResult.success(
              root.resolve("apache-beam-tika.odt").toString(), ODT_FILE, getOdtMetadata()),
          ParseResult.success(root.resolve("apache-beam-tika-pdf.zip").toString(), PDF_ZIP_FILE));

  PCollection<ParseResult> parse =
      p.apply("Parse", TikaIO.parse().filepattern(root.resolve("*").toString()))
          .apply("FilterParse", ParDo.of(new FilterMetadataFn()));
  PAssert.that(parse).containsInAnyOrder(expected);

  PCollection<ParseResult> parseFiles =
      p.apply("ParseFiles", FileIO.match().filepattern(root.resolve("*").toString()))
          .apply(FileIO.readMatches().withCompression(Compression.UNCOMPRESSED))
          .apply(TikaIO.parseFiles())
          .apply("FilterParseFiles", ParDo.of(new FilterMetadataFn()));
  PAssert.that(parseFiles).containsInAnyOrder(expected);

  p.run();
}
Example #7
Source File: ThriftIO.java From beam with Apache License 2.0
@ProcessElement
public void processElement(@Element FileIO.ReadableFile file, OutputReceiver<T> out) {
  try {
    InputStream inputStream = Channels.newInputStream(file.open());
    TIOStreamTransport streamTransport =
        new TIOStreamTransport(new BufferedInputStream(inputStream));
    AutoExpandingBufferReadTransport readTransport =
        new AutoExpandingBufferReadTransport(262_144_000);
    readTransport.fill(streamTransport, inputStream.available());

    TProtocol protocol = tProtocol.getProtocol(readTransport);

    while (protocol.getTransport().getBytesRemainingInBuffer() > 0) {
      TBase<?, ?> tb = (TBase<?, ?>) tBaseType.getDeclaredConstructor().newInstance();
      tb.read(protocol);
      out.output((T) tb);
    }
  } catch (Exception ioe) {
    String filename = file.getMetadata().resourceId().toString();
    LOG.error(String.format("Error in reading file: %1$s%n%2$s", filename, ioe));
    throw new RuntimeException(ioe);
  }
}
Example #8
Source File: ThriftIOTest.java From beam with Apache License 2.0
/** Tests {@link ThriftIO#readFiles(Class)} with {@link TBinaryProtocol}. */
@Test
public void testReadFilesBinaryProtocol() {
  PCollection<TestThriftStruct> testThriftDoc =
      mainPipeline
          .apply(Create.of(THRIFT_DIR + "data").withCoder(StringUtf8Coder.of()))
          .apply(FileIO.matchAll())
          .apply(FileIO.readMatches())
          .apply(ThriftIO.readFiles(TestThriftStruct.class).withProtocol(tBinaryProtoFactory));

  // Assert
  PAssert.that(testThriftDoc).containsInAnyOrder(TEST_THRIFT_STRUCT);

  // Execute pipeline
  mainPipeline.run();
}
Example #9
Source File: CsvConverters.java From DataflowTemplates with Apache License 2.0
@Override
public PCollectionTuple expand(PBegin input) {
  if (hasHeaders()) {
    return input
        .apply("MatchFilePattern", FileIO.match().filepattern(inputFileSpec()))
        .apply("ReadMatches", FileIO.readMatches())
        .apply(
            "ReadCsvWithHeaders",
            ParDo.of(new GetCsvHeadersFn(headerTag(), lineTag(), csvFormat(), delimiter()))
                .withOutputTags(headerTag(), TupleTagList.of(lineTag())));
  }

  return PCollectionTuple.of(
      lineTag(), input.apply("ReadCsvWithoutHeaders", TextIO.read().from(inputFileSpec())));
}
Example #10
Source File: WriteToGCSParquet.java From DataflowTemplates with Apache License 2.0
@Override
public WriteFilesResult<Void> expand(PCollection<KV<String, String>> kafkaRecords) {
  return kafkaRecords
      /*
       * Converting KV<String, String> records to GenericRecord using DoFn and {@link
       * KeyValueToGenericRecordFn} class.
       */
      .apply("Create GenericRecord(s)", ParDo.of(new KeyValueToGenericRecordFn()))
      .setCoder(AvroCoder.of(GenericRecord.class, KeyValueToGenericRecordFn.SCHEMA))
      /*
       * Writing as parquet file using {@link FileIO} and {@link ParquetIO}.
       *
       * The {@link WindowedFilenamePolicy} class specifies the file path for writing the file.
       * The {@link withNumShards} option specifies the number of shards passed by the user.
       */
      .apply(
          "Writing as Parquet",
          FileIO.<GenericRecord>write()
              .via(ParquetIO.sink(KeyValueToGenericRecordFn.SCHEMA))
              .to(outputDirectory())
              .withPrefix(outputFilenamePrefix())
              .withSuffix(
                  WriteToGCSUtility.FILE_SUFFIX_MAP.get(WriteToGCSUtility.FileFormat.PARQUET))
              .withNumShards(numShards()));
}
Example #11
Source File: ParquetIOTest.java From beam with Apache License 2.0
@Test
public void testWriteAndReadUsingReflectDataSchemaWithDataModel() {
  Schema testRecordSchema = ReflectData.get().getSchema(TestRecord.class);

  List<GenericRecord> records = generateGenericRecords(1000);
  mainPipeline
      .apply(Create.of(records).withCoder(AvroCoder.of(testRecordSchema)))
      .apply(
          FileIO.<GenericRecord>write()
              .via(ParquetIO.sink(testRecordSchema))
              .to(temporaryFolder.getRoot().getAbsolutePath()));
  mainPipeline.run().waitUntilFinish();

  PCollection<GenericRecord> readBack =
      readPipeline.apply(
          ParquetIO.read(testRecordSchema)
              .withAvroDataModel(GenericData.get())
              .from(temporaryFolder.getRoot().getAbsolutePath() + "/*"));

  PAssert.that(readBack).containsInAnyOrder(records);
  readPipeline.run().waitUntilFinish();
}
Example #12
Source File: ParquetIOTest.java From beam with Apache License 2.0
@Test(expected = org.apache.beam.sdk.Pipeline.PipelineExecutionException.class)
public void testWriteAndReadUsingReflectDataSchemaWithoutDataModelThrowsException() {
  Schema testRecordSchema = ReflectData.get().getSchema(TestRecord.class);

  List<GenericRecord> records = generateGenericRecords(1000);
  mainPipeline
      .apply(Create.of(records).withCoder(AvroCoder.of(testRecordSchema)))
      .apply(
          FileIO.<GenericRecord>write()
              .via(ParquetIO.sink(testRecordSchema))
              .to(temporaryFolder.getRoot().getAbsolutePath()));
  mainPipeline.run().waitUntilFinish();

  PCollection<GenericRecord> readBack =
      readPipeline.apply(
          ParquetIO.read(testRecordSchema)
              .from(temporaryFolder.getRoot().getAbsolutePath() + "/*"));

  PAssert.that(readBack).containsInAnyOrder(records);
  readPipeline.run().waitUntilFinish();
}
Example #13
Source File: ParquetIO.java From beam with Apache License 2.0
@ProcessElement
public void processElement(ProcessContext processContext) throws Exception {
  FileIO.ReadableFile file = processContext.element();

  if (!file.getMetadata().isReadSeekEfficient()) {
    ResourceId filename = file.getMetadata().resourceId();
    throw new RuntimeException(String.format("File has to be seekable: %s", filename));
  }

  SeekableByteChannel seekableByteChannel = file.openSeekable();

  AvroParquetReader.Builder builder =
      AvroParquetReader.<GenericRecord>builder(new BeamParquetInputFile(seekableByteChannel));
  if (modelClass != null) {
    // all GenericData implementations have a static get method
    builder = builder.withDataModel((GenericData) modelClass.getMethod("get").invoke(null));
  }

  try (ParquetReader<GenericRecord> reader = builder.build()) {
    GenericRecord read;
    while ((read = reader.read()) != null) {
      processContext.output(read);
    }
  }
}
Example #14
Source File: ParquetIOTest.java From beam with Apache License 2.0
@Test
public void testWriteAndRead() {
  List<GenericRecord> records = generateGenericRecords(1000);

  mainPipeline
      .apply(Create.of(records).withCoder(AvroCoder.of(SCHEMA)))
      .apply(
          FileIO.<GenericRecord>write()
              .via(ParquetIO.sink(SCHEMA))
              .to(temporaryFolder.getRoot().getAbsolutePath()));
  mainPipeline.run().waitUntilFinish();

  PCollection<GenericRecord> readBack =
      readPipeline.apply(
          ParquetIO.read(SCHEMA).from(temporaryFolder.getRoot().getAbsolutePath() + "/*"));

  PAssert.that(readBack).containsInAnyOrder(records);
  readPipeline.run().waitUntilFinish();
}
Example #15
Source File: ParquetIOTest.java From beam with Apache License 2.0
@Test
public void testWriteAndReadFiles() {
  List<GenericRecord> records = generateGenericRecords(1000);

  PCollection<GenericRecord> writeThenRead =
      mainPipeline
          .apply(Create.of(records).withCoder(AvroCoder.of(SCHEMA)))
          .apply(
              FileIO.<GenericRecord>write()
                  .via(ParquetIO.sink(SCHEMA))
                  .to(temporaryFolder.getRoot().getAbsolutePath()))
          .getPerDestinationOutputFilenames()
          .apply(Values.create())
          .apply(FileIO.matchAll())
          .apply(FileIO.readMatches())
          .apply(ParquetIO.readFiles(SCHEMA));

  PAssert.that(writeThenRead).containsInAnyOrder(records);

  mainPipeline.run().waitUntilFinish();
}
Example #16
Source File: TextStreamingPipeline.java From dlp-dataflow-deidentification with Apache License 2.0
public static void main(String[] args) throws IOException, GeneralSecurityException {
  TokenizePipelineOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(TokenizePipelineOptions.class);
  Pipeline p = Pipeline.create(options);

  p.apply(
          FileIO.match()
              .filepattern(options.getInputFile())
              .continuously(
                  Duration.standardSeconds(options.getPollingInterval()), Watch.Growth.never()))
      .apply(FileIO.readMatches().withCompression(Compression.UNCOMPRESSED))
      .apply(
          "Text File Reader",
          ParDo.of(
              new TextFileReader(
                  options.as(GcpOptions.class).getProject(),
                  options.getFileDecryptKeyName(),
                  options.getFileDecryptKey(),
                  options.getBatchSize(),
                  options.getCsek(),
                  options.getCsekhash())))
      .apply(
          "Tokenize Data",
          ParDo.of(
              new TokenizeData(
                  options.as(GcpOptions.class).getProject(),
                  options.getDeidentifyTemplateName(),
                  options.getInspectTemplateName())))
      .apply(
          Window.<String>into(FixedWindows.of(Duration.standardMinutes(options.getInterval()))))
      .apply(new WriteOneFilePerWindow(options.getOutputFile(), 1));

  p.run();
}
Example #17
Source File: ParquetIO.java From beam with Apache License 2.0
@Override
public PCollection<GenericRecord> expand(PBegin input) {
  checkNotNull(getFilepattern(), "Filepattern cannot be null.");

  return input
      .apply("Create filepattern", Create.ofProvider(getFilepattern(), StringUtf8Coder.of()))
      .apply(FileIO.matchAll())
      .apply(FileIO.readMatches())
      .apply(readFiles(getSchema()).withAvroDataModel(getAvroDataModel()));
}
Example #18
Source File: TestExpansionService.java From beam with Apache License 2.0
@Override
public PTransform<PBegin, PCollection<GenericRecord>> buildExternal(
    StringConfiguration configuration) {
  return new PTransform<PBegin, PCollection<GenericRecord>>() {
    @Override
    public PCollection<GenericRecord> expand(PBegin input) {
      return input
          .apply(FileIO.match().filepattern(configuration.data))
          .apply(FileIO.readMatches())
          .apply(ParquetIO.readFiles(schema))
          .setCoder(AvroCoder.of(schema));
    }
  };
}
Example #19
Source File: Transforms.java From nomulus with Apache License 2.0
/**
 * Returns a {@link PTransform} from file name patterns to file {@link Metadata Metadata records}.
 */
public static PTransform<PCollection<String>, PCollection<Metadata>> getFilesByPatterns() {
  return new PTransform<PCollection<String>, PCollection<Metadata>>() {
    @Override
    public PCollection<Metadata> expand(PCollection<String> input) {
      return input.apply(FileIO.matchAll().withEmptyMatchTreatment(EmptyMatchTreatment.DISALLOW));
    }
  };
}
Example #20
Source File: XmlIOTest.java From beam with Apache License 2.0
@Test
public void testWriteThenReadLarger() {
  List<Bird> birds = Lists.newArrayList();
  for (int i = 0; i < 100; ++i) {
    birds.add(new Bird("Testing", "Bird number " + i));
  }

  mainPipeline
      .apply(Create.of(birds))
      .apply(
          FileIO.<Bird>write()
              .via(XmlIO.sink(Bird.class).withRootElement("birds"))
              .to(tmpFolder.getRoot().getAbsolutePath())
              .withPrefix("birds")
              .withSuffix(".xml")
              .withNumShards(1));
  mainPipeline.run();

  PCollection<Bird> readBack =
      readPipeline.apply(
          XmlIO.<Bird>read()
              .from(new File(tmpFolder.getRoot(), "birds").getAbsolutePath() + "*")
              .withRecordClass(Bird.class)
              .withRootElement("birds")
              .withRecordElement("bird")
              .withMinBundleSize(100));

  PAssert.that(readBack).containsInAnyOrder(birds);
  readPipeline.run();
}
Example #21
Source File: TikaIO.java From beam with Apache License 2.0
@Override
public PCollection<ParseResult> expand(PBegin input) {
  return input
      .apply(FileIO.match().filepattern(getFilepattern()))
      .apply(FileIO.readMatches().withCompression(Compression.UNCOMPRESSED))
      .apply(parseFiles());
}
Example #22
Source File: ThriftIO.java From beam with Apache License 2.0
@Override
public PCollection<T> expand(PCollection<FileIO.ReadableFile> input) {
  checkNotNull(getRecordClass(), "Record class cannot be null");
  checkNotNull(getTProtocolFactory(), "Thrift protocol cannot be null");

  return input
      .apply(ParDo.of(new ReadFn<>(getRecordClass(), getTProtocolFactory())))
      .setCoder(ThriftCoder.of(getRecordClass(), getTProtocolFactory()));
}
Example #23
Source File: ThriftIOTest.java From beam with Apache License 2.0
/**
 * Tests {@link ThriftIO#sink(TProtocolFactory)} and {@link ThriftIO#readFiles(Class)} with {@link
 * TBinaryProtocol}.
 */
@Test
public void testReadWriteBinaryProtocol() {
  mainPipeline
      .apply(
          Create.of(testThriftStructs)
              .withCoder(ThriftCoder.of(TestThriftStruct.class, tBinaryProtoFactory)))
      .apply(
          FileIO.<TestThriftStruct>write()
              .via(ThriftIO.sink(tBinaryProtoFactory))
              .to(temporaryFolder.getRoot().getAbsolutePath()));

  // Execute write pipeline
  mainPipeline.run().waitUntilFinish();

  // Read written files
  PCollection<TestThriftStruct> readDocs =
      readPipeline
          .apply(
              Create.of(temporaryFolder.getRoot().getAbsolutePath() + "/*")
                  .withCoder(StringUtf8Coder.of()))
          .apply(FileIO.matchAll())
          .apply(FileIO.readMatches())
          .apply(ThriftIO.readFiles(TestThriftStruct.class).withProtocol(tBinaryProtoFactory));

  // Assert
  PAssert.that(readDocs).containsInAnyOrder(testThriftStructs);

  // Execute read pipeline
  readPipeline.run().waitUntilFinish();
}
Example #24
Source File: ThriftIOTest.java From beam with Apache License 2.0
/**
 * Tests {@link ThriftIO#sink(TProtocolFactory)} and {@link ThriftIO#readFiles(Class)} with {@link
 * TJSONProtocol}.
 */
@Test
public void testReadWriteJsonProtocol() {
  mainPipeline
      .apply(
          Create.of(testThriftStructs)
              .withCoder(ThriftCoder.of(TestThriftStruct.class, tJsonProtocolFactory)))
      .apply(
          FileIO.<TestThriftStruct>write()
              .via(ThriftIO.sink(tJsonProtocolFactory))
              .to(temporaryFolder.getRoot().getAbsolutePath()));

  // Execute write pipeline
  mainPipeline.run().waitUntilFinish();

  // Read written files
  PCollection<TestThriftStruct> readDocs =
      readPipeline
          .apply(
              Create.of(temporaryFolder.getRoot().getAbsolutePath() + "/*")
                  .withCoder(StringUtf8Coder.of()))
          .apply(FileIO.matchAll())
          .apply(FileIO.readMatches())
          .apply(ThriftIO.readFiles(TestThriftStruct.class).withProtocol(tJsonProtocolFactory));

  // Assert
  PAssert.that(readDocs).containsInAnyOrder(testThriftStructs);

  // Execute read pipeline
  readPipeline.run().waitUntilFinish();
}
Example #25
Source File: ThriftIOTest.java From beam with Apache License 2.0
/**
 * Tests {@link ThriftIO#sink(TProtocolFactory)} and {@link ThriftIO#readFiles(Class)} with {@link
 * TCompactProtocol}.
 */
@Test
public void testReadWriteCompactProtocol() {
  mainPipeline
      .apply(
          Create.of(testThriftStructs)
              .withCoder(ThriftCoder.of(TestThriftStruct.class, tCompactProtocolFactory)))
      .apply(
          FileIO.<TestThriftStruct>write()
              .via(ThriftIO.sink(tCompactProtocolFactory))
              .to(temporaryFolder.getRoot().getAbsolutePath()));

  // Execute write pipeline
  mainPipeline.run().waitUntilFinish();

  // Read written files
  PCollection<TestThriftStruct> readDocs =
      readPipeline
          .apply(
              Create.of(temporaryFolder.getRoot().getAbsolutePath() + "/*")
                  .withCoder(StringUtf8Coder.of()))
          .apply(FileIO.matchAll())
          .apply(FileIO.readMatches())
          .apply(
              ThriftIO.readFiles(TestThriftStruct.class).withProtocol(tCompactProtocolFactory));

  // Assert
  PAssert.that(readDocs).containsInAnyOrder(testThriftStructs);

  // Execute read pipeline
  readPipeline.run().waitUntilFinish();
}
Example #26
Source File: Transforms.java From nomulus with Apache License 2.0
/**
 * Returns a {@link PTransform} from file {@link Metadata} to {@link VersionedEntity} using
 * caller-provided {@code transformer}.
 */
static PTransform<PCollection<Metadata>, PCollection<VersionedEntity>> processFiles(
    DoFn<ReadableFile, VersionedEntity> transformer) {
  return new PTransform<PCollection<Metadata>, PCollection<VersionedEntity>>() {
    @Override
    public PCollection<VersionedEntity> expand(PCollection<Metadata> input) {
      return input
          .apply(FileIO.readMatches().withCompression(Compression.UNCOMPRESSED))
          .apply(transformer.getClass().getSimpleName(), ParDo.of(transformer));
      // TODO(weiminyu): reshuffle to enable dynamic work rebalance per beam dev guide
    }
  };
}
Example #27
Source File: SnowflakeIO.java From beam with Apache License 2.0
@Override
public PCollection<T> expand(PBegin input) {
  checkArguments();

  String tmpDirName = makeTmpDirName();
  String stagingBucketDir = String.format("%s/%s/", getStagingBucketName(), tmpDirName);

  PCollection<Void> emptyCollection = input.apply(Create.of((Void) null));

  PCollection<T> output =
      emptyCollection
          .apply(
              ParDo.of(
                  new CopyIntoStageFn(
                      getDataSourceProviderFn(),
                      getQuery(),
                      getTable(),
                      getStorageIntegrationName(),
                      stagingBucketDir,
                      getSnowflakeService())))
          .apply(Reshuffle.viaRandomKey())
          .apply(FileIO.matchAll())
          .apply(FileIO.readMatches())
          .apply(readFiles())
          .apply(ParDo.of(new MapCsvToStringArrayFn()))
          .apply(ParDo.of(new MapStringArrayToUserDataFn<>(getCsvMapper())));

  output.setCoder(getCoder());

  emptyCollection
      .apply(Wait.on(output))
      .apply(ParDo.of(new CleanTmpFilesFromGcsFn(stagingBucketDir)));

  return output;
}
Example #28
Source File: SnowflakeIO.java From beam with Apache License 2.0
private PCollection<String> writeFiles(PCollection<T> input, String stagingBucketDir) {
  PCollection<String> mappedUserData =
      input
          .apply(
              MapElements.via(
                  new SimpleFunction<T, Object[]>() {
                    @Override
                    public Object[] apply(T element) {
                      return getUserDataMapper().mapRow(element);
                    }
                  }))
          .apply("Map Objects array to CSV lines", ParDo.of(new MapObjectsArrayToCsvFn()))
          .setCoder(StringUtf8Coder.of());

  WriteFilesResult filesResult =
      mappedUserData.apply(
          "Write files to specified location",
          FileIO.<String>write()
              .via(TextIO.sink())
              .to(stagingBucketDir)
              .withPrefix(getFileNameTemplate())
              .withSuffix(".csv")
              .withCompression(Compression.GZIP));

  return (PCollection)
      filesResult
          .getPerDestinationOutputFilenames()
          .apply("Parse KV filenames to Strings", Values.<String>create());
}
Example #29
Source File: ParquetIO.java From beam with Apache License 2.0
@Override
public PCollection<GenericRecord> expand(PCollection<FileIO.ReadableFile> input) {
  checkNotNull(getSchema(), "Schema can not be null");
  return input
      .apply(ParDo.of(new ReadFn(getAvroDataModel())))
      .setCoder(AvroCoder.of(getSchema()));
}
Example #30
Source File: ImportTransform.java From DataflowTemplates with Apache License 2.0
@Override
public PCollection<Export> expand(PBegin input) {
  NestedValueProvider<String, String> manifestFile =
      NestedValueProvider.of(importDirectory, s -> GcsUtil.joinPath(s, "spanner-export.json"));

  return input
      .apply("Read manifest", FileIO.match().filepattern(manifestFile))
      .apply(
          "Resource id",
          MapElements.into(TypeDescriptor.of(ResourceId.class))
              .via((MatchResult.Metadata::resourceId)))
      .apply(
          "Read manifest json",
          MapElements.into(TypeDescriptor.of(Export.class))
              .via(ReadExportManifestFile::readManifest));
}