org.apache.beam.sdk.io.FileIO Java Examples
The following examples show how to use
org.apache.beam.sdk.io.FileIO.
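Before the examples, here is a minimal, self-contained sketch of the two FileIO patterns most of them share: expanding a filepattern into matches and reading the matched files, and writing a PCollection through a sink. It is illustrative only; the bucket paths, prefix, and class name are placeholders rather than values taken from any example below.

// Minimal sketch of the core FileIO read and write patterns (placeholder paths).
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.Compression;
import org.apache.beam.sdk.io.FileIO;
import org.apache.beam.sdk.io.TextIO;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.values.PCollection;

public class FileIOSketch {
  public static void main(String[] args) {
    Pipeline p = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());

    // Read: match a filepattern, open each matched file, then parse the files as text.
    PCollection<String> lines =
        p.apply(FileIO.match().filepattern("gs://my-bucket/input/*.txt"))
            .apply(FileIO.readMatches().withCompression(Compression.AUTO))
            .apply(TextIO.readFiles());

    // Write: hand each element to a sink and control the output location and file naming.
    lines.apply(
        FileIO.<String>write()
            .via(TextIO.sink())
            .to("gs://my-bucket/output/")
            .withPrefix("part")
            .withSuffix(".txt"));

    p.run().waitUntilFinish();
  }
}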
Example #1
Source File: FileBasedDeadLetterQueueReconsumerTest.java From DataflowTemplates with Apache License 2.0
@Test
public void testAllFilesAreConsumed() throws IOException {
  TestStream<String> inputFiles =
      TestStream.create(StringUtf8Coder.of())
          .addElements(
              createJsonFile("dlqFile1.json", JSON_FILE_CONTENTS_1),
              createJsonFile("dlqFile2.json", JSON_FILE_CONTENTS_1))
          .addElements(createJsonFile("dlqFile3.json", JSON_FILE_CONTENTS_1))
          .advanceWatermarkToInfinity();

  PCollection<String> jsonData =
      p.apply(inputFiles)
          .apply(FileIO.matchAll())
          .apply(FileBasedDeadLetterQueueReconsumer.moveAndConsumeMatches());

  PAssert.that(jsonData)
      .containsInAnyOrder(
          Stream.of(JSON_FILE_CONTENTS_1)
              .flatMap(line -> Stream.of(line, line, line))
              .collect(Collectors.toList()));

  p.run().waitUntilFinish();
}
Example #2
Source File: TestExpansionService.java From beam with Apache License 2.0
@Override
public PTransform<PCollection<GenericRecord>, PCollection<String>> buildExternal(
    StringConfiguration configuration) {
  return new PTransform<PCollection<GenericRecord>, PCollection<String>>() {
    @Override
    public PCollection<String> expand(PCollection<GenericRecord> input) {
      return input
          .apply(
              FileIO.<GenericRecord>write()
                  .via(ParquetIO.sink(schema))
                  .to(configuration.data))
          .getPerDestinationOutputFilenames()
          .apply(Values.create());
    }
  };
}
Example #3
Source File: MyBeamJob.java From hazelcast-jet-demos with Apache License 2.0
public static Pipeline build(PipelineOptions pipelineOptions) {
  Pipeline pipeline = Pipeline.create(pipelineOptions);

  pipeline
      .apply("unbounded-source", Read.from(new MyUnboundedSource("beam-input")))
      .apply("reformat-and-timestamp", ParDo.of(new MyEnrichAndReformatFn()))
      .apply("window",
          Window.<String>into(FixedWindows.of(ONE_SECOND))
              .triggering(Repeatedly.forever(AfterProcessingTime.pastFirstElementInPane()))
              .discardingFiredPanes()
              .withAllowedLateness(ONE_SECOND))
      .apply("sink",
          FileIO.<String>write()
              .via(TextIO.sink())
              .to(".")
              .withPrefix("beam-output")
              .withNumShards(1));

  return pipeline;
}
Example #4
Source File: AvroTableFileAsMutationsTest.java From DataflowTemplates with Apache License 2.0
private PCollection<FileShard> runFileShardingPipeline(Metadata fileMetadata, int splitSize) {
  PCollectionView<Map<String, String>> filenamesToTableNamesMapView =
      p.apply(
              "Create File/Table names Map",
              Create.of(
                  ImmutableMap.<String, String>of(
                      fileMetadata.resourceId().toString(), "testtable")))
          .apply(View.asMap());

  return p.apply("Create Metadata", Create.of(fileMetadata))
      .apply(FileIO.readMatches()) // PCollection<FileIO.ReadableFile>
      .apply(
          "Split into ranges",
          ParDo.of(new SplitIntoRangesFn(splitSize, filenamesToTableNamesMapView))
              .withSideInputs(filenamesToTableNamesMapView))
      .setCoder(FileShard.Coder.of());
}
Example #5
Source File: XmlIO.java From beam with Apache License 2.0
@Override
public PDone expand(PCollection<T> input) {
  checkArgument(getRecordClass() != null, "withRecordClass() is required");
  checkArgument(getRootElement() != null, "withRootElement() is required");
  checkArgument(getFilenamePrefix() != null, "to() is required");
  checkArgument(getCharset() != null, "withCharset() is required");
  try {
    JAXBContext.newInstance(getRecordClass());
  } catch (JAXBException e) {
    throw new RuntimeException("Error binding classes to a JAXB Context.", e);
  }

  ResourceId prefix =
      FileSystems.matchNewResource(getFilenamePrefix(), false /* isDirectory */);
  input.apply(
      FileIO.<T>write()
          .via(
              sink(getRecordClass())
                  .withCharset(Charset.forName(getCharset()))
                  .withRootElement(getRootElement()))
          .to(prefix.getCurrentDirectory().toString())
          .withPrefix(prefix.getFilename())
          .withSuffix(".xml")
          .withIgnoreWindowing());
  return PDone.in(input.getPipeline());
}
Example #6
Source File: TikaIOTest.java From beam with Apache License 2.0
@Test
public void testParseAndParseFiles() throws IOException {
  Path root =
      Paths.get(getClass().getResource("/valid/apache-beam-tika.odt").getPath()).getParent();

  List<ParseResult> expected =
      Arrays.asList(
          ParseResult.success(
              root.resolve("apache-beam-tika.odt").toString(), ODT_FILE, getOdtMetadata()),
          ParseResult.success(root.resolve("apache-beam-tika-pdf.zip").toString(), PDF_ZIP_FILE));

  PCollection<ParseResult> parse =
      p.apply("Parse", TikaIO.parse().filepattern(root.resolve("*").toString()))
          .apply("FilterParse", ParDo.of(new FilterMetadataFn()));
  PAssert.that(parse).containsInAnyOrder(expected);

  PCollection<ParseResult> parseFiles =
      p.apply("ParseFiles", FileIO.match().filepattern(root.resolve("*").toString()))
          .apply(FileIO.readMatches().withCompression(Compression.UNCOMPRESSED))
          .apply(TikaIO.parseFiles())
          .apply("FilterParseFiles", ParDo.of(new FilterMetadataFn()));
  PAssert.that(parseFiles).containsInAnyOrder(expected);

  p.run();
}
Example #7
Source File: ThriftIO.java From beam with Apache License 2.0
@ProcessElement
public void processElement(@Element FileIO.ReadableFile file, OutputReceiver<T> out) {
  try {
    InputStream inputStream = Channels.newInputStream(file.open());
    TIOStreamTransport streamTransport =
        new TIOStreamTransport(new BufferedInputStream(inputStream));
    AutoExpandingBufferReadTransport readTransport =
        new AutoExpandingBufferReadTransport(262_144_000);
    readTransport.fill(streamTransport, inputStream.available());

    TProtocol protocol = tProtocol.getProtocol(readTransport);

    while (protocol.getTransport().getBytesRemainingInBuffer() > 0) {
      TBase<?, ?> tb = (TBase<?, ?>) tBaseType.getDeclaredConstructor().newInstance();
      tb.read(protocol);
      out.output((T) tb);
    }
  } catch (Exception ioe) {
    String filename = file.getMetadata().resourceId().toString();
    LOG.error(String.format("Error in reading file: %1$s%n%2$s", filename, ioe));
    throw new RuntimeException(ioe);
  }
}
Example #8
Source File: ThriftIOTest.java From beam with Apache License 2.0
/** Tests {@link ThriftIO#readFiles(Class)} with {@link TBinaryProtocol}. */
@Test
public void testReadFilesBinaryProtocol() {
  PCollection<TestThriftStruct> testThriftDoc =
      mainPipeline
          .apply(Create.of(THRIFT_DIR + "data").withCoder(StringUtf8Coder.of()))
          .apply(FileIO.matchAll())
          .apply(FileIO.readMatches())
          .apply(ThriftIO.readFiles(TestThriftStruct.class).withProtocol(tBinaryProtoFactory));

  // Assert
  PAssert.that(testThriftDoc).containsInAnyOrder(TEST_THRIFT_STRUCT);

  // Execute pipeline
  mainPipeline.run();
}
Example #9
Source File: CsvConverters.java From DataflowTemplates with Apache License 2.0
@Override
public PCollectionTuple expand(PBegin input) {
  if (hasHeaders()) {
    return input
        .apply("MatchFilePattern", FileIO.match().filepattern(inputFileSpec()))
        .apply("ReadMatches", FileIO.readMatches())
        .apply(
            "ReadCsvWithHeaders",
            ParDo.of(new GetCsvHeadersFn(headerTag(), lineTag(), csvFormat(), delimiter()))
                .withOutputTags(headerTag(), TupleTagList.of(lineTag())));
  }

  return PCollectionTuple.of(
      lineTag(), input.apply("ReadCsvWithoutHeaders", TextIO.read().from(inputFileSpec())));
}
Example #10
Source File: WriteToGCSParquet.java From DataflowTemplates with Apache License 2.0
@Override
public WriteFilesResult<Void> expand(PCollection<KV<String, String>> kafkaRecords) {
  return kafkaRecords
      /*
       * Converting KV<String, String> records to GenericRecord using DoFn and {@link
       * KeyValueToGenericRecordFn} class.
       */
      .apply("Create GenericRecord(s)", ParDo.of(new KeyValueToGenericRecordFn()))
      .setCoder(AvroCoder.of(GenericRecord.class, KeyValueToGenericRecordFn.SCHEMA))
      /*
       * Writing as parquet file using {@link FileIO} and {@link ParquetIO}.
       *
       * The {@link WindowedFilenamePolicy} class specifies the file path for writing the file.
       * The {@link withNumShards} option specifies the number of shards passed by the user.
       */
      .apply(
          "Writing as Parquet",
          FileIO.<GenericRecord>write()
              .via(ParquetIO.sink(KeyValueToGenericRecordFn.SCHEMA))
              .to(outputDirectory())
              .withPrefix(outputFilenamePrefix())
              .withSuffix(
                  WriteToGCSUtility.FILE_SUFFIX_MAP.get(WriteToGCSUtility.FileFormat.PARQUET))
              .withNumShards(numShards()));
}
Example #11
Source File: ParquetIOTest.java From beam with Apache License 2.0
@Test
public void testWriteAndReadUsingReflectDataSchemaWithDataModel() {
  Schema testRecordSchema = ReflectData.get().getSchema(TestRecord.class);

  List<GenericRecord> records = generateGenericRecords(1000);
  mainPipeline
      .apply(Create.of(records).withCoder(AvroCoder.of(testRecordSchema)))
      .apply(
          FileIO.<GenericRecord>write()
              .via(ParquetIO.sink(testRecordSchema))
              .to(temporaryFolder.getRoot().getAbsolutePath()));
  mainPipeline.run().waitUntilFinish();

  PCollection<GenericRecord> readBack =
      readPipeline.apply(
          ParquetIO.read(testRecordSchema)
              .withAvroDataModel(GenericData.get())
              .from(temporaryFolder.getRoot().getAbsolutePath() + "/*"));

  PAssert.that(readBack).containsInAnyOrder(records);
  readPipeline.run().waitUntilFinish();
}
Example #12
Source File: ParquetIOTest.java From beam with Apache License 2.0
@Test(expected = org.apache.beam.sdk.Pipeline.PipelineExecutionException.class)
public void testWriteAndReadUsingReflectDataSchemaWithoutDataModelThrowsException() {
  Schema testRecordSchema = ReflectData.get().getSchema(TestRecord.class);

  List<GenericRecord> records = generateGenericRecords(1000);
  mainPipeline
      .apply(Create.of(records).withCoder(AvroCoder.of(testRecordSchema)))
      .apply(
          FileIO.<GenericRecord>write()
              .via(ParquetIO.sink(testRecordSchema))
              .to(temporaryFolder.getRoot().getAbsolutePath()));
  mainPipeline.run().waitUntilFinish();

  PCollection<GenericRecord> readBack =
      readPipeline.apply(
          ParquetIO.read(testRecordSchema)
              .from(temporaryFolder.getRoot().getAbsolutePath() + "/*"));

  PAssert.that(readBack).containsInAnyOrder(records);
  readPipeline.run().waitUntilFinish();
}
Example #13
Source File: ParquetIO.java From beam with Apache License 2.0
@ProcessElement
public void processElement(ProcessContext processContext) throws Exception {
  FileIO.ReadableFile file = processContext.element();

  if (!file.getMetadata().isReadSeekEfficient()) {
    ResourceId filename = file.getMetadata().resourceId();
    throw new RuntimeException(String.format("File has to be seekable: %s", filename));
  }

  SeekableByteChannel seekableByteChannel = file.openSeekable();

  AvroParquetReader.Builder builder =
      AvroParquetReader.<GenericRecord>builder(new BeamParquetInputFile(seekableByteChannel));
  if (modelClass != null) {
    // all GenericData implementations have a static get method
    builder = builder.withDataModel((GenericData) modelClass.getMethod("get").invoke(null));
  }

  try (ParquetReader<GenericRecord> reader = builder.build()) {
    GenericRecord read;
    while ((read = reader.read()) != null) {
      processContext.output(read);
    }
  }
}
Example #14
Source File: ParquetIOTest.java From beam with Apache License 2.0
@Test
public void testWriteAndRead() {
  List<GenericRecord> records = generateGenericRecords(1000);

  mainPipeline
      .apply(Create.of(records).withCoder(AvroCoder.of(SCHEMA)))
      .apply(
          FileIO.<GenericRecord>write()
              .via(ParquetIO.sink(SCHEMA))
              .to(temporaryFolder.getRoot().getAbsolutePath()));
  mainPipeline.run().waitUntilFinish();

  PCollection<GenericRecord> readBack =
      readPipeline.apply(
          ParquetIO.read(SCHEMA).from(temporaryFolder.getRoot().getAbsolutePath() + "/*"));

  PAssert.that(readBack).containsInAnyOrder(records);
  readPipeline.run().waitUntilFinish();
}
Example #15
Source File: ParquetIOTest.java From beam with Apache License 2.0
@Test
public void testWriteAndReadFiles() {
  List<GenericRecord> records = generateGenericRecords(1000);

  PCollection<GenericRecord> writeThenRead =
      mainPipeline
          .apply(Create.of(records).withCoder(AvroCoder.of(SCHEMA)))
          .apply(
              FileIO.<GenericRecord>write()
                  .via(ParquetIO.sink(SCHEMA))
                  .to(temporaryFolder.getRoot().getAbsolutePath()))
          .getPerDestinationOutputFilenames()
          .apply(Values.create())
          .apply(FileIO.matchAll())
          .apply(FileIO.readMatches())
          .apply(ParquetIO.readFiles(SCHEMA));

  PAssert.that(writeThenRead).containsInAnyOrder(records);

  mainPipeline.run().waitUntilFinish();
}
Example #16
Source File: TextStreamingPipeline.java From dlp-dataflow-deidentification with Apache License 2.0
public static void main(String[] args) throws IOException, GeneralSecurityException {
  TokenizePipelineOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(TokenizePipelineOptions.class);
  Pipeline p = Pipeline.create(options);

  p.apply(
          FileIO.match()
              .filepattern(options.getInputFile())
              .continuously(
                  Duration.standardSeconds(options.getPollingInterval()), Watch.Growth.never()))
      .apply(FileIO.readMatches().withCompression(Compression.UNCOMPRESSED))
      .apply(
          "Text File Reader",
          ParDo.of(
              new TextFileReader(
                  options.as(GcpOptions.class).getProject(),
                  options.getFileDecryptKeyName(),
                  options.getFileDecryptKey(),
                  options.getBatchSize(),
                  options.getCsek(),
                  options.getCsekhash())))
      .apply(
          "Tokenize Data",
          ParDo.of(
              new TokenizeData(
                  options.as(GcpOptions.class).getProject(),
                  options.getDeidentifyTemplateName(),
                  options.getInspectTemplateName())))
      .apply(
          Window.<String>into(FixedWindows.of(Duration.standardMinutes(options.getInterval()))))
      .apply(new WriteOneFilePerWindow(options.getOutputFile(), 1));

  p.run();
}
Example #17
Source File: ParquetIO.java From beam with Apache License 2.0
@Override
public PCollection<GenericRecord> expand(PBegin input) {
  checkNotNull(getFilepattern(), "Filepattern cannot be null.");

  return input
      .apply("Create filepattern", Create.ofProvider(getFilepattern(), StringUtf8Coder.of()))
      .apply(FileIO.matchAll())
      .apply(FileIO.readMatches())
      .apply(readFiles(getSchema()).withAvroDataModel(getAvroDataModel()));
}
Example #18
Source File: TestExpansionService.java From beam with Apache License 2.0
@Override
public PTransform<PBegin, PCollection<GenericRecord>> buildExternal(
    StringConfiguration configuration) {
  return new PTransform<PBegin, PCollection<GenericRecord>>() {
    @Override
    public PCollection<GenericRecord> expand(PBegin input) {
      return input
          .apply(FileIO.match().filepattern(configuration.data))
          .apply(FileIO.readMatches())
          .apply(ParquetIO.readFiles(schema))
          .setCoder(AvroCoder.of(schema));
    }
  };
}
Example #19
Source File: Transforms.java From nomulus with Apache License 2.0
/**
 * Returns a {@link PTransform} from file name patterns to file {@link Metadata Metadata records}.
 */
public static PTransform<PCollection<String>, PCollection<Metadata>> getFilesByPatterns() {
  return new PTransform<PCollection<String>, PCollection<Metadata>>() {
    @Override
    public PCollection<Metadata> expand(PCollection<String> input) {
      return input.apply(FileIO.matchAll().withEmptyMatchTreatment(EmptyMatchTreatment.DISALLOW));
    }
  };
}
Example #20
Source File: XmlIOTest.java From beam with Apache License 2.0
@Test
public void testWriteThenReadLarger() {
  List<Bird> birds = Lists.newArrayList();
  for (int i = 0; i < 100; ++i) {
    birds.add(new Bird("Testing", "Bird number " + i));
  }

  mainPipeline
      .apply(Create.of(birds))
      .apply(
          FileIO.<Bird>write()
              .via(XmlIO.sink(Bird.class).withRootElement("birds"))
              .to(tmpFolder.getRoot().getAbsolutePath())
              .withPrefix("birds")
              .withSuffix(".xml")
              .withNumShards(1));
  mainPipeline.run();

  PCollection<Bird> readBack =
      readPipeline.apply(
          XmlIO.<Bird>read()
              .from(new File(tmpFolder.getRoot(), "birds").getAbsolutePath() + "*")
              .withRecordClass(Bird.class)
              .withRootElement("birds")
              .withRecordElement("bird")
              .withMinBundleSize(100));

  PAssert.that(readBack).containsInAnyOrder(birds);
  readPipeline.run();
}
Example #21
Source File: TikaIO.java From beam with Apache License 2.0
@Override
public PCollection<ParseResult> expand(PBegin input) {
  return input
      .apply(FileIO.match().filepattern(getFilepattern()))
      .apply(FileIO.readMatches().withCompression(Compression.UNCOMPRESSED))
      .apply(parseFiles());
}
Example #22
Source File: ThriftIO.java From beam with Apache License 2.0
@Override
public PCollection<T> expand(PCollection<FileIO.ReadableFile> input) {
  checkNotNull(getRecordClass(), "Record class cannot be null");
  checkNotNull(getTProtocolFactory(), "Thrift protocol cannot be null");

  return input
      .apply(ParDo.of(new ReadFn<>(getRecordClass(), getTProtocolFactory())))
      .setCoder(ThriftCoder.of(getRecordClass(), getTProtocolFactory()));
}
Example #23
Source File: ThriftIOTest.java From beam with Apache License 2.0
/**
 * Tests {@link ThriftIO#sink(TProtocolFactory)} and {@link ThriftIO#readFiles(Class)} with {@link
 * TBinaryProtocol}.
 */
@Test
public void testReadWriteBinaryProtocol() {
  mainPipeline
      .apply(
          Create.of(testThriftStructs)
              .withCoder(ThriftCoder.of(TestThriftStruct.class, tBinaryProtoFactory)))
      .apply(
          FileIO.<TestThriftStruct>write()
              .via(ThriftIO.sink(tBinaryProtoFactory))
              .to(temporaryFolder.getRoot().getAbsolutePath()));

  // Execute write pipeline
  mainPipeline.run().waitUntilFinish();

  // Read written files
  PCollection<TestThriftStruct> readDocs =
      readPipeline
          .apply(
              Create.of(temporaryFolder.getRoot().getAbsolutePath() + "/*")
                  .withCoder(StringUtf8Coder.of()))
          .apply(FileIO.matchAll())
          .apply(FileIO.readMatches())
          .apply(ThriftIO.readFiles(TestThriftStruct.class).withProtocol(tBinaryProtoFactory));

  // Assert
  PAssert.that(readDocs).containsInAnyOrder(testThriftStructs);

  // Execute read pipeline
  readPipeline.run().waitUntilFinish();
}
Example #24
Source File: ThriftIOTest.java From beam with Apache License 2.0
/**
 * Tests {@link ThriftIO#sink(TProtocolFactory)} and {@link ThriftIO#readFiles(Class)} with {@link
 * TJSONProtocol}.
 */
@Test
public void testReadWriteJsonProtocol() {
  mainPipeline
      .apply(
          Create.of(testThriftStructs)
              .withCoder(ThriftCoder.of(TestThriftStruct.class, tJsonProtocolFactory)))
      .apply(
          FileIO.<TestThriftStruct>write()
              .via(ThriftIO.sink(tJsonProtocolFactory))
              .to(temporaryFolder.getRoot().getAbsolutePath()));

  // Execute write pipeline
  mainPipeline.run().waitUntilFinish();

  // Read written files
  PCollection<TestThriftStruct> readDocs =
      readPipeline
          .apply(
              Create.of(temporaryFolder.getRoot().getAbsolutePath() + "/*")
                  .withCoder(StringUtf8Coder.of()))
          .apply(FileIO.matchAll())
          .apply(FileIO.readMatches())
          .apply(ThriftIO.readFiles(TestThriftStruct.class).withProtocol(tJsonProtocolFactory));

  // Assert
  PAssert.that(readDocs).containsInAnyOrder(testThriftStructs);

  // Execute read pipeline
  readPipeline.run().waitUntilFinish();
}
Example #25
Source File: ThriftIOTest.java From beam with Apache License 2.0
/**
 * Tests {@link ThriftIO#sink(TProtocolFactory)} and {@link ThriftIO#readFiles(Class)} with {@link
 * TCompactProtocol}.
 */
@Test
public void testReadWriteCompactProtocol() {
  mainPipeline
      .apply(
          Create.of(testThriftStructs)
              .withCoder(ThriftCoder.of(TestThriftStruct.class, tCompactProtocolFactory)))
      .apply(
          FileIO.<TestThriftStruct>write()
              .via(ThriftIO.sink(tCompactProtocolFactory))
              .to(temporaryFolder.getRoot().getAbsolutePath()));

  // Execute write pipeline
  mainPipeline.run().waitUntilFinish();

  // Read written files
  PCollection<TestThriftStruct> readDocs =
      readPipeline
          .apply(
              Create.of(temporaryFolder.getRoot().getAbsolutePath() + "/*")
                  .withCoder(StringUtf8Coder.of()))
          .apply(FileIO.matchAll())
          .apply(FileIO.readMatches())
          .apply(
              ThriftIO.readFiles(TestThriftStruct.class).withProtocol(tCompactProtocolFactory));

  // Assert
  PAssert.that(readDocs).containsInAnyOrder(testThriftStructs);

  // Execute read pipeline
  readPipeline.run().waitUntilFinish();
}
Example #26
Source File: Transforms.java From nomulus with Apache License 2.0
/**
 * Returns a {@link PTransform} from file {@link Metadata} to {@link VersionedEntity} using
 * caller-provided {@code transformer}.
 */
static PTransform<PCollection<Metadata>, PCollection<VersionedEntity>> processFiles(
    DoFn<ReadableFile, VersionedEntity> transformer) {
  return new PTransform<PCollection<Metadata>, PCollection<VersionedEntity>>() {
    @Override
    public PCollection<VersionedEntity> expand(PCollection<Metadata> input) {
      return input
          .apply(FileIO.readMatches().withCompression(Compression.UNCOMPRESSED))
          .apply(transformer.getClass().getSimpleName(), ParDo.of(transformer));
      // TODO(weiminyu): reshuffle to enable dynamic work rebalance per beam dev guide
    }
  };
}
Example #27
Source File: SnowflakeIO.java From beam with Apache License 2.0
@Override
public PCollection<T> expand(PBegin input) {
  checkArguments();

  String tmpDirName = makeTmpDirName();
  String stagingBucketDir = String.format("%s/%s/", getStagingBucketName(), tmpDirName);

  PCollection<Void> emptyCollection = input.apply(Create.of((Void) null));

  PCollection<T> output =
      emptyCollection
          .apply(
              ParDo.of(
                  new CopyIntoStageFn(
                      getDataSourceProviderFn(),
                      getQuery(),
                      getTable(),
                      getStorageIntegrationName(),
                      stagingBucketDir,
                      getSnowflakeService())))
          .apply(Reshuffle.viaRandomKey())
          .apply(FileIO.matchAll())
          .apply(FileIO.readMatches())
          .apply(readFiles())
          .apply(ParDo.of(new MapCsvToStringArrayFn()))
          .apply(ParDo.of(new MapStringArrayToUserDataFn<>(getCsvMapper())));

  output.setCoder(getCoder());

  emptyCollection
      .apply(Wait.on(output))
      .apply(ParDo.of(new CleanTmpFilesFromGcsFn(stagingBucketDir)));

  return output;
}
Example #28
Source File: SnowflakeIO.java From beam with Apache License 2.0
private PCollection<String> writeFiles(PCollection<T> input, String stagingBucketDir) {
  PCollection<String> mappedUserData =
      input
          .apply(
              MapElements.via(
                  new SimpleFunction<T, Object[]>() {
                    @Override
                    public Object[] apply(T element) {
                      return getUserDataMapper().mapRow(element);
                    }
                  }))
          .apply("Map Objects array to CSV lines", ParDo.of(new MapObjectsArrayToCsvFn()))
          .setCoder(StringUtf8Coder.of());

  WriteFilesResult filesResult =
      mappedUserData.apply(
          "Write files to specified location",
          FileIO.<String>write()
              .via(TextIO.sink())
              .to(stagingBucketDir)
              .withPrefix(getFileNameTemplate())
              .withSuffix(".csv")
              .withCompression(Compression.GZIP));

  return (PCollection)
      filesResult
          .getPerDestinationOutputFilenames()
          .apply("Parse KV filenames to Strings", Values.<String>create());
}
Example #29
Source File: ParquetIO.java From beam with Apache License 2.0
@Override
public PCollection<GenericRecord> expand(PCollection<FileIO.ReadableFile> input) {
  checkNotNull(getSchema(), "Schema can not be null");
  return input
      .apply(ParDo.of(new ReadFn(getAvroDataModel())))
      .setCoder(AvroCoder.of(getSchema()));
}
Example #30
Source File: ImportTransform.java From DataflowTemplates with Apache License 2.0
@Override
public PCollection<Export> expand(PBegin input) {
  NestedValueProvider<String, String> manifestFile =
      NestedValueProvider.of(importDirectory, s -> GcsUtil.joinPath(s, "spanner-export.json"));

  return input
      .apply("Read manifest", FileIO.match().filepattern(manifestFile))
      .apply(
          "Resource id",
          MapElements.into(TypeDescriptor.of(ResourceId.class))
              .via((MatchResult.Metadata::resourceId)))
      .apply(
          "Read manifest json",
          MapElements.into(TypeDescriptor.of(Export.class))
              .via(ReadExportManifestFile::readManifest));
}