org.apache.beam.sdk.io.fs.EmptyMatchTreatment Java Examples

The following examples show how to use org.apache.beam.sdk.io.fs.EmptyMatchTreatment. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: FileBasedSourceTest.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Checks that reading produces zero records (instead of failing) for a literal path read with
 * {@link EmptyMatchTreatment#ALLOW} and for a wildcard path read with {@link
 * EmptyMatchTreatment#ALLOW_IF_WILDCARD}.
 */
@Test
public void testEmptyFilepatternTreatmentAllow() throws IOException {
  PipelineOptions options = PipelineOptionsFactory.create();

  // Literal (non-glob) path; ALLOW is the treatment under test.
  String literalPath = new File(tempFolder.getRoot(), "doesNotExist").getPath();
  TestFileBasedSource literalSource =
      new TestFileBasedSource(literalPath, EmptyMatchTreatment.ALLOW, 64, null);

  // Glob path; ALLOW_IF_WILDCARD is the treatment under test.
  String wildcardPath = new File(tempFolder.getRoot(), "doesNotExist*").getPath();
  TestFileBasedSource wildcardSource =
      new TestFileBasedSource(wildcardPath, EmptyMatchTreatment.ALLOW_IF_WILDCARD, 64, null);

  assertEquals(0, readFromSource(literalSource, options).size());
  assertEquals(0, readFromSource(wildcardSource, options).size());
}
 
Example #2
Source File: AvroIO.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Builds an {@link AvroSource} for {@code filepattern} using the given empty-match treatment.
 *
 * <p>If {@code readerFactory} is non-null it is installed on the source. The source is then
 * configured with {@code schema} when {@code recordClass} is {@link GenericRecord}, and with
 * {@code recordClass} itself otherwise.
 */
@SuppressWarnings("unchecked")
private static <T> AvroSource<T> createSource(
    ValueProvider<String> filepattern,
    EmptyMatchTreatment emptyMatchTreatment,
    Class<T> recordClass,
    Schema schema,
    @Nullable AvroSource.DatumReaderFactory<T> readerFactory) {
  AvroSource<?> avroSource =
      AvroSource.from(filepattern).withEmptyMatchTreatment(emptyMatchTreatment);
  if (readerFactory != null) {
    avroSource = avroSource.withDatumReaderFactory(readerFactory);
  }

  if (recordClass != GenericRecord.class) {
    return avroSource.withSchema(recordClass);
  }
  // Generic records carry no class-derived schema, so the explicit schema is used; the cast is
  // the reason for the unchecked suppression above.
  return (AvroSource<T>) avroSource.withSchema(schema);
}
 
Example #3
Source File: Transforms.java    From nomulus with Apache License 2.0 5 votes vote down vote up
/**
 * Returns a {@link PTransform} that turns file name patterns into the {@link Metadata Metadata
 * records} of the files they match, using {@link EmptyMatchTreatment#DISALLOW}.
 */
public static PTransform<PCollection<String>, PCollection<Metadata>> getFilesByPatterns() {
  PTransform<PCollection<String>, PCollection<Metadata>> matchByPattern =
      new PTransform<PCollection<String>, PCollection<Metadata>>() {
        @Override
        public PCollection<Metadata> expand(PCollection<String> patterns) {
          return patterns.apply(
              FileIO.matchAll().withEmptyMatchTreatment(EmptyMatchTreatment.DISALLOW));
        }
      };
  return matchByPattern;
}
 
Example #4
Source File: FileIO.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Matches {@code element} with {@link EmptyMatchTreatment#ALLOW} and reports the matched
 * metadata as an incomplete poll result whose timestamp and watermark are both the current
 * instant.
 */
@Override
public Watch.Growth.PollResult<MatchResult.Metadata> apply(String element, Context c)
    throws Exception {
  Instant pollTime = Instant.now();
  MatchResult matched = FileSystems.match(element, EmptyMatchTreatment.ALLOW);
  Watch.Growth.PollResult<MatchResult.Metadata> pending =
      Watch.Growth.PollResult.incomplete(pollTime, matched.metadata());
  return pending.withWatermark(pollTime);
}
 
Example #5
Source File: TextIO.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * A {@link PTransform} that reads from one or more text files and returns a bounded {@link
 * PCollection} containing one element for each line of the input files.
 *
 * <p>The returned transform starts with {@link Compression#AUTO}, the "matches many files" hint
 * turned off, and a match configuration created from {@link EmptyMatchTreatment#DISALLOW}.
 */
public static Read read() {
  MatchConfiguration defaultMatching = MatchConfiguration.create(EmptyMatchTreatment.DISALLOW);
  return new AutoValue_TextIO_Read.Builder()
      .setCompression(Compression.AUTO)
      .setHintMatchesManyFiles(false)
      .setMatchConfiguration(defaultMatching)
      .build();
}
 
Example #6
Source File: AvroSource.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Reads from the given file name or pattern ("glob"). The returned source needs to be further
 * configured by calling {@link #withSchema} to return a type other than {@link GenericRecord}.
 *
 * <p>The source is created with {@link EmptyMatchTreatment#DISALLOW} and {@code
 * DEFAULT_MIN_BUNDLE_SIZE}.
 */
public static AvroSource<GenericRecord> from(ValueProvider<String> fileNameOrPattern) {
  AvroSource<GenericRecord> genericSource =
      new AvroSource<GenericRecord>(
          fileNameOrPattern,
          EmptyMatchTreatment.DISALLOW,
          DEFAULT_MIN_BUNDLE_SIZE,
          // The schema is left null here; it will need to be specified in withSchema.
          readGenericRecordsWithSchema(null, null));
  return genericSource;
}
 
Example #7
Source File: AvroSource.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Constructor for FILEPATTERN mode.
 *
 * <p>Forwards the file name or pattern, the empty-match treatment and the minimum bundle size to
 * the superclass constructor, then records {@code mode} on this instance.
 */
private AvroSource(
    ValueProvider<String> fileNameOrPattern,
    EmptyMatchTreatment emptyMatchTreatment,
    long minBundleSize,
    Mode<T> mode) {
  super(fileNameOrPattern, emptyMatchTreatment, minBundleSize);
  this.mode = mode;
}
 
Example #8
Source File: WriteFilesTest.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Takes the values of the input key/value pairs and applies {@code FileIO.matchAll()} to them
 * with {@link EmptyMatchTreatment#DISALLOW}; the match output itself is discarded and a {@link
 * PDone} in the input's pipeline is returned.
 */
@Override
public PDone expand(PCollection<KV<DestinationT, String>> input) {
  PCollection<String> values = input.apply(Values.create());
  values.apply(FileIO.matchAll().withEmptyMatchTreatment(EmptyMatchTreatment.DISALLOW));
  return PDone.in(input.getPipeline());
}
 
Example #9
Source File: FileBasedSourceTest.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a test source for a file or pattern given as a plain string.
 *
 * <p>The string is wrapped in a {@code StaticValueProvider} and passed, together with the
 * empty-match treatment and minimum bundle size, to the superclass constructor; {@code
 * splitHeader} is stored on this instance.
 */
public TestFileBasedSource(
    String fileOrPattern,
    EmptyMatchTreatment emptyMatchTreatment,
    long minBundleSize,
    String splitHeader) {
  super(StaticValueProvider.of(fileOrPattern), emptyMatchTreatment, minBundleSize);
  this.splitHeader = splitHeader;
}
 
Example #10
Source File: FileBasedSource.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a {@code FileBasedSource} from a file or file pattern specification, together with the
 * strategy to apply when the filepattern does not match any files.
 *
 * <p>The superclass is initialised with the range {@code [0, Long.MAX_VALUE)} arguments and the
 * given minimum bundle size, and the source is put into {@code Mode.FILEPATTERN}.
 */
protected FileBasedSource(
    ValueProvider<String> fileOrPatternSpec,
    EmptyMatchTreatment emptyMatchTreatment,
    long minBundleSize) {
  super(0, Long.MAX_VALUE, minBundleSize);
  this.fileOrPatternSpec = fileOrPatternSpec;
  this.emptyMatchTreatment = emptyMatchTreatment;
  this.mode = Mode.FILEPATTERN;
}
 
Example #11
Source File: FileBasedSourceTest.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Expects a {@link FileNotFoundException} when a literal (non-wildcard) path is read with
 * {@link EmptyMatchTreatment#ALLOW_IF_WILDCARD}.
 */
@Test
public void testEmptyFilepatternTreatmentAllowIfWildcard() throws IOException {
  PipelineOptions options = PipelineOptionsFactory.create();
  String literalPath = new File(tempFolder.getRoot(), "doesNotExist").getPath();
  TestFileBasedSource literalSource =
      new TestFileBasedSource(literalPath, EmptyMatchTreatment.ALLOW_IF_WILDCARD, 64, null);

  thrown.expect(FileNotFoundException.class);
  readFromSource(literalSource, options);
}
 
Example #12
Source File: AvroIO.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a source for the single file or pattern {@code input}, using {@link
 * EmptyMatchTreatment#DISALLOW} and this object's record class, supplied schema and reader
 * factory.
 */
@Override
public FileBasedSource<T> apply(String input) {
  FileBasedSource<T> sourceForInput =
      Read.createSource(
          StaticValueProvider.of(input),
          EmptyMatchTreatment.DISALLOW,
          recordClass,
          schemaSupplier.get(),
          readerFactory);
  return sourceForInput;
}
 
Example #13
Source File: FileIOTest.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Expects the pipeline run to fail with a {@link FileNotFoundException} cause when {@code
 * FileIO.match()} is applied to the {@code <tmp>/*} pattern with {@link
 * EmptyMatchTreatment#DISALLOW}.
 */
@Test
@Category(NeedsRunner.class)
public void testMatchDisallowEmptyExplicit() throws IOException {
  String everythingInTmp = tmpFolder.getRoot().getAbsolutePath() + "/*";
  p.apply(
      FileIO.match()
          .filepattern(everythingInTmp)
          .withEmptyMatchTreatment(EmptyMatchTreatment.DISALLOW));

  thrown.expectCause(isA(FileNotFoundException.class));
  p.run();
}
 
Example #14
Source File: FileIOTest.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Expects the pipeline run to fail with a {@link FileNotFoundException} cause when {@code
 * FileIO.match()} is applied to the non-wildcard path {@code <tmp>/blah} with {@link
 * EmptyMatchTreatment#ALLOW_IF_WILDCARD}.
 */
@Test
@Category(NeedsRunner.class)
public void testMatchDisallowEmptyNonWildcard() throws IOException {
  String literalPath = tmpFolder.getRoot().getAbsolutePath() + "/blah";
  p.apply(
      FileIO.match()
          .filepattern(literalPath)
          .withEmptyMatchTreatment(EmptyMatchTreatment.ALLOW_IF_WILDCARD));

  thrown.expectCause(isA(FileNotFoundException.class));
  p.run();
}
 
Example #15
Source File: AvroIO.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Reads Avro file(s) containing records of an unspecified schema and converts each record to a
 * custom type using {@code parseFn}.
 *
 * <p>The returned transform starts with a match configuration created from {@link
 * EmptyMatchTreatment#DISALLOW} and with the "matches many files" hint turned off.
 */
public static <T> Parse<T> parseGenericRecords(SerializableFunction<GenericRecord, T> parseFn) {
  MatchConfiguration defaultMatching = MatchConfiguration.create(EmptyMatchTreatment.DISALLOW);
  return new AutoValue_AvroIO_Parse.Builder<T>()
      .setMatchConfiguration(defaultMatching)
      .setParseFn(parseFn)
      .setHintMatchesManyFiles(false)
      .build();
}
 
Example #16
Source File: AvroIO.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Reads Avro file(s) containing records of the specified schema.
 *
 * <p>The returned transform uses {@link GenericRecord} as its record class, a match configuration
 * created from {@link EmptyMatchTreatment#DISALLOW}, and has both Beam-schema inference and the
 * "matches many files" hint turned off.
 */
public static Read<GenericRecord> readGenericRecords(Schema schema) {
  MatchConfiguration defaultMatching = MatchConfiguration.create(EmptyMatchTreatment.DISALLOW);
  return new AutoValue_AvroIO_Read.Builder<GenericRecord>()
      .setMatchConfiguration(defaultMatching)
      .setRecordClass(GenericRecord.class)
      .setSchema(schema)
      .setInferBeamSchema(false)
      .setHintMatchesManyFiles(false)
      .build();
}
 
Example #17
Source File: AvroTableFileAsMutations.java    From DataflowTemplates with Apache License 2.0 5 votes vote down vote up
/**
 * Expands key/value pairs (documented in the body as filename to table name) into mutations:
 * the keys are matched as files with {@link EmptyMatchTreatment#DISALLOW}, opened, split into
 * ranges, reshuffled, and finally read.
 */
@Override
public PCollection<Mutation> expand(PCollection<KV<String, String>> filesToTables) {

  // Side input view over the input pairs: Map<filename, tablename>.
  PCollectionView<Map<String, String>> tableNameByFile =
      filesToTables.apply("asView", View.asMap());

  // The keys of the input pairs, i.e. the file names.
  PCollection<String> filenames = filesToTables.apply("Get Filenames", Keys.create());

  // Match each name, open the matches, then split every readable file into shards.
  PCollection<FileShard> shards =
      filenames
          .apply(FileIO.matchAll().withEmptyMatchTreatment(EmptyMatchTreatment.DISALLOW))
          .apply(FileIO.readMatches())
          .apply(
              "Split into ranges",
              ParDo.of(
                      new SplitIntoRangesFn(
                          SplitIntoRangesFn.DEFAULT_BUNDLE_SIZE, tableNameByFile))
                  .withSideInputs(tableNameByFile))
          .setCoder(FileShard.Coder.of());

  // Redistribute the shards under random keys before reading them.
  return shards
      .apply("Reshuffle", Reshuffle.viaRandomKey())
      .apply("Read ranges", ParDo.of(new ReadFileRangesFn(ddlView)).withSideInputs(ddlView));
}
 
Example #18
Source File: ImportTransform.java    From DataflowTemplates with Apache License 2.0 5 votes vote down vote up
/**
 * Reads the table manifest for each input pair.
 *
 * <p>For every element, the value is joined onto the import directory to form a file path, the
 * path is matched with {@link EmptyMatchTreatment#DISALLOW}, and the first matched resource is
 * parsed as a JSON {@link TableManifest}. The output pairs the element's key with the parsed
 * manifest. Any {@link IOException} is rethrown wrapped in a {@link RuntimeException}.
 */
@Override
public PCollection<KV<String, TableManifest>> expand(PCollection<KV<String, String>> input) {
  return input.apply(
      "Read table manifest",
      ParDo.of(
          new DoFn<KV<String, String>, KV<String, TableManifest>>() {

            @ProcessElement
            public void processElement(ProcessContext c) {
              try {
                KV<String, String> kv = c.element();
                String filePath = GcsUtil.joinPath(importDirectory.get(), kv.getValue());
                MatchResult match = FileSystems.match(filePath, EmptyMatchTreatment.DISALLOW);
                ResourceId resourceId = match.metadata().get(0).resourceId();
                TableManifest.Builder builder = TableManifest.newBuilder();
                // Decode explicitly as UTF-8 rather than with the platform default charset, so
                // the manifest parses the same way on every worker. The fully-qualified name
                // avoids depending on an import this block cannot add.
                try (InputStream stream =
                        Channels.newInputStream(FileSystems.open(resourceId));
                    Reader reader =
                        new InputStreamReader(
                            stream, java.nio.charset.StandardCharsets.UTF_8)) {
                  JsonFormat.parser().merge(reader, builder);
                }
                c.output(KV.of(kv.getKey(), builder.build()));
              } catch (IOException e) {
                throw new RuntimeException(e);
              }
            }
          }));
}
 
Example #19
Source File: TextSourceTest.java    From DataflowTemplates with Apache License 2.0 5 votes vote down vote up
/**
 * Matches each input string as a file pattern with {@link EmptyMatchTreatment#DISALLOW}, opens
 * the matched files, and reads them with {@code FileReadDoFn} into a collection of strings.
 */
@Override
public PCollection<String> expand(PCollection<String> files) {
  // Patterns -> match metadata -> readable files.
  PCollection<FileIO.ReadableFile> readableFiles =
      files
          .apply(FileIO.matchAll().withEmptyMatchTreatment(EmptyMatchTreatment.DISALLOW))
          .apply(FileIO.readMatches());

  // Readable files -> lines.
  return readableFiles.apply("Read lines", ParDo.of(new FileReadDoFn()));
}
 
Example #20
Source File: TextIOReadTest.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Writes {@code data} to a new file in {@code temporaryFolder} and returns a {@code TextSource}
 * over that file's path, built with {@link EmptyMatchTreatment#DISALLOW} and the given
 * delimiter.
 */
private static TextSource prepareSource(
    TemporaryFolder temporaryFolder, byte[] data, byte[] delimiter) throws IOException {
  Path dataFile = temporaryFolder.newFile().toPath();
  Files.write(dataFile, data);

  String location = dataFile.toString();
  return new TextSource(
      ValueProvider.StaticValueProvider.of(location), EmptyMatchTreatment.DISALLOW, delimiter);
}
 
Example #21
Source File: TextSourceTest.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Matches each input string as a file pattern with {@link EmptyMatchTreatment#DISALLOW}, opens
 * the matched files, and reads them with {@code FileReadDoFn} into a collection of strings.
 */
@Override
public PCollection<String> expand(PCollection<String> files) {
  // Patterns -> match metadata -> readable files.
  PCollection<FileIO.ReadableFile> readableFiles =
      files
          .apply(FileIO.matchAll().withEmptyMatchTreatment(EmptyMatchTreatment.DISALLOW))
          .apply(FileIO.readMatches());

  // Readable files -> lines.
  return readableFiles.apply("Read lines", ParDo.of(new FileReadDoFn()));
}
 
Example #22
Source File: BlockBasedSource.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Like {@link #BlockBasedSource(String, EmptyMatchTreatment, long)}, but takes the file or
 * pattern specification as a {@code ValueProvider<String>}.
 *
 * <p>All three arguments are passed unchanged to the superclass constructor.
 */
public BlockBasedSource(
    ValueProvider<String> fileOrPatternSpec,
    EmptyMatchTreatment emptyMatchTreatment,
    long minBundleSize) {
  super(fileOrPatternSpec, emptyMatchTreatment, minBundleSize);
}
 
Example #23
Source File: FileIOTest.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Expects the pipeline run to fail with a {@link FileNotFoundException} cause when {@code
 * FileIO.matchAll()} receives the non-wildcard path {@code <tmp>/blah} with {@link
 * EmptyMatchTreatment#ALLOW_IF_WILDCARD}.
 */
@Test
@Category(NeedsRunner.class)
public void testMatchAllDisallowEmptyNonWildcard() throws IOException {
  String literalPath = tmpFolder.getRoot().getAbsolutePath() + "/blah";
  p.apply(Create.of(literalPath))
      .apply(FileIO.matchAll().withEmptyMatchTreatment(EmptyMatchTreatment.ALLOW_IF_WILDCARD));

  thrown.expectCause(isA(FileNotFoundException.class));
  p.run();
}
 
Example #24
Source File: FileSystems.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Like {@link #match(List)}, but with a configurable {@link EmptyMatchTreatment}.
 *
 * <p>The specs are matched in one call against the file system selected by their common scheme;
 * each result is then paired with the spec at the same index and passed through {@code
 * maybeAdjustEmptyMatchResult}.
 */
public static List<MatchResult> match(List<String> specs, EmptyMatchTreatment emptyMatchTreatment)
    throws IOException {
  List<MatchResult> rawMatches = getFileSystemInternal(getOnlyScheme(specs)).match(specs);
  List<MatchResult> adjusted = Lists.newArrayListWithExpectedSize(rawMatches.size());

  int index = 0;
  for (MatchResult raw : rawMatches) {
    adjusted.add(maybeAdjustEmptyMatchResult(specs.get(index), raw, emptyMatchTreatment));
    index++;
  }
  return adjusted;
}
 
Example #25
Source File: FileIOTest.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Expects the pipeline run to fail with a {@link FileNotFoundException} cause when {@code
 * FileIO.matchAll()} receives the {@code <tmp>/*} pattern with {@link
 * EmptyMatchTreatment#DISALLOW}.
 */
@Test
@Category(NeedsRunner.class)
public void testMatchAllDisallowEmptyExplicit() throws IOException {
  String everythingInTmp = tmpFolder.getRoot().getAbsolutePath() + "/*";
  p.apply(Create.of(everythingInTmp))
      .apply(FileIO.matchAll().withEmptyMatchTreatment(EmptyMatchTreatment.DISALLOW));

  thrown.expectCause(isA(FileNotFoundException.class));
  p.run();
}
 
Example #26
Source File: FileSystems.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Normalises an empty match result according to {@code emptyMatchTreatment}.
 *
 * <p>A result counts as empty when its status is {@code NOT_FOUND}, or when its status is {@code
 * OK} and its metadata list is empty. Non-empty results are returned unchanged. An empty result
 * becomes an {@code OK} result with no metadata when the treatment is {@code ALLOW}, or when the
 * spec has a glob wildcard and the treatment is {@code ALLOW_IF_WILDCARD}; otherwise it becomes a
 * {@code NOT_FOUND} result carrying a {@link FileNotFoundException}.
 */
private static MatchResult maybeAdjustEmptyMatchResult(
    String spec, MatchResult res, EmptyMatchTreatment emptyMatchTreatment) throws IOException {
  // metadata() is only consulted once the status is known to be OK.
  boolean nothingMatched =
      res.status() == Status.NOT_FOUND
          || (res.status() == Status.OK && res.metadata().isEmpty());
  if (!nothingMatched) {
    return res;
  }

  boolean notFoundAllowed =
      emptyMatchTreatment == EmptyMatchTreatment.ALLOW
          || (hasGlobWildcard(spec)
              && emptyMatchTreatment == EmptyMatchTreatment.ALLOW_IF_WILDCARD);
  if (notFoundAllowed) {
    return MatchResult.create(Status.OK, Collections.emptyList());
  }
  return MatchResult.create(
      Status.NOT_FOUND, new FileNotFoundException("No files matched spec: " + spec));
}
 
Example #27
Source File: AvroIO.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Reads records of the given type from an Avro file (or multiple Avro files matching a pattern).
 *
 * <p>The schema must be specified using one of the {@code withSchema} functions.
 *
 * <p>The returned transform is initialised with the schema that {@code ReflectData} derives from
 * {@code recordClass}, a match configuration created from {@link EmptyMatchTreatment#DISALLOW},
 * and both Beam-schema inference and the "matches many files" hint turned off.
 */
public static <T> Read<T> read(Class<T> recordClass) {
  MatchConfiguration defaultMatching = MatchConfiguration.create(EmptyMatchTreatment.DISALLOW);
  return new AutoValue_AvroIO_Read.Builder<T>()
      .setMatchConfiguration(defaultMatching)
      .setRecordClass(recordClass)
      .setSchema(ReflectData.get().getSchema(recordClass))
      .setInferBeamSchema(false)
      .setHintMatchesManyFiles(false)
      .build();
}
 
Example #28
Source File: AvroIO.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Like {@link #read}, but reads each filepattern in the input {@link PCollection}.
 *
 * <p>The returned transform is initialised with a match configuration created from {@link
 * EmptyMatchTreatment#ALLOW_IF_WILDCARD}, the schema that {@code ReflectData} derives from {@code
 * recordClass}, Beam-schema inference turned off, and {@code DEFAULT_BUNDLE_SIZE_BYTES} as the
 * desired bundle size.
 *
 * @deprecated You can achieve the functionality of {@link #readAll} using {@link FileIO} matching
 *     plus {@link #readFiles(Class)}. This is the preferred method to make composition explicit.
 *     {@link ReadAll} will not receive upgrades and will be removed in a future version of Beam.
 */
@Deprecated
public static <T> ReadAll<T> readAll(Class<T> recordClass) {
  MatchConfiguration wildcardTolerantMatching =
      MatchConfiguration.create(EmptyMatchTreatment.ALLOW_IF_WILDCARD);
  return new AutoValue_AvroIO_ReadAll.Builder<T>()
      .setMatchConfiguration(wildcardTolerantMatching)
      .setRecordClass(recordClass)
      .setSchema(ReflectData.get().getSchema(recordClass))
      .setInferBeamSchema(false)
      .setDesiredBundleSizeBytes(DEFAULT_BUNDLE_SIZE_BYTES)
      .build();
}
 
Example #29
Source File: AvroIO.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Like {@link #readGenericRecords(Schema)}, but returns a {@link ReadAll} transform.
 *
 * <p>NOTE(review): the transform is built with a match configuration ({@link
 * EmptyMatchTreatment#ALLOW_IF_WILDCARD}), so its input is presumably a {@link PCollection} of
 * filepatterns, as for {@link #readAll(Class)}, rather than of {@link FileIO.ReadableFile} —
 * confirm against {@link ReadAll}.
 *
 * @deprecated You can achieve the functionality of {@link #readAllGenericRecords(Schema)} using
 *     {@link FileIO} matching plus {@link #readFilesGenericRecords(Schema)}. This is the
 *     preferred method to make composition explicit. {@link ReadAll} will not receive upgrades
 *     and will be removed in a future version of Beam.
 */
@Deprecated
public static ReadAll<GenericRecord> readAllGenericRecords(Schema schema) {
  MatchConfiguration wildcardTolerantMatching =
      MatchConfiguration.create(EmptyMatchTreatment.ALLOW_IF_WILDCARD);
  return new AutoValue_AvroIO_ReadAll.Builder<GenericRecord>()
      .setMatchConfiguration(wildcardTolerantMatching)
      .setRecordClass(GenericRecord.class)
      .setSchema(schema)
      .setInferBeamSchema(false)
      .setDesiredBundleSizeBytes(DEFAULT_BUNDLE_SIZE_BYTES)
      .build();
}
 
Example #30
Source File: FileIOTest.java    From beam with Apache License 2.0 4 votes vote down vote up
/**
 * Exercises {@code FileIO.match()} and {@code FileIO.matchAll()} against two files with known
 * size and modification time, and against non-existing paths under the {@code ALLOW}, {@code
 * ALLOW_IF_WILDCARD} and default empty-match treatments, asserting the matched metadata (or an
 * empty result) in each case.
 */
@Test
@Category(NeedsRunner.class)
public void testMatchAndMatchAll() throws IOException {
  // Fixture: two files whose size and last-modified time are pinned.
  int firstSize = 37;
  int secondSize = 42;
  long firstModified = 1541097000L;
  long secondModified = 1541098000L;

  Path firstPath = tmpFolder.newFile("first").toPath();
  Path secondPath = tmpFolder.newFile("second").toPath();
  Files.write(firstPath, new byte[firstSize]);
  Files.write(secondPath, new byte[secondSize]);
  Files.setLastModifiedTime(firstPath, FileTime.fromMillis(firstModified));
  Files.setLastModifiedTime(secondPath, FileTime.fromMillis(secondModified));

  MatchResult.Metadata firstMetadata = metadata(firstPath, firstSize, firstModified);
  MatchResult.Metadata secondMetadata = metadata(secondPath, secondSize, secondModified);

  // Patterns shared by the assertions below.
  String root = tmpFolder.getRoot().getAbsolutePath();
  String existingGlob = root + "/*";
  String missingFile = root + "/blah";
  String missingGlob = root + "/blah*";

  // Existing files: literal pattern, provider-backed pattern, and matchAll.
  PAssert.that(p.apply("Match existing", FileIO.match().filepattern(existingGlob)))
      .containsInAnyOrder(firstMetadata, secondMetadata);
  PAssert.that(
          p.apply(
              "Match existing with provider",
              FileIO.match().filepattern(p.newProvider(existingGlob))))
      .containsInAnyOrder(firstMetadata, secondMetadata);
  PAssert.that(
          p.apply("Create existing", Create.of(existingGlob))
              .apply("MatchAll existing", FileIO.matchAll()))
      .containsInAnyOrder(firstMetadata, secondMetadata);

  // Non-existing literal path under ALLOW.
  PAssert.that(
          p.apply(
              "Match non-existing ALLOW",
              FileIO.match()
                  .filepattern(missingFile)
                  .withEmptyMatchTreatment(EmptyMatchTreatment.ALLOW)))
      .containsInAnyOrder();
  PAssert.that(
          p.apply("Create non-existing", Create.of(missingFile))
              .apply(
                  "MatchAll non-existing ALLOW",
                  FileIO.matchAll().withEmptyMatchTreatment(EmptyMatchTreatment.ALLOW)))
      .containsInAnyOrder();

  // Non-existing wildcard path under ALLOW_IF_WILDCARD (explicit) and matchAll's default.
  PAssert.that(
          p.apply(
              "Match non-existing ALLOW_IF_WILDCARD",
              FileIO.match()
                  .filepattern(missingGlob)
                  .withEmptyMatchTreatment(EmptyMatchTreatment.ALLOW_IF_WILDCARD)))
      .containsInAnyOrder();
  PAssert.that(
          p.apply("Create non-existing wildcard + explicit", Create.of(missingGlob))
              .apply(
                  "MatchAll non-existing ALLOW_IF_WILDCARD",
                  FileIO.matchAll()
                      .withEmptyMatchTreatment(EmptyMatchTreatment.ALLOW_IF_WILDCARD)))
      .containsInAnyOrder();
  PAssert.that(
          p.apply("Create non-existing wildcard + default", Create.of(missingGlob))
              .apply("MatchAll non-existing default", FileIO.matchAll()))
      .containsInAnyOrder();

  p.run();
}