org.apache.beam.sdk.io.FileSystems Java Examples
The following examples show how to use org.apache.beam.sdk.io.FileSystems.
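Before the individual examples, the sketch below strings together the FileSystems calls that recur throughout this page: matchNewResource, create, open, match, and delete. It is an orientation aid only; the class name, the local /tmp path, and the use of freshly created default pipeline options are assumptions made for this sketch and do not come from any of the projects listed below.

import java.io.BufferedReader;
import java.nio.ByteBuffer;
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;
import java.nio.channels.WritableByteChannel;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.beam.sdk.io.FileSystems;
import org.apache.beam.sdk.io.fs.MatchResult.Metadata;
import org.apache.beam.sdk.io.fs.MoveOptions;
import org.apache.beam.sdk.io.fs.ResourceId;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.util.MimeTypes;

public class FileSystemsBasics {

  public static void main(String[] args) throws Exception {
    // Register the available file systems (local, gs://, ...) with default options.
    FileSystems.setDefaultPipelineOptions(PipelineOptionsFactory.create());

    // Resolve a path that may not exist yet (false = the resource is not a directory).
    ResourceId target = FileSystems.matchNewResource("/tmp/filesystems-demo.txt", false);

    // Write a small text file through a WritableByteChannel.
    try (WritableByteChannel out = FileSystems.create(target, MimeTypes.TEXT)) {
      out.write(ByteBuffer.wrap("hello, beam".getBytes(StandardCharsets.UTF_8)));
    }

    // Match existing files by glob and read the first one back.
    List<Metadata> matches = FileSystems.match("/tmp/filesystems-demo*").metadata();
    ReadableByteChannel in = FileSystems.open(matches.get(0).resourceId());
    try (BufferedReader reader =
        new BufferedReader(Channels.newReader(in, StandardCharsets.UTF_8.name()))) {
      System.out.println(reader.readLine());
    }

    // Delete everything the glob matched, ignoring files that are already gone.
    FileSystems.delete(
        matches.stream().map(Metadata::resourceId).collect(Collectors.toList()),
        MoveOptions.StandardMoveOptions.IGNORE_MISSING_FILES);
  }
}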
Example #1
Source File: NexmarkUtils.java From beam with Apache License 2.0
/** Frees any resources used to make the side input available. */
public static void cleanUpSideInput(NexmarkConfiguration config) throws IOException {
  switch (config.sideInputType) {
    case DIRECT:
      break;
    case CSV:
      FileSystems.delete(
          FileSystems.match(config.sideInputUrl + "*").metadata().stream()
              .map(metadata -> metadata.resourceId())
              .collect(Collectors.toList()));
      break;
    default:
      throw new IllegalArgumentException(
          String.format(
              "Unknown type of %s clean up requested", SideInputType.class.getSimpleName()));
  }
}
Example #2
Source File: AvroByteReader.java From beam with Apache License 2.0
@Override
public AvroByteFileIterator iterator() throws IOException {
  BoundedSource.BoundedReader<ByteBuffer> reader;
  if (startPosition == 0 && endPosition == Long.MAX_VALUE) {
    // Read entire file (or collection of files).
    reader = avroSource.createReader(options);
  } else {
    // Read a subrange of file.
    reader =
        avroSource
            .createForSubrangeOfFile(
                FileSystems.matchSingleFileSpec(filename), startPosition, endPosition)
            .createReader(options);
  }
  return new AvroByteFileIterator((AvroReader<ByteBuffer>) reader);
}
Example #3
Source File: PackageUtil.java From beam with Apache License 2.0
public static PackageAttributes forBytesToStage(
    byte[] bytes, String targetName, String stagingPath) {
  HashCode hashCode = Hashing.sha256().newHasher().putBytes(bytes).hash();
  long size = bytes.length;
  String target = Environments.createStagingFileName(new File(targetName), hashCode);

  String resourcePath =
      FileSystems.matchNewResource(stagingPath, true)
          .resolve(target, StandardResolveOptions.RESOLVE_FILE)
          .toString();
  DataflowPackage targetPackage = new DataflowPackage();
  targetPackage.setName(target);
  targetPackage.setLocation(resourcePath);

  return new AutoValue_PackageUtil_PackageAttributes(
      null, bytes, targetPackage, size, hashCode.toString());
}
Example #4
Source File: FileUtils.java From deployment-examples with MIT License
public static String copyFile(ResourceId sourceFile, ResourceId destinationFile)
    throws IOException {

  try (WritableByteChannel writeChannel = FileSystems.create(destinationFile, "text/plain")) {
    try (ReadableByteChannel readChannel = FileSystems.open(sourceFile)) {

      final ByteBuffer buffer = ByteBuffer.allocateDirect(16 * 1024);
      while (readChannel.read(buffer) != -1) {
        buffer.flip();
        writeChannel.write(buffer);
        buffer.compact();
      }
      buffer.flip();
      while (buffer.hasRemaining()) {
        writeChannel.write(buffer);
      }
    }
  }

  return destinationFile.toString();
}
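As a possible call site for the helper above, the source could be resolved from an existing file and the destination from a path that does not exist yet. The paths here are placeholders invented for illustration, not taken from the project.

// Hypothetical paths, shown only to illustrate how copyFile would be invoked.
ResourceId source = FileSystems.matchSingleFileSpec("/tmp/input/report.txt").resourceId();
ResourceId destination = FileSystems.matchNewResource("/tmp/output/report-copy.txt", false);
String copiedTo = FileUtils.copyFile(source, destination);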
Example #5
Source File: IsmReaderFactoryTest.java From beam with Apache License 2.0
@Test
public void testFactory() throws Exception {
  WindowedValueCoder<?> coder =
      WindowedValue.getFullCoder(
          IsmRecordCoder.of(
              1, 0, ImmutableList.<Coder<?>>of(StringUtf8Coder.of()), VarLongCoder.of()),
          GlobalWindow.Coder.INSTANCE);

  String tmpFile = tmpFolder.newFile().getPath();
  ResourceId tmpResourceId = FileSystems.matchSingleFileSpec(tmpFile).resourceId();
  @SuppressWarnings("rawtypes")
  IsmReader<?> ismReader =
      (IsmReader)
          new IsmReaderFactory()
              .create(
                  createSpecForFilename(tmpFile),
                  coder,
                  options,
                  executionContext,
                  operationContext);
  assertEquals(coder.getValueCoder(), ismReader.getCoder());
  assertEquals(tmpResourceId, ismReader.getResourceId());
}
Example #6
Source File: CsvToAvro.java From java-docs-samples with Apache License 2.0
public static String getSchema(String schemaPath) throws IOException {
  ReadableByteChannel chan = FileSystems.open(FileSystems.matchNewResource(
      schemaPath, false));

  try (InputStream stream = Channels.newInputStream(chan)) {
    BufferedReader streamReader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
    StringBuilder dataBuilder = new StringBuilder();

    String line;
    while ((line = streamReader.readLine()) != null) {
      dataBuilder.append(line);
    }

    return dataBuilder.toString();
  }
}
Example #7
Source File: BeamJpaModuleTest.java From nomulus with Apache License 2.0
/**
 * Integration test with a GCP project, only run when the 'test.gcp_integration.env' property is
 * defined. Otherwise this test is ignored. This is meant to be run from a developer's desktop,
 * with auth already set up by gcloud.
 *
 * <p>Example: {@code gradlew test -P test.gcp_integration.env=alpha}.
 *
 * <p>See <a href="../../../../../../../../java_common.gradle">java_common.gradle</a> for more
 * information.
 */
@Test
public void getJpaTransactionManager_cloudSql_authRequired() {
  String environmentName = System.getProperty("test.gcp_integration.env");
  assumeThat(environmentName, notNullValue());

  FileSystems.setDefaultPipelineOptions(PipelineOptionsFactory.create());
  JpaTransactionManager jpa =
      DaggerBeamJpaModule_JpaTransactionManagerComponent.builder()
          .beamJpaModule(
              new BeamJpaModule(
                  BackupPaths.getCloudSQLCredentialFilePatterns(environmentName).get(0)))
          .build()
          .cloudSqlJpaTransactionManager();
  assertThat(
          jpa.transact(
              () -> jpa.getEntityManager().createNativeQuery("select 1").getSingleResult()))
      .isEqualTo(1);
}
Example #8
Source File: ArtifactRetrievalService.java From beam with Apache License 2.0
public static InputStream getArtifact(RunnerApi.ArtifactInformation artifact) throws IOException {
  switch (artifact.getTypeUrn()) {
    case FILE_ARTIFACT_URN:
      RunnerApi.ArtifactFilePayload payload =
          RunnerApi.ArtifactFilePayload.parseFrom(artifact.getTypePayload());
      return Channels.newInputStream(
          FileSystems.open(
              FileSystems.matchNewResource(payload.getPath(), false /* is directory */)));
    case EMBEDDED_ARTIFACT_URN:
      return RunnerApi.EmbeddedFilePayload.parseFrom(artifact.getTypePayload())
          .getData()
          .newInput();
    default:
      throw new UnsupportedOperationException(
          "Unexpected artifact type: " + artifact.getTypeUrn());
  }
}
Example #9
Source File: GeoCityLookup.java From gcp-ingestion with Mozilla Public License 2.0
/**
 * Returns a singleton object for reading from the GeoCity database.
 *
 * <p>We copy the configured database file to a static temp location so that the MaxMind API can
 * save on heap usage by using memory mapping. The reader is threadsafe and this singleton pattern
 * allows multiple worker threads on the same machine to share a single reader instance.
 *
 * <p>Note that we do not clean up the temp mmdb file, but it's a static path, so running locally
 * will overwrite the existing path every time rather than creating an unbounded number of copies.
 * This also assumes that only one JVM per machine is running this code. In the production case
 * where this is running on Cloud Dataflow, we should always have a clean environment and the temp
 * state will be cleaned up along with the workers once the job finishes. However, behavior is
 * undefined if you run multiple local jobs concurrently.
 *
 * @throws IOException if the configured file path is not a valid .mmdb file
 */
private static synchronized DatabaseReader getOrCreateSingletonGeoCityReader(
    ValueProvider<String> geoCityDatabase) throws IOException {
  if (singletonGeoCityReader == null) {
    File mmdb;
    try {
      InputStream inputStream;
      Metadata metadata = FileSystems.matchSingleFileSpec(geoCityDatabase.get());
      ReadableByteChannel channel = FileSystems.open(metadata.resourceId());
      inputStream = Channels.newInputStream(channel);
      Path mmdbPath = Paths.get(System.getProperty("java.io.tmpdir"), "GeoCityLookup.mmdb");
      Files.copy(inputStream, mmdbPath, StandardCopyOption.REPLACE_EXISTING);
      mmdb = mmdbPath.toFile();
    } catch (IOException e) {
      throw new IOException("Exception thrown while fetching configured geoCityDatabase", e);
    }
    singletonGeoCityReader = new DatabaseReader.Builder(mmdb).withCache(new CHMCache()).build();
  }
  return singletonGeoCityReader;
}
Example #10
Source File: BigQueryIO.java From beam with Apache License 2.0
@Override
void cleanup(PassThroughThenCleanup.ContextContainer c) throws Exception {
  PipelineOptions options = c.getPipelineOptions();
  BigQueryOptions bqOptions = options.as(BigQueryOptions.class);
  String jobUuid = c.getJobId();
  final String extractDestinationDir =
      resolveTempLocation(bqOptions.getTempLocation(), "BigQueryExtractTemp", jobUuid);
  final String executingProject = bqOptions.getProject();
  JobReference jobRef =
      new JobReference()
          .setProjectId(executingProject)
          .setJobId(getExtractJobId(createJobIdToken(bqOptions.getJobName(), jobUuid)));

  Job extractJob = getBigQueryServices().getJobService(bqOptions).getJob(jobRef);

  if (extractJob != null) {
    List<ResourceId> extractFiles = getExtractFilePaths(extractDestinationDir, extractJob);
    if (extractFiles != null && !extractFiles.isEmpty()) {
      FileSystems.delete(
          extractFiles, MoveOptions.StandardMoveOptions.IGNORE_MISSING_FILES);
    }
  }
}
Example #11
Source File: DataflowPipelineTranslatorTest.java From beam with Apache License 2.0
private static DataflowPipelineOptions buildPipelineOptions() throws IOException {
  GcsUtil mockGcsUtil = mock(GcsUtil.class);
  when(mockGcsUtil.expand(any(GcsPath.class)))
      .then(invocation -> ImmutableList.of((GcsPath) invocation.getArguments()[0]));
  when(mockGcsUtil.bucketAccessible(any(GcsPath.class))).thenReturn(true);

  DataflowPipelineOptions options = PipelineOptionsFactory.as(DataflowPipelineOptions.class);
  options.setRunner(DataflowRunner.class);
  options.setGcpCredential(new TestCredential());
  options.setJobName("some-job-name");
  options.setProject("some-project");
  options.setRegion("some-region");
  options.setTempLocation(GcsPath.fromComponents("somebucket", "some/path").toString());
  options.setFilesToStage(new ArrayList<>());
  options.setDataflowClient(buildMockDataflow(new IsValidCreateRequest()));
  options.setGcsUtil(mockGcsUtil);

  // Enable the FileSystems API to know about gs:// URIs in this test.
  FileSystems.setDefaultPipelineOptions(options);

  return options;
}
Example #12
Source File: FilePatternMatchingShardedFile.java From beam with Apache License 2.0
/**
 * Reads all the lines of all the files.
 *
 * <p>Not suitable for use except in testing of small data, since the data size may be far more
 * than can be reasonably processed serially, in-memory, by a single thread.
 */
@VisibleForTesting
List<String> readLines(Collection<Metadata> files) throws IOException {
  List<String> allLines = Lists.newArrayList();
  int i = 1;
  for (Metadata file : files) {
    try (Reader reader =
        Channels.newReader(FileSystems.open(file.resourceId()), StandardCharsets.UTF_8.name())) {
      List<String> lines = CharStreams.readLines(reader);
      allLines.addAll(lines);
      LOG.debug("[{} of {}] Read {} lines from file: {}", i, files.size(), lines.size(), file);
    }
    i++;
  }
  return allLines;
}
Example #13
Source File: WindowedFilenamePolicy.java From DataflowTemplates with Apache License 2.0
/**
 * Resolves any date variables which exist in the output directory path. This allows for the
 * dynamically changing of the output location based on the window end time.
 *
 * @return The new output directory with all variables resolved.
 */
private ResourceId resolveWithDateTemplates(
    ValueProvider<String> outputDirectoryStr, BoundedWindow window) {
  ResourceId outputDirectory = FileSystems.matchNewResource(outputDirectoryStr.get(), true);

  if (window instanceof IntervalWindow) {
    IntervalWindow intervalWindow = (IntervalWindow) window;
    DateTime time = intervalWindow.end().toDateTime();
    String outputPath = outputDirectory.toString();
    outputPath = outputPath.replace("YYYY", YEAR.print(time));
    outputPath = outputPath.replace("MM", MONTH.print(time));
    outputPath = outputPath.replace("DD", DAY.print(time));
    outputPath = outputPath.replace("HH", HOUR.print(time));
    outputPath = outputPath.replace("mm", MINUTE.print(time));
    outputDirectory = FileSystems.matchNewResource(outputPath, true);
  }
  return outputDirectory;
}
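To make the template substitution above concrete, here is a small, hedged illustration of the same idea done with plain Joda-Time formatters. The bucket path, the window end time, and the formatter patterns are assumptions for this sketch; the class's own YEAR, MONTH, DAY, HOUR, and MINUTE formatters are private constants that are not reproduced here.

// Uses org.joda.time.DateTime, org.joda.time.DateTimeZone, and
// org.joda.time.format.DateTimeFormat; the values below are placeholders.
DateTime windowEnd = new DateTime(2020, 1, 2, 3, 4, DateTimeZone.UTC);
String template = "gs://my-bucket/output/YYYY/MM/DD/HH/mm/";
String resolved =
    template
        .replace("YYYY", DateTimeFormat.forPattern("yyyy").print(windowEnd))
        .replace("MM", DateTimeFormat.forPattern("MM").print(windowEnd))
        .replace("DD", DateTimeFormat.forPattern("dd").print(windowEnd))
        .replace("HH", DateTimeFormat.forPattern("HH").print(windowEnd))
        .replace("mm", DateTimeFormat.forPattern("mm").print(windowEnd));
// resolved is "gs://my-bucket/output/2020/01/02/03/04/"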
Example #14
Source File: GCSUploadMain.java From beam with Apache License 2.0
public static void main(String[] args) {
  DataflowPipelineOptions options =
      PipelineOptionsFactory.fromArgs(args).as(DataflowPipelineOptions.class);
  FileSystems.setDefaultPipelineOptions(options);
  GcsStager stager = GcsStager.fromOptions(options);
  stager.stageFiles(
      options.getFilesToStage().stream()
          .map(
              (String source) -> {
                try {
                  File file = new File(source);
                  HashCode hashCode = Files.asByteSource(file).hash(Hashing.sha256());
                  return PackageUtil.StagedFile.of(
                      source,
                      hashCode.toString(),
                      Environments.createStagingFileName(file, hashCode));
                } catch (IOException e) {
                  throw new UncheckedIOException(e);
                }
              })
          .collect(Collectors.toList()));
}
Example #15
Source File: TextToBigQueryStreaming.java From DataflowTemplates with Apache License 2.0
/**
 * Method to read a BigQuery schema file from GCS and return the file contents as a string.
 *
 * @param gcsPath Path string for the schema file in GCS.
 * @return File contents as a string.
 */
private static ValueProvider<String> getSchemaFromGCS(ValueProvider<String> gcsPath) {
  return NestedValueProvider.of(
      gcsPath,
      new SimpleFunction<String, String>() {
        @Override
        public String apply(String input) {
          ResourceId sourceResourceId = FileSystems.matchNewResource(input, false);

          String schema;
          try (ReadableByteChannel rbc = FileSystems.open(sourceResourceId)) {
            try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
              try (WritableByteChannel wbc = Channels.newChannel(baos)) {
                ByteStreams.copy(rbc, wbc);
                schema = baos.toString(Charsets.UTF_8.name());
                LOG.info("Extracted schema: " + schema);
              }
            }
          } catch (IOException e) {
            LOG.error("Error extracting schema: " + e.getMessage());
            throw new RuntimeException(e);
          }
          return schema;
        }
      });
}
Example #16
Source File: TestUtils.java From DataflowTemplates with Apache License 2.0
/**
 * Helper to generate files for testing.
 *
 * @param filePath The path to the file to write.
 * @param lines The lines to write.
 * @param compression The compression type of the file.
 * @return The file written.
 * @throws IOException If an error occurs while creating or writing the file.
 */
public static ResourceId writeToFile(
    String filePath, List<String> lines, Compression compression) throws IOException {

  String fileContents = String.join(System.lineSeparator(), lines);
  ResourceId resourceId = FileSystems.matchNewResource(filePath, false);
  String mimeType = compression == Compression.UNCOMPRESSED ? MimeTypes.TEXT : MimeTypes.BINARY;

  // Write the file contents to the channel and close.
  try (ReadableByteChannel readChannel =
      Channels.newChannel(new ByteArrayInputStream(fileContents.getBytes()))) {
    try (WritableByteChannel writeChannel =
        compression.writeCompressed(FileSystems.create(resourceId, mimeType))) {
      ByteStreams.copy(readChannel, writeChannel);
    }
  }

  return resourceId;
}
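A hypothetical call to the helper above, writing a small gzip-compressed fixture and getting back its ResourceId; the path and lines are invented for illustration and do not come from the original test suite.

// Hypothetical test fixture; nothing here comes from the original project.
ResourceId fixture =
    TestUtils.writeToFile(
        "/tmp/fixtures/events.csv.gz",
        Arrays.asList("id,name", "1,alpha", "2,beta"),
        Compression.GZIP);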
Example #17
Source File: IsmReaderTest.java From beam with Apache License 2.0
@Test
public void testReadMissingKeys() throws Exception {
  File tmpFile = tmpFolder.newFile();
  List<IsmRecord<byte[]>> data = new ArrayList<>();
  data.add(IsmRecord.<byte[]>of(ImmutableList.of(EMPTY, new byte[] {0x04}), EMPTY));
  data.add(IsmRecord.<byte[]>of(ImmutableList.of(EMPTY, new byte[] {0x08}), EMPTY));
  writeElementsToFile(data, tmpFile);

  IsmReader<byte[]> reader =
      new IsmReaderImpl<byte[]>(
          FileSystems.matchSingleFileSpec(tmpFile.getAbsolutePath()).resourceId(), CODER, cache);

  // Check that we got false with a key before all keys contained in the file.
  assertFalse(reader.overKeyComponents(ImmutableList.of(EMPTY, new byte[] {0x02})).start());
  // Check that we got false with a key between two other keys contained in the file.
  assertFalse(reader.overKeyComponents(ImmutableList.of(EMPTY, new byte[] {0x06})).start());
  // Check that we got false with a key that is after all keys contained in the file.
  assertFalse(reader.overKeyComponents(ImmutableList.of(EMPTY, new byte[] {0x10})).start());
}
Example #18
Source File: IsmSinkTest.java From beam with Apache License 2.0
@Test
public void testWriteNonContiguousShardsIsError() throws Throwable {
  IsmSink<byte[]> sink =
      new IsmSink<>(
          FileSystems.matchNewResource(tmpFolder.newFile().getPath(), false),
          CODER,
          BLOOM_FILTER_SIZE_LIMIT);
  SinkWriter<WindowedValue<IsmRecord<byte[]>>> sinkWriter = sink.writer();
  sinkWriter.add(
      new ValueInEmptyWindows<>(IsmRecord.of(ImmutableList.of(new byte[] {0x00}, EMPTY), EMPTY)));
  sinkWriter.add(
      new ValueInEmptyWindows<>(IsmRecord.of(ImmutableList.of(new byte[] {0x01}, EMPTY), EMPTY)));

  expectedException.expect(IllegalStateException.class);
  expectedException.expectMessage("for shard which already exists");
  sinkWriter.add(
      new ValueInEmptyWindows<>(IsmRecord.of(ImmutableList.of(new byte[] {0x00}, EMPTY), EMPTY)));
}
Example #19
Source File: BulkCompressorTest.java From DataflowTemplates with Apache License 2.0
/** Tests the {@link BulkCompressor.Compressor} performs compression properly. */
@Test
public void testCompressFile() throws Exception {
  // Setup test
  final Compression compression = Compression.GZIP;

  final ValueProvider<String> outputDirectoryProvider =
      pipeline.newProvider(tempFolderCompressedPath.toString());

  final ValueProvider<Compression> compressionProvider = StaticValueProvider.of(compression);

  final Metadata metadata = FileSystems.matchSingleFileSpec(textFile.toString());

  // Execute the compressor
  PCollection<String> lines =
      pipeline
          .apply("Create File Input", Create.of(metadata))
          .apply(
              "Compress", ParDo.of(new Compressor(outputDirectoryProvider, compressionProvider)))
          .apply("Read the Files", TextIO.readAll().withCompression(Compression.AUTO));

  // Test the result
  PAssert.that(lines).containsInAnyOrder(FILE_CONTENT);
  pipeline.run();
}
Example #20
Source File: CsvToAvro.java From java-docs-samples with Apache License 2.0
public static void runCsvToAvro(SampleOptions options)
    throws IOException, IllegalArgumentException {
  FileSystems.setDefaultPipelineOptions(options);

  // Get Avro Schema
  String schemaJson = getSchema(options.getAvroSchema());
  Schema schema = new Schema.Parser().parse(schemaJson);

  // Check schema field types before starting the Dataflow job
  checkFieldTypes(schema);

  // Create the Pipeline object with the options we defined above.
  Pipeline pipeline = Pipeline.create(options);

  // Convert CSV to Avro
  pipeline.apply("Read CSV files", TextIO.read().from(options.getInputFile()))
      .apply("Convert CSV to Avro formatted data",
          ParDo.of(new ConvertCsvToAvro(schemaJson, options.getCsvDelimiter())))
      .setCoder(AvroCoder.of(GenericRecord.class, schema))
      .apply("Write Avro formatted data",
          AvroIO.writeGenericRecords(schemaJson)
              .to(options.getOutput())
              .withCodec(CodecFactory.snappyCodec())
              .withSuffix(".avro"));

  // Run the pipeline.
  pipeline.run().waitUntilFinish();
}
Example #21
Source File: WindowedWordCountIT.java From beam with Apache License 2.0
private WindowedWordCountITOptions defaultOptions() throws Exception {
  WindowedWordCountITOptions options =
      TestPipeline.testingPipelineOptions().as(WindowedWordCountITOptions.class);
  options.setInputFile(DEFAULT_INPUT);
  options.setTestTimeoutSeconds(1200L);

  options.setMinTimestampMillis(0L);
  options.setMinTimestampMillis(Duration.standardHours(1).getMillis());
  options.setWindowSize(10);

  options.setOutput(
      FileSystems.matchNewResource(options.getTempRoot(), true)
          .resolve(
              String.format(
                  "WindowedWordCountIT.%s-%tFT%<tH:%<tM:%<tS.%<tL+%s",
                  testName.getMethodName(), new Date(), ThreadLocalRandom.current().nextInt()),
              StandardResolveOptions.RESOLVE_DIRECTORY)
          .resolve("output", StandardResolveOptions.RESOLVE_DIRECTORY)
          .resolve("results", StandardResolveOptions.RESOLVE_FILE)
          .toString());
  return options;
}
Example #22
Source File: TfIdfIT.java From beam with Apache License 2.0
@Test
public void testE2ETfIdf() throws Exception {
  TfIdfITOptions options = TestPipeline.testingPipelineOptions().as(TfIdfITOptions.class);
  options.setInput(DEFAULT_INPUT);
  options.setOutput(
      FileSystems.matchNewResource(options.getTempRoot(), true)
          .resolve(
              String.format("TfIdfIT-%tF-%<tH-%<tM-%<tS-%<tL", new Date()),
              StandardResolveOptions.RESOLVE_DIRECTORY)
          .resolve("output", StandardResolveOptions.RESOLVE_DIRECTORY)
          .resolve("results", StandardResolveOptions.RESOLVE_FILE)
          .toString());
  TfIdf.runTfIdf(options);

  assertThat(
      new NumberedShardedFile(options.getOutput() + "*-of-*.csv", DEFAULT_SHARD_TEMPLATE),
      fileContentsHaveChecksum(EXPECTED_OUTPUT_CHECKSUM));
}
Example #23
Source File: IsmSinkTest.java From beam with Apache License 2.0
@Test
public void testWriteOutOfOrderKeysWithSameShardKeyIsError() throws Throwable {
  IsmSink<byte[]> sink =
      new IsmSink<>(
          FileSystems.matchNewResource(tmpFolder.newFile().getPath(), false),
          CODER,
          BLOOM_FILTER_SIZE_LIMIT);
  SinkWriter<WindowedValue<IsmRecord<byte[]>>> sinkWriter = sink.writer();
  sinkWriter.add(
      new ValueInEmptyWindows<>(IsmRecord.of(ImmutableList.of(EMPTY, new byte[] {0x01}), EMPTY)));

  expectedException.expect(IllegalArgumentException.class);
  expectedException.expectMessage("expects keys to be written in strictly increasing order");
  sinkWriter.add(
      new ValueInEmptyWindows<>(IsmRecord.of(ImmutableList.of(EMPTY, new byte[] {0x00}), EMPTY)));
}
Example #24
Source File: IsmSinkTest.java From beam with Apache License 2.0
@Test
public void testWriteKeyWhichIsProperPrefixOfPreviousSecondaryKeyIsError() throws Throwable {
  IsmSink<byte[]> sink =
      new IsmSink<>(
          FileSystems.matchNewResource(tmpFolder.newFile().getPath(), false),
          CODER,
          BLOOM_FILTER_SIZE_LIMIT);
  SinkWriter<WindowedValue<IsmRecord<byte[]>>> sinkWriter = sink.writer();
  sinkWriter.add(
      new ValueInEmptyWindows<>(
          IsmRecord.of(ImmutableList.of(EMPTY, new byte[] {0x00, 0x00}), EMPTY)));

  expectedException.expect(IllegalArgumentException.class);
  expectedException.expectMessage("expects keys to be written in strictly increasing order");
  sinkWriter.add(
      new ValueInEmptyWindows<>(IsmRecord.of(ImmutableList.of(EMPTY, new byte[] {0x00}), EMPTY)));
}
Example #25
Source File: IsmSinkTest.java From beam with Apache License 2.0
@Test
public void testWriteEmptyKeyWithValueLargerThanBlockSize() throws Throwable {
  IsmSink<byte[]> sink =
      new IsmSink<>(
          FileSystems.matchNewResource(tmpFolder.newFile().getPath(), false),
          IsmRecordCoder.of(
              1, // We hash using only the window
              0, // There are no metadata records
              // We specifically use a coder that encodes to 0 bytes.
              ImmutableList.<Coder<?>>of(VoidCoder.of()),
              ByteArrayCoder.of()),
          BLOOM_FILTER_SIZE_LIMIT);
  SinkWriter<WindowedValue<IsmRecord<byte[]>>> sinkWriter = sink.writer();
  sinkWriter.add(
      new ValueInEmptyWindows<>(
          IsmRecord.of(
              Arrays.asList(new Object[] {null}), new byte[IsmSink.BLOCK_SIZE_BYTES * 2])));
  sinkWriter.close();
}
Example #26
Source File: TikaIO.java From beam with Apache License 2.0
@Setup
public void setup() throws Exception {
  if (spec.getTikaConfigPath() != null) {
    ResourceId configResource =
        FileSystems.matchSingleFileSpec(spec.getTikaConfigPath().get()).resourceId();
    tikaConfig = new TikaConfig(Channels.newInputStream(FileSystems.open(configResource)));
  }
}
Example #27
Source File: AvroByteReaderTest.java From beam with Apache License 2.0
/** Write input elements to a file and return information about the Avro-encoded file. */
private <T> AvroFileInfo<T> initInputFile(List<List<T>> elemsList, Coder<T> coder)
    throws Exception {
  File tmpFile = tmpFolder.newFile("file.avro");
  AvroFileInfo<T> fileInfo = new AvroFileInfo<>();
  fileInfo.filename = tmpFile.getPath();

  // Write the data.
  OutputStream outStream =
      Channels.newOutputStream(
          FileSystems.create(
              FileSystems.matchNewResource(fileInfo.filename, false), MimeTypes.BINARY));
  Schema schema = Schema.create(Schema.Type.BYTES);
  DatumWriter<ByteBuffer> datumWriter = new GenericDatumWriter<>(schema);
  try (DataFileWriter<ByteBuffer> fileWriter = new DataFileWriter<>(datumWriter)) {
    fileWriter.create(schema, outStream);
    boolean first = true;
    for (List<T> elems : elemsList) {
      if (first) {
        first = false;
      } else {
        // Ensure a block boundary here.
        long syncPoint = fileWriter.sync();
        fileInfo.syncPoints.add(syncPoint);
      }
      for (T elem : elems) {
        byte[] encodedElement = CoderUtils.encodeToByteArray(coder, elem);
        fileWriter.append(ByteBuffer.wrap(encodedElement));
        fileInfo.elementSizes.add(encodedElement.length);
        fileInfo.totalElementEncodedSize += encodedElement.length;
      }
    }
  }
  return fileInfo;
}
Example #28
Source File: SnowflakeIO.java From beam with Apache License 2.0
@ProcessElement
public void processElement(ProcessContext c) throws IOException {
  String combinedPath = stagingBucketDir + "/**";
  List<ResourceId> paths =
      FileSystems.match(combinedPath).metadata().stream()
          .map(metadata -> metadata.resourceId())
          .collect(Collectors.toList());

  FileSystems.delete(paths, MoveOptions.StandardMoveOptions.IGNORE_MISSING_FILES);
}
Example #29
Source File: FilePatternMatchingShardedFile.java From beam with Apache License 2.0
/** Discovers all shards of this file using the provided {@link Sleeper} and {@link BackOff}. */
@Override
public List<String> readFilesWithRetries(Sleeper sleeper, BackOff backOff)
    throws IOException, InterruptedException {
  IOException lastException = null;

  do {
    try {
      Collection<Metadata> files = FileSystems.match(filePattern).metadata();
      LOG.debug(
          "Found file(s) {} by matching the path: {}",
          files.stream()
              .map(Metadata::resourceId)
              .map(ResourceId::getFilename)
              .collect(Collectors.joining(",")),
          filePattern);
      if (files.isEmpty()) {
        continue;
      }
      // Read data from file paths
      return readLines(files);
    } catch (IOException e) {
      // Ignore and retry
      lastException = e;
      LOG.warn("Error in file reading. Ignore and retry.");
    }
  } while (BackOffUtils.next(sleeper, backOff));
  // Failed after max retries
  throw new IOException(
      String.format("Unable to read file(s) after retrying %d times", MAX_READ_RETRIES),
      lastException);
}
Example #30
Source File: MetadataCoderV2Test.java From beam with Apache License 2.0
@Test
public void testEncodeDecodeWithCustomLastModifiedMills() throws Exception {
  Path filePath = tmpFolder.newFile("somefile").toPath();
  Metadata metadata =
      Metadata.builder()
          .setResourceId(
              FileSystems.matchNewResource(filePath.toString(), false /* isDirectory */))
          .setIsReadSeekEfficient(true)
          .setSizeBytes(1024)
          .setLastModifiedMillis(1541097000L)
          .build();
  CoderProperties.coderDecodeEncodeEqual(MetadataCoderV2.of(), metadata);
}