Java Code Examples for org.apache.flink.core.fs.FileSystem#listStatus()
The following examples show how to use org.apache.flink.core.fs.FileSystem#listStatus().
Each example is taken from an open-source project; the source file, originating project, and license are noted above the code.
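All of the examples share one basic pattern: resolve the FileSystem for a Path, call listStatus() on a directory, and handle a null result, which the examples below treat as "path does not exist". Here is a minimal, self-contained sketch of that pattern, assuming a local directory; the class name ListStatusSketch and the path /tmp/example-dir are illustrative only and appear in none of the projects below.

import java.io.IOException;

import org.apache.flink.core.fs.FileStatus;
import org.apache.flink.core.fs.FileSystem;
import org.apache.flink.core.fs.Path;

public class ListStatusSketch {

    public static void main(String[] args) throws IOException {
        // The URI scheme (file://, hdfs://, s3://, ...) selects the FileSystem implementation.
        Path dir = new Path("file:///tmp/example-dir");
        FileSystem fs = dir.getFileSystem();

        // As in the examples below, guard against a null result:
        // listStatus() may return null when the path does not exist.
        FileStatus[] statuses = fs.listStatus(dir);
        if (statuses == null) {
            throw new IOException("Directory does not exist: " + dir);
        }

        // Print each entry, marking sub-directories.
        for (FileStatus status : statuses) {
            System.out.println(status.getPath() + (status.isDir() ? " (directory)" : ""));
        }
    }
}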
Example 1
Source File: BinaryInputFormat.java From flink with Apache License 2.0
protected List<FileStatus> getFiles() throws IOException {
    // get all the files that are involved in the splits
    List<FileStatus> files = new ArrayList<>();

    for (Path filePath : getFilePaths()) {
        final FileSystem fs = filePath.getFileSystem();
        final FileStatus pathFile = fs.getFileStatus(filePath);

        if (pathFile.isDir()) {
            // input is directory. list all contained files
            final FileStatus[] partials = fs.listStatus(filePath);
            for (FileStatus partial : partials) {
                if (!partial.isDir()) {
                    files.add(partial);
                }
            }
        } else {
            files.add(pathFile);
        }
    }

    return files;
}
Example 2
Source File: StanfordTweetsDataSetInputFormat.java From flink-examples with MIT License
@Override
public TweetFileInputSplit[] createInputSplits(int minNumSplits) throws IOException {
    FileSystem fileSystem = getFileSystem();
    FileStatus[] statuses = fileSystem.listStatus(new Path(inputPath));
    logger.info("Found {} files", statuses.length);

    List<TweetFileInputSplit> splits = new ArrayList<>();
    for (int i = 0; i < statuses.length; i++) {
        FileStatus status = statuses[i];
        String fileName = status.getPath().getName();
        if (fileName.endsWith("edges")) {
            splits.add(new TweetFileInputSplit(i, status.getPath()));
        }
    }
    logger.info("Result number of splits: {}", splits.size());
    return splits.toArray(new TweetFileInputSplit[splits.size()]);
}
Example 3
Source File: RocksDBIncrementalRestoreOperation.java From flink with Apache License 2.0
/**
 * This recreates the new working directory of the recovered RocksDB instance and links/copies the contents from
 * a local state.
 */
private void restoreInstanceDirectoryFromPath(Path source, String instanceRocksDBPath) throws IOException {
    FileSystem fileSystem = source.getFileSystem();

    final FileStatus[] fileStatuses = fileSystem.listStatus(source);
    if (fileStatuses == null) {
        throw new IOException("Cannot list file statuses. Directory " + source + " does not exist.");
    }

    for (FileStatus fileStatus : fileStatuses) {
        final Path filePath = fileStatus.getPath();
        final String fileName = filePath.getName();
        File restoreFile = new File(source.getPath(), fileName);
        File targetFile = new File(instanceRocksDBPath, fileName);
        if (fileName.endsWith(SST_FILE_SUFFIX)) {
            // hard-link the immutable sst-files
            Files.createLink(targetFile.toPath(), restoreFile.toPath());
        } else {
            // true copy for all other files
            Files.copy(restoreFile.toPath(), targetFile.toPath(), StandardCopyOption.REPLACE_EXISTING);
        }
    }
}
Example 4
Source File: FileUtils.java From flink with Apache License 2.0
private static void addToZip(Path fileOrDirectory, FileSystem fs, Path rootDir, ZipOutputStream out) throws IOException {
    String relativePath = fileOrDirectory.getPath().replace(rootDir.getPath() + '/', "");
    if (fs.getFileStatus(fileOrDirectory).isDir()) {
        out.putNextEntry(new ZipEntry(relativePath + '/'));
        for (FileStatus containedFile : fs.listStatus(fileOrDirectory)) {
            addToZip(containedFile.getPath(), fs, rootDir, out);
        }
    } else {
        ZipEntry entry = new ZipEntry(relativePath);
        out.putNextEntry(entry);

        try (FSDataInputStream in = fs.open(fileOrDirectory)) {
            IOUtils.copyBytes(in, out, false);
        }
        out.closeEntry();
    }
}
Example 5
Source File: FileMonitoringFunction.java From flink with Apache License 2.0
private List<String> listNewFiles(FileSystem fileSystem) throws IOException {
    List<String> files = new ArrayList<String>();

    FileStatus[] statuses = fileSystem.listStatus(new Path(path));
    if (statuses == null) {
        LOG.warn("Path does not exist: {}", path);
    } else {
        for (FileStatus status : statuses) {
            Path filePath = status.getPath();
            String fileName = filePath.getName();
            long modificationTime = status.getModificationTime();

            if (!isFiltered(fileName, modificationTime)) {
                files.add(filePath.toString());
                modificationTimes.put(fileName, modificationTime);
            }
        }
    }
    return files;
}
Example 6
Source File: PartitionTempFileManager.java From flink with Apache License 2.0
/**
 * Returns checkpoints whose keys are less than or equal to {@code toCpId}
 * in temporary base path.
 */
public static long[] headCheckpoints(FileSystem fs, Path basePath, long toCpId) throws IOException {
    List<Long> cps = new ArrayList<>();

    for (FileStatus taskStatus : fs.listStatus(basePath)) {
        String name = taskStatus.getPath().getName();
        if (isCheckpointDir(name)) {
            long currentCp = getCheckpointId(name);
            // collect paths whose checkpoint id is less than or equal to toCpId
            if (currentCp <= toCpId) {
                cps.add(currentCp);
            }
        }
    }

    return cps.stream().mapToLong(v -> v).toArray();
}
Example 7
Source File: RocksDBIncrementalRestoreOperation.java From Flink-CEPplus with Apache License 2.0
/**
 * This recreates the new working directory of the recovered RocksDB instance and links/copies the contents from
 * a local state.
 */
private void restoreInstanceDirectoryFromPath(Path source, String instanceRocksDBPath) throws IOException {
    FileSystem fileSystem = source.getFileSystem();

    final FileStatus[] fileStatuses = fileSystem.listStatus(source);
    if (fileStatuses == null) {
        throw new IOException("Cannot list file statuses. Directory " + source + " does not exist.");
    }

    for (FileStatus fileStatus : fileStatuses) {
        final Path filePath = fileStatus.getPath();
        final String fileName = filePath.getName();
        File restoreFile = new File(source.getPath(), fileName);
        File targetFile = new File(instanceRocksDBPath, fileName);
        if (fileName.endsWith(SST_FILE_SUFFIX)) {
            // hard-link the immutable sst-files
            Files.createLink(targetFile.toPath(), restoreFile.toPath());
        } else {
            // true copy for all other files
            Files.copy(restoreFile.toPath(), targetFile.toPath(), StandardCopyOption.REPLACE_EXISTING);
        }
    }
}
Example 8
Source File: PartitionTempFileManager.java From flink with Apache License 2.0
/**
 * Returns task temporary paths in this checkpoint.
 */
public static List<Path> listTaskTemporaryPaths(
        FileSystem fs, Path basePath, long checkpointId) throws Exception {
    List<Path> taskTmpPaths = new ArrayList<>();

    for (FileStatus taskStatus : fs.listStatus(new Path(basePath, checkpointName(checkpointId)))) {
        if (isTaskDir(taskStatus.getPath().getName())) {
            taskTmpPaths.add(taskStatus.getPath());
        }
    }

    return taskTmpPaths;
}
Example 9
Source File: ContinuousFileMonitoringFunction.java From Flink-CEPplus with Apache License 2.0
/**
 * Returns the paths of the files not yet processed.
 *
 * @param fileSystem The filesystem where the monitored directory resides.
 */
private Map<Path, FileStatus> listEligibleFiles(FileSystem fileSystem, Path path) throws IOException {
    final FileStatus[] statuses;
    try {
        statuses = fileSystem.listStatus(path);
    } catch (IOException e) {
        // we may run into an IOException if files are moved while listing their status
        // delay the check for eligible files in this case
        return Collections.emptyMap();
    }

    if (statuses == null) {
        LOG.warn("Path does not exist: {}", path);
        return Collections.emptyMap();
    } else {
        Map<Path, FileStatus> files = new HashMap<>();
        // handle the new files
        for (FileStatus status : statuses) {
            if (!status.isDir()) {
                Path filePath = status.getPath();
                long modificationTime = status.getModificationTime();
                if (!shouldIgnore(filePath, modificationTime)) {
                    files.put(filePath, status);
                }
            } else if (format.getNestedFileEnumeration() && format.acceptFile(status)) {
                files.putAll(listEligibleFiles(fileSystem, status.getPath()));
            }
        }
        return files;
    }
}
Example 10
Source File: ContinuousFileMonitoringFunction.java From flink with Apache License 2.0
/**
 * Returns the paths of the files not yet processed.
 *
 * @param fileSystem The filesystem where the monitored directory resides.
 */
private Map<Path, FileStatus> listEligibleFiles(FileSystem fileSystem, Path path) throws IOException {
    final FileStatus[] statuses;
    try {
        statuses = fileSystem.listStatus(path);
    } catch (IOException e) {
        // we may run into an IOException if files are moved while listing their status
        // delay the check for eligible files in this case
        return Collections.emptyMap();
    }

    if (statuses == null) {
        LOG.warn("Path does not exist: {}", path);
        return Collections.emptyMap();
    } else {
        Map<Path, FileStatus> files = new HashMap<>();
        // handle the new files
        for (FileStatus status : statuses) {
            if (!status.isDir()) {
                Path filePath = status.getPath();
                long modificationTime = status.getModificationTime();
                if (!shouldIgnore(filePath, modificationTime)) {
                    files.put(filePath, status);
                }
            } else if (format.getNestedFileEnumeration() && format.acceptFile(status)) {
                files.putAll(listEligibleFiles(fileSystem, status.getPath()));
            }
        }
        return files;
    }
}
Example 11
Source File: PartitionPathUtils.java From flink with Apache License 2.0
/**
 * List file status without hidden files.
 */
public static FileStatus[] listStatusWithoutHidden(FileSystem fs, Path dir) throws IOException {
    FileStatus[] statuses = fs.listStatus(dir);
    if (statuses == null) {
        return null;
    }
    return Arrays.stream(statuses)
            .filter(fileStatus -> !isHiddenFile(fileStatus))
            .toArray(FileStatus[]::new);
}
Example 12
Source File: FileUtils.java From flink with Apache License 2.0
private static void internalCopyDirectory(
        Path sourcePath, Path targetPath, boolean executable, FileSystem sFS, FileSystem tFS) throws IOException {
    tFS.mkdirs(targetPath);

    FileStatus[] contents = sFS.listStatus(sourcePath);
    for (FileStatus content : contents) {
        String distPath = content.getPath().toString();
        if (content.isDir()) {
            if (distPath.endsWith("/")) {
                distPath = distPath.substring(0, distPath.length() - 1);
            }
        }
        String localPath = targetPath + distPath.substring(distPath.lastIndexOf("/"));
        copy(content.getPath(), new Path(localPath), executable);
    }
}
Example 13
Source File: HadoopSwiftFileSystemITCase.java From Flink-CEPplus with Apache License 2.0
@Test
public void testDirectoryListing() throws Exception {
    final Configuration conf = createConfiguration();

    FileSystem.initialize(conf);

    final Path directory = new Path("swift://" + CONTAINER + '.' + SERVICENAME + '/' + TEST_DATA_DIR + "/testdir/");
    final FileSystem fs = directory.getFileSystem();

    // directory must not yet exist
    assertFalse(fs.exists(directory));

    try {
        // create directory
        assertTrue(fs.mkdirs(directory));

        // seems the file system does not assume existence of empty directories
        assertTrue(fs.exists(directory));

        // directory empty
        assertEquals(0, fs.listStatus(directory).length);

        // create some files
        final int numFiles = 3;
        for (int i = 0; i < numFiles; i++) {
            Path file = new Path(directory, "/file-" + i);
            try (FSDataOutputStream out = fs.create(file, FileSystem.WriteMode.NO_OVERWRITE);
                    OutputStreamWriter writer = new OutputStreamWriter(out, StandardCharsets.UTF_8)) {
                writer.write("hello-" + i + "\n");
            }
        }

        FileStatus[] files = fs.listStatus(directory);
        assertNotNull(files);
        assertEquals(3, files.length);

        for (FileStatus status : files) {
            assertFalse(status.isDir());
        }

        // now that there are files, the directory must exist
        assertTrue(fs.exists(directory));
    } finally {
        // clean up
        fs.delete(directory, true);
    }

    // now directory must be gone
    assertFalse(fs.exists(directory));
}
Example 14
Source File: StanfordTweetsDataSetInputFormat.java From flink-examples with MIT License
@Override
public BaseStatistics getStatistics(BaseStatistics cachedStatistics) throws IOException {
    FileSystem fileSystem = getFileSystem();
    FileStatus[] statuses = fileSystem.listStatus(new Path(inputPath));
    return new GraphStatistics(statuses.length);
}
Example 15
Source File: AbstractFileCheckpointStorageTestBase.java From flink with Apache License 2.0
/**
 * Validates that multiple checkpoints from different jobs with the same checkpoint ID do not
 * interfere with each other.
 */
@Test
public void testPersistMultipleMetadataOnlyCheckpoints() throws Exception {
    final FileSystem fs = FileSystem.getLocalFileSystem();
    final Path checkpointDir = new Path(tmp.newFolder().toURI());

    final long checkpointId = 177;

    final CheckpointStorage storage1 = createCheckpointStorage(checkpointDir);
    storage1.initializeBaseLocations();
    final CheckpointStorage storage2 = createCheckpointStorage(checkpointDir);
    storage2.initializeBaseLocations();

    final CheckpointStorageLocation loc1 = storage1.initializeLocationForCheckpoint(checkpointId);
    final CheckpointStorageLocation loc2 = storage2.initializeLocationForCheckpoint(checkpointId);

    final byte[] data1 = {77, 66, 55, 99, 88};
    final byte[] data2 = {1, 3, 2, 5, 4};

    final CompletedCheckpointStorageLocation completedLocation1;
    try (CheckpointMetadataOutputStream out = loc1.createMetadataOutputStream()) {
        out.write(data1);
        completedLocation1 = out.closeAndFinalizeCheckpoint();
    }
    final String result1 = completedLocation1.getExternalPointer();

    final CompletedCheckpointStorageLocation completedLocation2;
    try (CheckpointMetadataOutputStream out = loc2.createMetadataOutputStream()) {
        out.write(data2);
        completedLocation2 = out.closeAndFinalizeCheckpoint();
    }
    final String result2 = completedLocation2.getExternalPointer();

    // check that this went to a file, but in a nested directory structure

    // one directory per storage
    FileStatus[] files = fs.listStatus(checkpointDir);
    assertEquals(2, files.length);

    // in each per-storage directory, one for the checkpoint
    FileStatus[] job1Files = fs.listStatus(files[0].getPath());
    FileStatus[] job2Files = fs.listStatus(files[1].getPath());
    assertTrue(job1Files.length >= 1);
    assertTrue(job2Files.length >= 1);

    assertTrue(fs.exists(new Path(result1, AbstractFsCheckpointStorage.METADATA_FILE_NAME)));
    assertTrue(fs.exists(new Path(result2, AbstractFsCheckpointStorage.METADATA_FILE_NAME)));

    // check that both storages can resolve each other's contents
    validateContents(storage1.resolveCheckpoint(result1).getMetadataHandle(), data1);
    validateContents(storage1.resolveCheckpoint(result2).getMetadataHandle(), data2);
    validateContents(storage2.resolveCheckpoint(result1).getMetadataHandle(), data1);
    validateContents(storage2.resolveCheckpoint(result2).getMetadataHandle(), data2);
}
Example 16
Source File: BlobServerRecoveryTest.java From flink with Apache License 2.0
/**
 * Helper to test that the {@link BlobServer} recovery from its HA store works.
 *
 * <p>Uploads two BLOBs to one {@link BlobServer} and expects a second one to be able to retrieve
 * them via a shared HA store upon request of a {@link BlobCacheService}.
 *
 * @param config
 *     blob server configuration (including HA settings like {@link HighAvailabilityOptions#HA_STORAGE_PATH}
 *     and {@link HighAvailabilityOptions#HA_CLUSTER_ID}) used to set up <tt>blobStore</tt>
 * @param blobStore
 *     shared HA blob store to use
 *
 * @throws IOException
 *     in case of failures
 */
public static void testBlobServerRecovery(final Configuration config, final BlobStore blobStore) throws IOException {
    final String clusterId = config.getString(HighAvailabilityOptions.HA_CLUSTER_ID);
    String storagePath = config.getString(HighAvailabilityOptions.HA_STORAGE_PATH) + "/" + clusterId;
    Random rand = new Random();

    try (
        BlobServer server0 = new BlobServer(config, blobStore);
        BlobServer server1 = new BlobServer(config, blobStore);
        // use VoidBlobStore as the HA store to force download from server[1]'s HA store
        BlobCacheService cache1 = new BlobCacheService(
            config, new VoidBlobStore(), new InetSocketAddress("localhost", server1.getPort())
        )) {

        server0.start();
        server1.start();

        // Random data
        byte[] expected = new byte[1024];
        rand.nextBytes(expected);
        byte[] expected2 = Arrays.copyOfRange(expected, 32, 288);

        BlobKey[] keys = new BlobKey[2];
        BlobKey nonHAKey;

        // Put job-related HA data
        JobID[] jobId = new JobID[] { new JobID(), new JobID() };
        keys[0] = put(server0, jobId[0], expected, PERMANENT_BLOB); // Request 1
        keys[1] = put(server0, jobId[1], expected2, PERMANENT_BLOB); // Request 2

        // put non-HA data
        nonHAKey = put(server0, jobId[0], expected2, TRANSIENT_BLOB);
        verifyKeyDifferentHashEquals(keys[1], nonHAKey);

        // check that the storage directory exists
        final Path blobServerPath = new Path(storagePath, "blob");
        FileSystem fs = blobServerPath.getFileSystem();
        assertTrue("Unknown storage dir: " + blobServerPath, fs.exists(blobServerPath));

        // Verify HA requests from cache1 (connected to server1) with no immediate access to the file
        verifyContents(cache1, jobId[0], keys[0], expected);
        verifyContents(cache1, jobId[1], keys[1], expected2);

        // Verify non-HA file is not accessible from server1
        verifyDeleted(cache1, jobId[0], nonHAKey);

        // Remove again
        server1.cleanupJob(jobId[0], true);
        server1.cleanupJob(jobId[1], true);

        // Verify everything is clean
        assertTrue("HA storage directory does not exist", fs.exists(new Path(storagePath)));
        if (fs.exists(blobServerPath)) {
            final org.apache.flink.core.fs.FileStatus[] recoveryFiles = fs.listStatus(blobServerPath);
            ArrayList<String> filenames = new ArrayList<>(recoveryFiles.length);
            for (org.apache.flink.core.fs.FileStatus file : recoveryFiles) {
                filenames.add(file.toString());
            }
            fail("Unclean state backend: " + filenames);
        }
    }
}