Java Code Examples for org.apache.hadoop.fs.FileStatus#getModificationTime()
The following examples show how to use org.apache.hadoop.fs.FileStatus#getModificationTime().
Each example comes from an open-source project; the source file, project, and license are noted above the code.
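For orientation, getModificationTime() returns a file's last-modification time as milliseconds since the epoch (UTC), read from a FileStatus obtained through a FileSystem call such as getFileStatus() or listStatus(). The sketch below is a minimal, self-contained illustration rather than part of any project listed here; the class name, default path, and argument handling are invented for the example.

import java.time.Instant;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ModificationTimeDemo {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Illustrative path; pass a real file as the first argument.
    Path path = new Path(args.length > 0 ? args[0] : "/tmp/example.txt");
    try (FileSystem fs = path.getFileSystem(conf)) {
      FileStatus status = fs.getFileStatus(path);
      // getModificationTime() returns epoch milliseconds.
      long mtimeMillis = status.getModificationTime();
      System.out.println(path + " last modified at " + Instant.ofEpochMilli(mtimeMillis));
    }
  }
}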
Example 1
Source File: HadoopFileSystem.java From jsr203-hadoop with Apache License 2.0
public void setTimes(byte[] bs, FileTime mtime, FileTime atime, FileTime ctime)
    throws IOException {
  org.apache.hadoop.fs.Path hp = new HadoopPath(this, bs).getRawResolvedPath();
  long mtime_millis = 0;
  long atime_millis = 0;
  // Get actual value
  if (mtime == null || atime == null) {
    FileStatus stat = this.fs.getFileStatus(hp);
    mtime_millis = stat.getModificationTime();
    atime_millis = stat.getAccessTime();
  }
  if (mtime != null) {
    mtime_millis = mtime.toMillis();
  }
  if (atime != null) {
    atime_millis = atime.toMillis();
  }
  this.fs.setTimes(hp, mtime_millis, atime_millis);
}
Example 2
Source File: FSDownload.java From hadoop with Apache License 2.0
private Path copy(Path sCopy, Path dstdir) throws IOException {
  FileSystem sourceFs = sCopy.getFileSystem(conf);
  Path dCopy = new Path(dstdir, "tmp_" + sCopy.getName());
  FileStatus sStat = sourceFs.getFileStatus(sCopy);
  if (sStat.getModificationTime() != resource.getTimestamp()) {
    throw new IOException("Resource " + sCopy +
        " changed on src filesystem (expected " + resource.getTimestamp() +
        ", was " + sStat.getModificationTime());
  }
  if (resource.getVisibility() == LocalResourceVisibility.PUBLIC) {
    if (!isPublic(sourceFs, sCopy, sStat, statCache)) {
      throw new IOException("Resource " + sCopy +
          " is not publicly accessable and as such cannot be part of the" +
          " public cache.");
    }
  }
  FileUtil.copy(sourceFs, sStat, FileSystem.getLocal(conf), dCopy, false, true, conf);
  return dCopy;
}
Example 3
Source File: AggregatedLogDeletionService.java From hadoop with Apache License 2.0
private static boolean shouldDeleteLogDir(FileStatus dir, long cutoffMillis,
    FileSystem fs) {
  boolean shouldDelete = true;
  try {
    for (FileStatus node : fs.listStatus(dir.getPath())) {
      if (node.getModificationTime() >= cutoffMillis) {
        shouldDelete = false;
        break;
      }
    }
  } catch (IOException e) {
    logIOException("Error reading the contents of " + dir.getPath(), e);
    shouldDelete = false;
  }
  return shouldDelete;
}
Example 4
Source File: FileSplitterInput.java From attic-apex-malhar with Apache License 2.0
private void scan(Path filePath, Path rootPath,
    Map<String, Long> lastModifiedTimesForInputDir) throws IOException {
  FileStatus parentStatus = fs.getFileStatus(filePath);
  String parentPathStr = filePath.toUri().getPath();

  LOG.debug("scan {}", parentPathStr);

  FileStatus[] childStatuses = fs.listStatus(filePath);

  if (childStatuses.length == 0 && rootPath == null &&
      (lastModifiedTimesForInputDir == null ||
       lastModifiedTimesForInputDir.get(parentPathStr) == null)) {
    // empty input directory copy as is
    ScannedFileInfo info = new ScannedFileInfo(null, filePath.toString(),
        parentStatus.getModificationTime());
    processDiscoveredFile(info);
  }

  for (FileStatus childStatus : childStatuses) {
    Path childPath = childStatus.getPath();
    String childPathStr = childPath.toUri().getPath();

    if (childStatus.isDirectory() && isRecursive()) {
      addToDiscoveredFiles(rootPath, parentStatus, childStatus, lastModifiedTimesForInputDir);
      scan(childPath, rootPath == null ? parentStatus.getPath() : rootPath,
          lastModifiedTimesForInputDir);
    } else if (acceptFile(childPathStr)) {
      addToDiscoveredFiles(rootPath, parentStatus, childStatus, lastModifiedTimesForInputDir);
    } else {
      // don't look at it again
      ignoredFiles.add(childPathStr);
    }
  }
}
Example 5
Source File: HdfsDirectory.java From incubator-retired-blur with Apache License 2.0
protected long fileModified(String name) throws IOException {
  Path path = getPath(name);
  Tracer trace = Trace.trace("filesystem - fileModified", Trace.param("path", path));
  try {
    FileStatus fileStatus = _fileSystem.getFileStatus(path);
    if (_useCache) {
      _fileStatusCache.putFStat(name, new FStat(fileStatus));
    }
    return fileStatus.getModificationTime();
  } finally {
    trace.done();
  }
}
Example 6
Source File: HDFSResourceStore.java From kylin with Apache License 2.0
@Override
protected RawResource getResourceImpl(String resPath) throws IOException {
  Path p = getRealHDFSPath(resPath);
  if (fs.exists(p) && fs.isFile(p)) {
    FileStatus fileStatus = fs.getFileStatus(p);
    if (fileStatus.getLen() == 0) {
      logger.warn("Zero length file: {}. ", p);
    }
    FSDataInputStream in = fs.open(p);
    long ts = fileStatus.getModificationTime();
    return new RawResource(resPath, ts, in);
  } else {
    return null;
  }
}
Example 7
Source File: MyClient.java From yarn-beginners-examples with Apache License 2.0
private Map<String, String> getAMEnvironment(Map<String, LocalResource> localResources,
    FileSystem fs) throws IOException {
  Map<String, String> env = new HashMap<String, String>();

  // Set ApplicationMaster jar file
  LocalResource appJarResource = localResources.get(Constants.AM_JAR_NAME);
  Path hdfsAppJarPath = new Path(fs.getHomeDirectory(), appJarResource.getResource().getFile());
  FileStatus hdfsAppJarStatus = fs.getFileStatus(hdfsAppJarPath);
  long hdfsAppJarLength = hdfsAppJarStatus.getLen();
  long hdfsAppJarTimestamp = hdfsAppJarStatus.getModificationTime();

  env.put(Constants.AM_JAR_PATH, hdfsAppJarPath.toString());
  env.put(Constants.AM_JAR_TIMESTAMP, Long.toString(hdfsAppJarTimestamp));
  env.put(Constants.AM_JAR_LENGTH, Long.toString(hdfsAppJarLength));

  // Add AppMaster.jar location to classpath
  // At some point we should not be required to add
  // the hadoop specific classpaths to the env.
  // It should be provided out of the box.
  // For now setting all required classpaths including
  // the classpath to "." for the application jar
  StringBuilder classPathEnv = new StringBuilder(Environment.CLASSPATH.$$())
      .append(ApplicationConstants.CLASS_PATH_SEPARATOR).append("./*");
  for (String c : conf.getStrings(
      YarnConfiguration.YARN_APPLICATION_CLASSPATH,
      YarnConfiguration.DEFAULT_YARN_CROSS_PLATFORM_APPLICATION_CLASSPATH)) {
    classPathEnv.append(ApplicationConstants.CLASS_PATH_SEPARATOR);
    classPathEnv.append(c.trim());
  }
  env.put("CLASSPATH", classPathEnv.toString());

  return env;
}
Example 8
Source File: SharedCacheUploader.java From hadoop with Apache License 2.0
/**
 * Checks that the (original) remote file is either owned by the user who
 * started the app or public.
 */
@VisibleForTesting
boolean verifyAccess() throws IOException {
  // if it is in the public cache, it's trivially OK
  if (resource.getVisibility() == LocalResourceVisibility.PUBLIC) {
    return true;
  }

  final Path remotePath;
  try {
    remotePath = ConverterUtils.getPathFromYarnURL(resource.getResource());
  } catch (URISyntaxException e) {
    throw new IOException("Invalid resource", e);
  }

  // get the file status of the HDFS file
  FileSystem remoteFs = remotePath.getFileSystem(conf);
  FileStatus status = remoteFs.getFileStatus(remotePath);
  // check to see if the file has been modified in any way
  if (status.getModificationTime() != resource.getTimestamp()) {
    LOG.warn("The remote file " + remotePath +
        " has changed since it's localized; will not consider it for upload");
    return false;
  }

  // check for the user ownership
  if (status.getOwner().equals(user)) {
    return true; // the user owns the file
  }
  // check if the file is publicly readable otherwise
  return fileIsPublic(remotePath, remoteFs, status);
}
Example 9
Source File: HadoopConnectingFileSystemProvider.java From CloverETL-Engine with GNU Lesser General Public License v2.1
@Override
public HadoopFileStatus getExtendedStatus(URI path) throws IOException {
  checkConnected();
  FileStatus status = dfs.getFileStatus(new Path(path));
  return new HadoopFileStatus(status.getPath().toUri(), status.getLen(), status.isDir(),
      status.getModificationTime(), status.getBlockSize(), status.getGroup(),
      status.getOwner(), status.getReplication());
}
Example 10
Source File: HdfsSortedOplogOrganizer.java From gemfirexd-oss with Apache License 2.0
/**
 * @param ts
 *          target timestamp
 * @return list of hoplogs, whose expiry markers were created before target
 *         timestamp, and the expiry marker itself.
 * @throws IOException
 */
protected List<FileStatus> getOptimizationTargets(long ts) throws IOException {
  if (logger.finerEnabled()) {
    logger.finer("Identifying optimization targets " + ts);
  }

  List<FileStatus> deleteTargets = new ArrayList<FileStatus>();
  FileStatus[] markers = getExpiryMarkers();
  if (markers != null) {
    for (FileStatus marker : markers) {
      String name = truncateExpiryExtension(marker.getPath().getName());
      long timestamp = marker.getModificationTime();

      // expired minor compacted files are not being used anywhere. These can
      // be removed immediately. All the other expired files should be removed
      // when the files have aged
      boolean isTarget = false;

      if (name.endsWith(MINOR_HOPLOG_EXTENSION)) {
        isTarget = true;
      } else if (timestamp < ts && name.endsWith(FLUSH_HOPLOG_EXTENSION)) {
        isTarget = true;
      } else if (timestamp < ts && name.endsWith(MAJOR_HOPLOG_EXTENSION)) {
        HDFSCompactionConfig compactionConf = store.getHDFSCompactionConfig();
        long majorCInterval = ((long) compactionConf.getMajorCompactionIntervalMins()) * 60 * 1000;
        if (timestamp < (System.currentTimeMillis() - majorCInterval)) {
          isTarget = true;
        }
      }
      if (!isTarget) {
        continue;
      }

      // if the file is still being read, do not delete or rename it
      TrackedReference<Hoplog> used = hoplogReadersController.getInactiveHoplog(name);
      if (used != null) {
        if (used.inUse() && logger.fineEnabled()) {
          logger.fine("Optimizer: found active expired hoplog:" + name);
        } else if (logger.fineEnabled()) {
          logger.fine("Optimizer: found open expired hoplog:" + name);
        }
        continue;
      }

      if (logger.fineEnabled()) {
        logger.fine("Delete target identified " + marker.getPath());
      }

      deleteTargets.add(marker);
      Path hoplogPath = new Path(bucketPath, name);
      if (store.getFileSystem().exists(hoplogPath)) {
        FileStatus hoplog = store.getFileSystem().getFileStatus(hoplogPath);
        deleteTargets.add(hoplog);
      }
    }
  }
  return deleteTargets;
}
Example 11
Source File: NativeAzureFileSystemBaseTest.java From hadoop with Apache License 2.0
private boolean testModifiedTime(Path testPath, long time) throws Exception {
  FileStatus fileStatus = fs.getFileStatus(testPath);
  final long errorMargin = modifiedTimeErrorMargin;
  long lastModified = fileStatus.getModificationTime();
  return (lastModified > (time - errorMargin) && lastModified < (time + errorMargin));
}
Example 12
Source File: TestHFileCleaner.java From hbase with Apache License 2.0
/**
 * @param file to check
 * @return loggable information about the file
 */
private String getFileStats(Path file, FileSystem fs) throws IOException {
  FileStatus status = fs.getFileStatus(file);
  return "File" + file + ", mtime:" + status.getModificationTime() + ", atime:"
      + status.getAccessTime();
}
Example 13
Source File: TestSetTimes.java From big-c with Apache License 2.0
/**
 * Tests mod time change at close in DFS.
 */
@Test
public void testTimesAtClose() throws IOException {
  Configuration conf = new HdfsConfiguration();
  final int MAX_IDLE_TIME = 2000; // 2s
  int replicas = 1;

  // parameter initialization
  conf.setInt("ipc.client.connection.maxidletime", MAX_IDLE_TIME);
  conf.setInt(DFSConfigKeys.DFS_NAMENODE_HEARTBEAT_RECHECK_INTERVAL_KEY, 1000);
  conf.setInt(DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_KEY, 1);
  conf.setInt(DFSConfigKeys.DFS_DATANODE_HANDLER_COUNT_KEY, 50);
  MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
      .numDataNodes(numDatanodes)
      .build();
  cluster.waitActive();
  InetSocketAddress addr = new InetSocketAddress("localhost",
      cluster.getNameNodePort());
  DFSClient client = new DFSClient(addr, conf);
  DatanodeInfo[] info = client.datanodeReport(DatanodeReportType.LIVE);
  assertEquals("Number of Datanodes ", numDatanodes, info.length);
  FileSystem fileSys = cluster.getFileSystem();
  assertTrue(fileSys instanceof DistributedFileSystem);

  try {
    // create a new file and write to it
    Path file1 = new Path("/simple.dat");
    FSDataOutputStream stm = writeFile(fileSys, file1, replicas);
    System.out.println("Created and wrote file simple.dat");
    FileStatus statBeforeClose = fileSys.getFileStatus(file1);
    long mtimeBeforeClose = statBeforeClose.getModificationTime();
    String mdateBeforeClose = dateForm.format(new Date(mtimeBeforeClose));
    System.out.println("mtime on " + file1 + " before close is "
        + mdateBeforeClose + " (" + mtimeBeforeClose + ")");
    assertTrue(mtimeBeforeClose != 0);

    // close file after writing
    stm.close();
    System.out.println("Closed file.");

    FileStatus statAfterClose = fileSys.getFileStatus(file1);
    long mtimeAfterClose = statAfterClose.getModificationTime();
    String mdateAfterClose = dateForm.format(new Date(mtimeAfterClose));
    System.out.println("mtime on " + file1 + " after close is "
        + mdateAfterClose + " (" + mtimeAfterClose + ")");
    assertTrue(mtimeAfterClose != 0);
    assertTrue(mtimeBeforeClose != mtimeAfterClose);

    cleanupFile(fileSys, file1);
  } catch (IOException e) {
    info = client.datanodeReport(DatanodeReportType.ALL);
    printDatanodeReport(info);
    throw e;
  } finally {
    fileSys.close();
    cluster.shutdown();
  }
}
Example 14
Source File: PurgeMonitor.java From RDFS with Apache License 2.0
protected static float usefulHar(
    Codec codec,
    FileSystem srcFs, FileSystem parityFs,
    Path harPath, String parityPrefix, Configuration conf,
    PlacementMonitor placementMonitor) throws IOException {

  HarIndex harIndex = HarIndex.getHarIndex(parityFs, harPath);
  Iterator<HarIndex.IndexEntry> entryIt = harIndex.getEntries();
  int numUseless = 0;
  int filesInHar = 0;
  while (entryIt.hasNext()) {
    HarIndex.IndexEntry entry = entryIt.next();
    filesInHar++;
    if (!entry.fileName.startsWith(parityPrefix)) {
      continue;
    }
    String src = entry.fileName.substring(parityPrefix.length());
    if (existsBetterParityFile(codec, new Path(src), conf)) {
      numUseless += 1;
      continue;
    }
    try {
      FileStatus srcStatus = srcFs.getFileStatus(new Path(src));
      if (srcStatus == null) {
        numUseless++;
      } else if (entry.mtime != srcStatus.getModificationTime()) {
        numUseless++;
      } else {
        // This parity file in this HAR is good.
        if (placementMonitor != null) {
          // Check placement.
          placementMonitor.checkFile(
              srcFs, srcStatus,
              parityFs, harIndex.partFilePath(entry), entry,
              codec);
        }
        if (LOG.isDebugEnabled()) {
          LOG.debug("Useful file " + entry.fileName);
        }
      }
    } catch (FileNotFoundException e) {
      numUseless++;
    }
  }
  if (filesInHar == 0) {
    return 0;
  }
  float uselessPercent = numUseless * 100.0f / filesInHar;
  return 100 - uselessPercent;
}
Example 15
Source File: GetHDFS.java From localization_nifi with Apache License 2.0
/**
 * Poll HDFS for files to process that match the configured file filters.
 *
 * @param hdfs hdfs
 * @param dir dir
 * @param filesVisited filesVisited
 * @return files to process
 * @throws java.io.IOException ex
 */
protected Set<Path> selectFiles(final FileSystem hdfs, final Path dir, Set<Path> filesVisited)
    throws IOException {
  if (null == filesVisited) {
    filesVisited = new HashSet<>();
  }

  if (!hdfs.exists(dir)) {
    throw new IOException("Selection directory " + dir.toString() + " doesn't appear to exist!");
  }

  final Set<Path> files = new HashSet<>();

  for (final FileStatus file : hdfs.listStatus(dir)) {
    if (files.size() >= MAX_WORKING_QUEUE_SIZE) {
      // no need to make the files set larger than what we would queue anyway
      break;
    }

    final Path canonicalFile = file.getPath();

    if (!filesVisited.add(canonicalFile)) { // skip files we've already seen (may be looping directory links)
      continue;
    }

    if (file.isDirectory() && processorConfig.getRecurseSubdirs()) {
      files.addAll(selectFiles(hdfs, canonicalFile, filesVisited));

    } else if (!file.isDirectory() && processorConfig.getPathFilter(dir).accept(canonicalFile)) {
      final long fileAge = System.currentTimeMillis() - file.getModificationTime();
      if (processorConfig.getMinimumAge() < fileAge && fileAge < processorConfig.getMaximumAge()) {
        files.add(canonicalFile);

        if (getLogger().isDebugEnabled()) {
          getLogger().debug(this + " selected file at path: " + canonicalFile.toString());
        }
      }
    }
  }
  return files;
}
Example 16
Source File: RaidNode.java From RDFS with Apache License 2.0
/**
 * Returns a list of pathnames that needs raiding. The list of paths
 * could be obtained by resuming a previously suspended traversal. The
 * number of paths returned is limited by raid.distraid.max.jobs.
 */
private List<FileStatus> selectFiles(PolicyInfo info, ArrayList<PolicyInfo> allPolicies)
    throws IOException {
  String policyName = info.getName();

  long modTimePeriod = 0;
  try {
    modTimePeriod = Long.parseLong(info.getProperty("modTimePeriod"));
  } catch (NumberFormatException nfe) {
    // set modeTimePeriod to default: 1 minute
    modTimePeriod = 60000;
  }

  // Max number of files returned.
  int selectLimit = configMgr.getMaxFilesPerJob();

  PolicyState scanState = policyStateMap.get(policyName);

  List<FileStatus> returnSet = new ArrayList<FileStatus>(selectLimit);
  DirectoryTraversal traversal;
  if (scanState.isScanInProgress()) {
    LOG.info("Resuming traversal for policy " + policyName);
    traversal = scanState.pendingTraversal;
  } else {
    LOG.info("Start new traversal for policy " + policyName);
    scanState.startTime = now();
    if (!Codec.getCodec(info.getCodecId()).isDirRaid) {
      traversal = DirectoryTraversal.raidFileRetriever(info,
          info.getSrcPathExpanded(), allPolicies, conf,
          directoryTraversalThreads, directoryTraversalShuffle, true);
    } else {
      traversal = DirectoryTraversal.raidLeafDirectoryRetriever(info,
          info.getSrcPathExpanded(), allPolicies, conf,
          directoryTraversalThreads, directoryTraversalShuffle, true);
    }
    scanState.setTraversal(traversal);
  }

  FileStatus f;
  while ((f = traversal.next()) != DirectoryTraversal.FINISH_TOKEN) {
    long modTime = System.currentTimeMillis() - f.getModificationTime();
    if (modTime > modTimePeriod) {
      returnSet.add(f);
      if (returnSet.size() == selectLimit) {
        return returnSet;
      }
    }
  }
  scanState.resetTraversal();
  return returnSet;
}
Example 17
Source File: HdfsFileSystem.java From datacollector with Apache License 2.0
public void addFiles(WrappedFile dirFile, WrappedFile startingFile, List<WrappedFile> toProcess,
    boolean includeStartingFile, boolean useLastModified) throws IOException {
  final long scanTime = System.currentTimeMillis();

  PathFilter pathFilter = new PathFilter() {
    @Override
    public boolean accept(Path entry) {
      try {
        FileStatus fileStatus = fs.getFileStatus(entry);
        if (fileStatus.isDirectory()) {
          return false;
        }

        if (!patternMatches(entry.getName())) {
          return false;
        }

        HdfsFile hdfsFile = new HdfsFile(fs, entry);
        // SDC-3551: Pick up only files with mtime strictly less than scan time.
        if (fileStatus.getModificationTime() < scanTime) {
          if (startingFile == null || startingFile.toString().isEmpty()) {
            toProcess.add(hdfsFile);
          } else {
            int compares = compare(hdfsFile, startingFile, useLastModified);
            if (includeStartingFile) {
              if (compares >= 0) {
                toProcess.add(hdfsFile);
              }
            } else {
              if (compares > 0) {
                toProcess.add(hdfsFile);
              }
            }
          }
        }
      } catch (IOException ex) {
        LOG.error("Failed to open file {}", entry.toString());
      }
      return false;
    }
  };

  fs.globStatus(new Path(dirFile.getAbsolutePath(), "*"), pathFilter);
}
Example 18
Source File: RaidState.java From RDFS with Apache License 2.0
/**
 * Check the state of a raid source file against a policy
 * @param info The policy to check
 * @param file The source file to be checked
 * @param now The system millisecond time
 * @param skipParityCheck Skip checking the existence of parity. Checking
 *        parity is very time-consuming for HAR parity file
 * @param lfs The list of FileStatus of files under the directory, only used
 *        by directory raid.
 * @return The state of the raid file
 * @throws IOException
 */
public RaidState check(PolicyInfo info, FileStatus file, long now,
    boolean skipParityCheck, List<FileStatus> lfs) throws IOException {
  ExpandedPolicy matched = null;
  long mtime = -1;
  String uriPath = file.getPath().toUri().getPath();
  if (inferMTimeFromName) {
    mtime = mtimeFromName(uriPath);
  }
  // If we can't infer the mtime from the name, use the mtime from filesystem.
  // If the the file is newer than a day, use the mtime from filesystem.
  if (mtime == -1 ||
      Math.abs(file.getModificationTime() - now) < ONE_DAY_MSEC) {
    mtime = file.getModificationTime();
  }
  for (ExpandedPolicy policy : sortedExpendedPolicy) {
    if (policy.parentPolicy == info) {
      matched = policy;
      break;
    }
    if (policy.match(file, mtime, now, skipParityCheck, conf)) {
      return NOT_RAIDED_OTHER_POLICY;
    }
  }
  if (matched == null) {
    return NOT_RAIDED_NO_POLICY;
  }

  // The preceding checks are more restrictive,
  // check for excluded just before parity check.
  if (shouldExclude(uriPath)) {
    return NOT_RAIDED_NO_POLICY;
  }

  if (file.isDir() != matched.codec.isDirRaid) {
    return NOT_RAIDED_NO_POLICY;
  }

  long blockNum = matched.codec.isDirRaid ?
      DirectoryStripeReader.getBlockNum(lfs) :
      computeNumBlocks(file);

  if (blockNum <= TOO_SMALL_NOT_RAID_NUM_BLOCKS) {
    return NOT_RAIDED_TOO_SMALL;
  }

  long repl = matched.codec.isDirRaid ?
      DirectoryStripeReader.getReplication(lfs) :
      file.getReplication();

  if (repl == matched.targetReplication) {
    if (skipParityCheck ||
        ParityFilePair.parityExists(file, matched.codec, conf)) {
      return RAIDED;
    }
  }

  if (now - mtime < matched.modTimePeriod) {
    return NOT_RAIDED_TOO_NEW;
  }

  return NOT_RAIDED_BUT_SHOULD;
}
Example 19
Source File: DFSPathSelector.java From hudi with Apache License 2.0
public Pair<Option<String>, String> getNextFilePathsAndMaxModificationTime(
    Option<String> lastCheckpointStr, long sourceLimit) {
  try {
    // obtain all eligible files under root folder.
    List<FileStatus> eligibleFiles = new ArrayList<>();
    RemoteIterator<LocatedFileStatus> fitr =
        fs.listFiles(new Path(props.getString(Config.ROOT_INPUT_PATH_PROP)), true);
    while (fitr.hasNext()) {
      LocatedFileStatus fileStatus = fitr.next();
      if (fileStatus.isDirectory()
          || IGNORE_FILEPREFIX_LIST.stream().anyMatch(pfx -> fileStatus.getPath().getName().startsWith(pfx))) {
        continue;
      }
      eligibleFiles.add(fileStatus);
    }
    // sort them by modification time.
    eligibleFiles.sort(Comparator.comparingLong(FileStatus::getModificationTime));

    // Filter based on checkpoint & input size, if needed
    long currentBytes = 0;
    long maxModificationTime = Long.MIN_VALUE;
    List<FileStatus> filteredFiles = new ArrayList<>();
    for (FileStatus f : eligibleFiles) {
      if (lastCheckpointStr.isPresent()
          && f.getModificationTime() <= Long.valueOf(lastCheckpointStr.get()).longValue()) {
        // skip processed files
        continue;
      }

      if (currentBytes + f.getLen() >= sourceLimit) {
        // we have enough data, we are done
        break;
      }

      maxModificationTime = f.getModificationTime();
      currentBytes += f.getLen();
      filteredFiles.add(f);
    }

    // no data to read
    if (filteredFiles.size() == 0) {
      return new ImmutablePair<>(Option.empty(),
          lastCheckpointStr.orElseGet(() -> String.valueOf(Long.MIN_VALUE)));
    }

    // read the files out.
    String pathStr = filteredFiles.stream().map(f -> f.getPath().toString())
        .collect(Collectors.joining(","));

    return new ImmutablePair<>(Option.ofNullable(pathStr), String.valueOf(maxModificationTime));
  } catch (IOException ioe) {
    throw new HoodieIOException(
        "Unable to read from source from checkpoint: " + lastCheckpointStr, ioe);
  }
}