org.apache.flink.runtime.highavailability.RunningJobsRegistry Java Examples

The following examples show how to use org.apache.flink.runtime.highavailability.RunningJobsRegistry. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: Dispatcher.java    From Flink-CEPplus with Apache License 2.0 6 votes vote down vote up
/**
 * Tries to run a recovered job graph if this dispatcher currently holds the leadership.
 *
 * <p>The checks are performed in this order:
 * <ol>
 *   <li>no leadership for the given dispatcher id: the result is {@code false};</li>
 *   <li>the job id is already contained in {@code jobManagerRunnerFutures}: the result is
 *       {@code true} and nothing new is started;</li>
 *   <li>the running jobs registry reports {@code DONE}: the result is {@code false};</li>
 *   <li>otherwise the job is passed to {@code waitForTerminatingJobManager} together with
 *       {@code runJob} and the result is {@code true} once that future completes.</li>
 * </ol>
 *
 * @param jobGraph recovered job graph
 * @param dispatcherId dispatcher id whose UUID is checked against the leader election service
 * @return future with {@code true} if the job is running or was handed over for execution,
 *         {@code false} otherwise
 * @throws Exception if one of the invoked checks fails
 */
private CompletableFuture<Boolean> tryRunRecoveredJobGraph(JobGraph jobGraph, DispatcherId dispatcherId) throws Exception {
	if (!leaderElectionService.hasLeadership(dispatcherId.toUUID())) {
		return CompletableFuture.completedFuture(false);
	}

	final JobID recoveredJobId = jobGraph.getJobID();

	if (jobManagerRunnerFutures.containsKey(recoveredJobId)) {
		// we must not release the job graph lock since it can only be locked once and
		// is currently being executed. Once we support multiple locks, we must release
		// the JobGraph here
		log.debug("Ignore added JobGraph because the job {} is already running.", recoveredJobId);
		return CompletableFuture.completedFuture(true);
	}

	if (runningJobsRegistry.getJobSchedulingStatus(recoveredJobId) == RunningJobsRegistry.JobSchedulingStatus.DONE) {
		log.debug("Ignore added JobGraph because the job {} has already been completed.", recoveredJobId);
		return CompletableFuture.completedFuture(false);
	}

	return waitForTerminatingJobManager(recoveredJobId, jobGraph, this::runJob).thenApply(ignored -> true);
}
 
Example #2
Source File: Dispatcher.java    From flink with Apache License 2.0 6 votes vote down vote up
/**
 * Tries to run a recovered job graph if this dispatcher currently holds the leadership.
 *
 * <p>The checks are performed in this order:
 * <ol>
 *   <li>no leadership for the given dispatcher id: the result is {@code false};</li>
 *   <li>the job id is already contained in {@code jobManagerRunnerFutures}: the result is
 *       {@code true} and nothing new is started;</li>
 *   <li>the running jobs registry reports {@code DONE}: the result is {@code false};</li>
 *   <li>otherwise the job is passed to {@code waitForTerminatingJobManager} together with
 *       {@code runJob} and the result is {@code true} once that future completes.</li>
 * </ol>
 *
 * @param jobGraph recovered job graph
 * @param dispatcherId dispatcher id whose UUID is checked against the leader election service
 * @return future with {@code true} if the job is running or was handed over for execution,
 *         {@code false} otherwise
 * @throws Exception if one of the invoked checks fails
 */
private CompletableFuture<Boolean> tryRunRecoveredJobGraph(JobGraph jobGraph, DispatcherId dispatcherId) throws Exception {
	if (!leaderElectionService.hasLeadership(dispatcherId.toUUID())) {
		return CompletableFuture.completedFuture(false);
	}

	final JobID recoveredJobId = jobGraph.getJobID();

	if (jobManagerRunnerFutures.containsKey(recoveredJobId)) {
		// we must not release the job graph lock since it can only be locked once and
		// is currently being executed. Once we support multiple locks, we must release
		// the JobGraph here
		log.debug("Ignore added JobGraph because the job {} is already running.", recoveredJobId);
		return CompletableFuture.completedFuture(true);
	}

	if (runningJobsRegistry.getJobSchedulingStatus(recoveredJobId) == RunningJobsRegistry.JobSchedulingStatus.DONE) {
		log.debug("Ignore added JobGraph because the job {} has already been completed.", recoveredJobId);
		return CompletableFuture.completedFuture(false);
	}

	return waitForTerminatingJobManager(recoveredJobId, jobGraph, this::runJob).thenApply(ignored -> true);
}
 
Example #3
Source File: ZooKeeperHaServicesTest.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Shared driver for the cleanup tests: creates a {@link ZooKeeperHaServices} instance, uses a few
 * of its services, shuts them down again and finally passes the HA services to the given consumer.
 * The HA services are closed when the try-with-resources block is left.
 *
 * @param configuration configuration used for the Curator framework and the HA services
 * @param blobStoreService blob store service handed to the HA services
 * @param zooKeeperHaServicesConsumer action executed against the HA services at the end
 * @throws Exception if any of the involved operations fails
 */
private void runCleanupTest(
		Configuration configuration,
		TestingBlobStoreService blobStoreService,
		ThrowingConsumer<ZooKeeperHaServices, Exception> zooKeeperHaServicesConsumer) throws Exception {
	try (ZooKeeperHaServices haServices = new ZooKeeperHaServices(
		ZooKeeperUtils.startCuratorFramework(configuration),
		Executors.directExecutor(),
		configuration,
		blobStoreService)) {

		// create some Zk services to trigger the generation of paths
		final LeaderRetrievalService rmLeaderRetriever = haServices.getResourceManagerLeaderRetriever();
		final LeaderElectionService rmLeaderElection = haServices.getResourceManagerLeaderElectionService();
		final RunningJobsRegistry jobsRegistry = haServices.getRunningJobsRegistry();

		// start retrieval and election, then register a job as running
		final TestingListener leaderListener = new TestingListener();
		rmLeaderRetriever.start(leaderListener);
		rmLeaderElection.start(new TestingContender("foobar", rmLeaderElection));
		final JobID registeredJobId = new JobID();
		jobsRegistry.setJobRunning(registeredJobId);

		leaderListener.waitForNewLeader(2000L);

		// stop the services and remove the job again before handing over to the consumer
		rmLeaderRetriever.stop();
		rmLeaderElection.stop();
		jobsRegistry.clearJob(registeredJobId);

		zooKeeperHaServicesConsumer.accept(haServices);
	}
}
 
Example #4
Source File: AbstractYarnNonHaServices.java    From Flink-CEPplus with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a new {@link FsNegativeRunningJobsRegistry} on {@code flinkFileSystem} rooted at
 * {@code workingDirectory}. The creation is bracketed by {@code enter()} and {@code exit()};
 * {@code exit()} runs in a finally block and is therefore always called.
 *
 * @return a newly created running jobs registry
 * @throws IOException declared by the signature
 */
@Override
public RunningJobsRegistry getRunningJobsRegistry() throws IOException {
	enter();
	try {
		// IMPORTANT: The registry must NOT place its data in a directory that is
		// cleaned up by these services.
		final RunningJobsRegistry registry = new FsNegativeRunningJobsRegistry(flinkFileSystem, workingDirectory);
		return registry;
	} finally {
		exit();
	}
}
 
Example #5
Source File: ZooKeeperRegistryTest.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Tests the job scheduling status transitions of the ZooKeeper based running jobs registry:
 * an unknown job is {@code PENDING}, {@code setJobRunning()} turns it into {@code RUNNING},
 * {@code setJobFinished()} turns it into {@code DONE} and {@code clearJob()} resets it to
 * {@code PENDING}.
 */
@Test
public void testZooKeeperRegistry() throws Exception {
	Configuration configuration = new Configuration();
	configuration.setString(HighAvailabilityOptions.HA_ZOOKEEPER_QUORUM, testingServer.getConnectString());
	configuration.setString(HighAvailabilityOptions.HA_MODE, "zookeeper");

	final HighAvailabilityServices zkHaService = new ZooKeeperHaServices(
		ZooKeeperUtils.startCuratorFramework(configuration),
		Executors.directExecutor(),
		configuration,
		new VoidBlobStore());

	try {
		// Fetch the registry inside the try block so that the HA services are closed
		// even if the lookup itself fails.
		final RunningJobsRegistry zkRegistry = zkHaService.getRunningJobsRegistry();

		JobID jobID = JobID.generate();
		assertEquals(JobSchedulingStatus.PENDING, zkRegistry.getJobSchedulingStatus(jobID));

		zkRegistry.setJobRunning(jobID);
		assertEquals(JobSchedulingStatus.RUNNING, zkRegistry.getJobSchedulingStatus(jobID));

		zkRegistry.setJobFinished(jobID);
		assertEquals(JobSchedulingStatus.DONE, zkRegistry.getJobSchedulingStatus(jobID));

		zkRegistry.clearJob(jobID);
		assertEquals(JobSchedulingStatus.PENDING, zkRegistry.getJobSchedulingStatus(jobID));
	} finally {
		zkHaService.close();
	}
}
 
Example #6
Source File: AbstractNonHaServices.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the running jobs registry held by these services. While holding {@code lock},
 * {@code checkNotShutdown()} is invoked before the registry reference is read.
 *
 * @return the running jobs registry of these services
 * @throws Exception declared by the signature; presumably raised by {@code checkNotShutdown()}
 *         when the services are already shut down (to be confirmed against that method)
 */
@Override
public RunningJobsRegistry getRunningJobsRegistry() throws Exception {
	final RunningJobsRegistry registry;

	synchronized (lock) {
		checkNotShutdown();
		registry = runningJobsRegistry;
	}

	return registry;
}
 
Example #7
Source File: Dispatcher.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Determines whether a job with the given id counts as a duplicate.
 *
 * <p>The running jobs registry is always queried first. A job is a duplicate if the registry
 * reports it as {@code DONE} or if {@code jobManagerRunnerFutures} already contains its id.
 *
 * @param jobId id of the job to check
 * @return {@code true} if the job has already been executed or is currently tracked,
 *         {@code false} otherwise
 * @throws FlinkException if the registry lookup fails with an {@link IOException}
 */
private boolean isDuplicateJob(JobID jobId) throws FlinkException {
	final RunningJobsRegistry.JobSchedulingStatus registryStatus;

	try {
		registryStatus = runningJobsRegistry.getJobSchedulingStatus(jobId);
	} catch (IOException ioException) {
		final String message = String.format("Failed to retrieve job scheduling status for job %s.", jobId);
		throw new FlinkException(message, ioException);
	}

	if (registryStatus == RunningJobsRegistry.JobSchedulingStatus.DONE) {
		return true;
	}

	return jobManagerRunnerFutures.containsKey(jobId);
}
 
Example #8
Source File: ZooKeeperHaServicesTest.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Shared driver for the cleanup tests: creates a {@link ZooKeeperHaServices} instance, uses a few
 * of its services, shuts them down again and finally passes the HA services to the given consumer.
 * The HA services are closed when the try-with-resources block is left.
 *
 * @param configuration configuration used for the Curator framework and the HA services
 * @param blobStoreService blob store service handed to the HA services
 * @param zooKeeperHaServicesConsumer action executed against the HA services at the end
 * @throws Exception if any of the involved operations fails
 */
private void runCleanupTest(
		Configuration configuration,
		TestingBlobStoreService blobStoreService,
		ThrowingConsumer<ZooKeeperHaServices, Exception> zooKeeperHaServicesConsumer) throws Exception {
	try (ZooKeeperHaServices haServices = new ZooKeeperHaServices(
		ZooKeeperUtils.startCuratorFramework(configuration),
		Executors.directExecutor(),
		configuration,
		blobStoreService)) {

		// create some Zk services to trigger the generation of paths
		final LeaderRetrievalService rmLeaderRetriever = haServices.getResourceManagerLeaderRetriever();
		final LeaderElectionService rmLeaderElection = haServices.getResourceManagerLeaderElectionService();
		final RunningJobsRegistry jobsRegistry = haServices.getRunningJobsRegistry();

		// start retrieval and election, then register a job as running
		final TestingListener leaderListener = new TestingListener();
		rmLeaderRetriever.start(leaderListener);
		rmLeaderElection.start(new TestingContender("foobar", rmLeaderElection));
		final JobID registeredJobId = new JobID();
		jobsRegistry.setJobRunning(registeredJobId);

		leaderListener.waitForNewLeader(2000L);

		// stop the services and remove the job again before handing over to the consumer
		rmLeaderRetriever.stop();
		rmLeaderElection.stop();
		jobsRegistry.clearJob(registeredJobId);

		zooKeeperHaServicesConsumer.accept(haServices);
	}
}
 
Example #9
Source File: ZooKeeperRegistryTest.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Tests the job scheduling status transitions of the ZooKeeper based running jobs registry:
 * an unknown job is {@code PENDING}, {@code setJobRunning()} turns it into {@code RUNNING},
 * {@code setJobFinished()} turns it into {@code DONE} and {@code clearJob()} resets it to
 * {@code PENDING}.
 */
@Test
public void testZooKeeperRegistry() throws Exception {
	Configuration configuration = new Configuration();
	configuration.setString(HighAvailabilityOptions.HA_ZOOKEEPER_QUORUM, testingServer.getConnectString());
	configuration.setString(HighAvailabilityOptions.HA_MODE, "zookeeper");

	final HighAvailabilityServices zkHaService = new ZooKeeperHaServices(
		ZooKeeperUtils.startCuratorFramework(configuration),
		Executors.directExecutor(),
		configuration,
		new VoidBlobStore());

	try {
		// Fetch the registry inside the try block so that the HA services are closed
		// even if the lookup itself fails.
		final RunningJobsRegistry zkRegistry = zkHaService.getRunningJobsRegistry();

		JobID jobID = JobID.generate();
		assertEquals(JobSchedulingStatus.PENDING, zkRegistry.getJobSchedulingStatus(jobID));

		zkRegistry.setJobRunning(jobID);
		assertEquals(JobSchedulingStatus.RUNNING, zkRegistry.getJobSchedulingStatus(jobID));

		zkRegistry.setJobFinished(jobID);
		assertEquals(JobSchedulingStatus.DONE, zkRegistry.getJobSchedulingStatus(jobID));

		zkRegistry.clearJob(jobID);
		assertEquals(JobSchedulingStatus.PENDING, zkRegistry.getJobSchedulingStatus(jobID));
	} finally {
		zkHaService.close();
	}
}
 
Example #10
Source File: DispatcherTest.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Verifies that a Dispatcher which loses and later regains the leadership recovers its job.
 */
@Test
public void testJobRecovery() throws Exception {
	dispatcher = createAndStartDispatcher(
		heartbeatServices,
		haServices,
		new ExpectedJobIdJobManagerRunnerFactory(TEST_JOB_ID, createdJobManagerRunnerLatch));

	final DispatcherGateway gateway = dispatcher.getSelfGateway(DispatcherGateway.class);

	// grant the leadership to the dispatcher for the first time
	dispatcherLeaderElectionService.isLeader(UUID.randomUUID()).get();

	// hand the job over to the leading dispatcher
	gateway.submitJob(jobGraph, TIMEOUT).get();

	// the submitted job graph store has to contain the job now
	assertThat(submittedJobGraphStore.getJobIds(), contains(jobGraph.getJobID()));

	jobMasterLeaderElectionService.isLeader(UUID.randomUUID()).get();

	// the registry is expected to report the job as running at this point
	assertThat(
		runningJobsRegistry.getJobSchedulingStatus(jobGraph.getJobID()),
		is(RunningJobsRegistry.JobSchedulingStatus.RUNNING));

	// take the leadership away, which will stop all currently running jobs
	dispatcherLeaderElectionService.notLeader();

	// grant the leadership again, which should trigger the job recovery
	dispatcherLeaderElectionService.isLeader(UUID.randomUUID()).get();

	// block until the job manager runner latch signals the recovery
	createdJobManagerRunnerLatch.await();

	// exactly the submitted job must be listed by the dispatcher
	final Collection<JobID> listedJobIds = gateway.listJobs(TIMEOUT).get();

	assertThat(listedJobIds, hasSize(1));
	assertThat(listedJobIds, contains(jobGraph.getJobID()));
}
 
Example #11
Source File: AbstractNonHaServices.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the running jobs registry held by these services. While holding {@code lock},
 * {@code checkNotShutdown()} is invoked before the registry reference is read.
 *
 * @return the running jobs registry of these services
 * @throws Exception declared by the signature; presumably raised by {@code checkNotShutdown()}
 *         when the services are already shut down (to be confirmed against that method)
 */
@Override
public RunningJobsRegistry getRunningJobsRegistry() throws Exception {
	final RunningJobsRegistry registry;

	synchronized (lock) {
		checkNotShutdown();
		registry = runningJobsRegistry;
	}

	return registry;
}
 
Example #12
Source File: Dispatcher.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Determines whether a job with the given id counts as a duplicate.
 *
 * <p>The running jobs registry is always queried first. A job is a duplicate if the registry
 * reports it as {@code DONE} or if {@code jobManagerRunnerFutures} already contains its id.
 *
 * @param jobId id of the job to check
 * @return {@code true} if the job has already been executed or is currently tracked,
 *         {@code false} otherwise
 * @throws FlinkException if the registry lookup fails with an {@link IOException}
 */
private boolean isDuplicateJob(JobID jobId) throws FlinkException {
	final RunningJobsRegistry.JobSchedulingStatus registryStatus;

	try {
		registryStatus = runningJobsRegistry.getJobSchedulingStatus(jobId);
	} catch (IOException ioException) {
		final String message = String.format("Failed to retrieve job scheduling status for job %s.", jobId);
		throw new FlinkException(message, ioException);
	}

	if (registryStatus == RunningJobsRegistry.JobSchedulingStatus.DONE) {
		return true;
	}

	return jobManagerRunnerFutures.containsKey(jobId);
}
 
Example #13
Source File: AbstractYarnNonHaServices.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a new {@link FsNegativeRunningJobsRegistry} on {@code flinkFileSystem} rooted at
 * {@code workingDirectory}. The creation is bracketed by {@code enter()} and {@code exit()};
 * {@code exit()} runs in a finally block and is therefore always called.
 *
 * @return a newly created running jobs registry
 * @throws IOException declared by the signature
 */
@Override
public RunningJobsRegistry getRunningJobsRegistry() throws IOException {
	enter();
	try {
		// IMPORTANT: The registry must NOT place its data in a directory that is
		// cleaned up by these services.
		final RunningJobsRegistry registry = new FsNegativeRunningJobsRegistry(flinkFileSystem, workingDirectory);
		return registry;
	} finally {
		exit();
	}
}
 
Example #14
Source File: ZooKeeperHaServicesTest.java    From Flink-CEPplus with Apache License 2.0 5 votes vote down vote up
/**
 * Shared driver for the cleanup tests: creates a {@link ZooKeeperHaServices} instance, uses a few
 * of its services, shuts them down again and finally passes the HA services to the given consumer.
 * The HA services are closed when the try-with-resources block is left.
 *
 * @param configuration configuration used for the Curator framework and the HA services
 * @param blobStoreService blob store service handed to the HA services
 * @param zooKeeperHaServicesConsumer action executed against the HA services at the end
 * @throws Exception if any of the involved operations fails
 */
private void runCleanupTest(
		Configuration configuration,
		TestingBlobStoreService blobStoreService,
		ThrowingConsumer<ZooKeeperHaServices, Exception> zooKeeperHaServicesConsumer) throws Exception {
	try (ZooKeeperHaServices haServices = new ZooKeeperHaServices(
		ZooKeeperUtils.startCuratorFramework(configuration),
		Executors.directExecutor(),
		configuration,
		blobStoreService)) {

		// create some Zk services to trigger the generation of paths
		final LeaderRetrievalService rmLeaderRetriever = haServices.getResourceManagerLeaderRetriever();
		final LeaderElectionService rmLeaderElection = haServices.getResourceManagerLeaderElectionService();
		final RunningJobsRegistry jobsRegistry = haServices.getRunningJobsRegistry();

		// start retrieval and election, then register a job as running
		final TestingListener leaderListener = new TestingListener();
		rmLeaderRetriever.start(leaderListener);
		rmLeaderElection.start(new TestingContender("foobar", rmLeaderElection));
		final JobID registeredJobId = new JobID();
		jobsRegistry.setJobRunning(registeredJobId);

		leaderListener.waitForNewLeader(2000L);

		// stop the services and remove the job again before handing over to the consumer
		rmLeaderRetriever.stop();
		rmLeaderElection.stop();
		jobsRegistry.clearJob(registeredJobId);

		zooKeeperHaServicesConsumer.accept(haServices);
	}
}
 
Example #15
Source File: ZooKeeperRegistryTest.java    From Flink-CEPplus with Apache License 2.0 5 votes vote down vote up
/**
 * Tests the job scheduling status transitions of the ZooKeeper based running jobs registry:
 * an unknown job is {@code PENDING}, {@code setJobRunning()} turns it into {@code RUNNING},
 * {@code setJobFinished()} turns it into {@code DONE} and {@code clearJob()} resets it to
 * {@code PENDING}.
 */
@Test
public void testZooKeeperRegistry() throws Exception {
	Configuration configuration = new Configuration();
	configuration.setString(HighAvailabilityOptions.HA_ZOOKEEPER_QUORUM, testingServer.getConnectString());
	configuration.setString(HighAvailabilityOptions.HA_MODE, "zookeeper");

	final HighAvailabilityServices zkHaService = new ZooKeeperHaServices(
		ZooKeeperUtils.startCuratorFramework(configuration),
		Executors.directExecutor(),
		configuration,
		new VoidBlobStore());

	try {
		// Fetch the registry inside the try block so that the HA services are closed
		// even if the lookup itself fails.
		final RunningJobsRegistry zkRegistry = zkHaService.getRunningJobsRegistry();

		JobID jobID = JobID.generate();
		assertEquals(JobSchedulingStatus.PENDING, zkRegistry.getJobSchedulingStatus(jobID));

		zkRegistry.setJobRunning(jobID);
		assertEquals(JobSchedulingStatus.RUNNING, zkRegistry.getJobSchedulingStatus(jobID));

		zkRegistry.setJobFinished(jobID);
		assertEquals(JobSchedulingStatus.DONE, zkRegistry.getJobSchedulingStatus(jobID));

		zkRegistry.clearJob(jobID);
		assertEquals(JobSchedulingStatus.PENDING, zkRegistry.getJobSchedulingStatus(jobID));
	} finally {
		zkHaService.close();
	}
}
 
Example #16
Source File: DispatcherTest.java    From Flink-CEPplus with Apache License 2.0 5 votes vote down vote up
/**
 * Verifies that a Dispatcher which loses and later regains the leadership recovers its job.
 */
@Test
public void testJobRecovery() throws Exception {
	dispatcher = createAndStartDispatcher(
		heartbeatServices,
		haServices,
		new ExpectedJobIdJobManagerRunnerFactory(TEST_JOB_ID, createdJobManagerRunnerLatch));

	final DispatcherGateway gateway = dispatcher.getSelfGateway(DispatcherGateway.class);

	// grant the leadership to the dispatcher for the first time
	dispatcherLeaderElectionService.isLeader(UUID.randomUUID()).get();

	// hand the job over to the leading dispatcher
	gateway.submitJob(jobGraph, TIMEOUT).get();

	// the submitted job graph store has to contain the job now
	assertThat(submittedJobGraphStore.getJobIds(), contains(jobGraph.getJobID()));

	jobMasterLeaderElectionService.isLeader(UUID.randomUUID()).get();

	// the registry is expected to report the job as running at this point
	assertThat(
		runningJobsRegistry.getJobSchedulingStatus(jobGraph.getJobID()),
		is(RunningJobsRegistry.JobSchedulingStatus.RUNNING));

	// take the leadership away, which will stop all currently running jobs
	dispatcherLeaderElectionService.notLeader();

	// grant the leadership again, which should trigger the job recovery
	dispatcherLeaderElectionService.isLeader(UUID.randomUUID()).get();

	// block until the job manager runner latch signals the recovery
	createdJobManagerRunnerLatch.await();

	// exactly the submitted job must be listed by the dispatcher
	final Collection<JobID> listedJobIds = gateway.listJobs(TIMEOUT).get();

	assertThat(listedJobIds, hasSize(1));
	assertThat(listedJobIds, contains(jobGraph.getJobID()));
}
 
Example #17
Source File: AbstractNonHaServices.java    From Flink-CEPplus with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the running jobs registry held by these services. While holding {@code lock},
 * {@code checkNotShutdown()} is invoked before the registry reference is read.
 *
 * @return the running jobs registry of these services
 * @throws Exception declared by the signature; presumably raised by {@code checkNotShutdown()}
 *         when the services are already shut down (to be confirmed against that method)
 */
@Override
public RunningJobsRegistry getRunningJobsRegistry() throws Exception {
	final RunningJobsRegistry registry;

	synchronized (lock) {
		checkNotShutdown();
		registry = runningJobsRegistry;
	}

	return registry;
}
 
Example #18
Source File: Dispatcher.java    From Flink-CEPplus with Apache License 2.0 5 votes vote down vote up
/**
 * Determines whether a job with the given id counts as a duplicate.
 *
 * <p>The running jobs registry is always queried first. A job is a duplicate if the registry
 * reports it as {@code DONE} or if {@code jobManagerRunnerFutures} already contains its id.
 *
 * @param jobId id of the job to check
 * @return {@code true} if the job has already been executed or is currently tracked,
 *         {@code false} otherwise
 * @throws FlinkException if the registry lookup fails with an {@link IOException}
 */
private boolean isDuplicateJob(JobID jobId) throws FlinkException {
	final RunningJobsRegistry.JobSchedulingStatus registryStatus;

	try {
		registryStatus = runningJobsRegistry.getJobSchedulingStatus(jobId);
	} catch (IOException ioException) {
		final String message = String.format("Failed to retrieve job scheduling status for job %s.", jobId);
		throw new FlinkException(message, ioException);
	}

	if (registryStatus == RunningJobsRegistry.JobSchedulingStatus.DONE) {
		return true;
	}

	return jobManagerRunnerFutures.containsKey(jobId);
}
 
Example #19
Source File: ZooKeeperHaServices.java    From flink with Apache License 2.0 4 votes vote down vote up
/**
 * Returns the running jobs registry held by these services.
 *
 * @return the {@code runningJobsRegistry} reference of this instance
 */
@Override
public RunningJobsRegistry getRunningJobsRegistry() {
	return this.runningJobsRegistry;
}
 
Example #20
Source File: ZooKeeperHaServices.java    From Flink-CEPplus with Apache License 2.0 4 votes vote down vote up
/**
 * Returns the running jobs registry held by these services.
 *
 * @return the {@code runningJobsRegistry} reference of this instance
 */
@Override
public RunningJobsRegistry getRunningJobsRegistry() {
	return this.runningJobsRegistry;
}
 
Example #21
Source File: ZooKeeperHaServices.java    From flink with Apache License 2.0 4 votes vote down vote up
/**
 * Returns the running jobs registry held by these services.
 *
 * @return the {@code runningJobsRegistry} reference of this instance
 */
@Override
public RunningJobsRegistry getRunningJobsRegistry() {
	return this.runningJobsRegistry;
}