org.apache.flink.runtime.resourcemanager.exceptions.ResourceManagerException Java Examples

The following examples show how to use org.apache.flink.runtime.resourcemanager.exceptions.ResourceManagerException. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: SlotManagerImpl.java    From flink with Apache License 2.0 6 votes vote down vote up
/**
 * Reacts to a slot request whose allocation on a slot has failed by retrying the request.
 *
 * <p>If the request is still pending, its request future is reset and the request is handed
 * to {@code internalRequestSlot} again. Should that retry throw a
 * {@link ResourceManagerException}, the request is dropped from the pending requests and the
 * failure is reported through {@code resourceActions}.
 *
 * @param slotId identifying the slot which was assigned to the slot request before
 * @param allocationId identifying the failed slot request
 * @param cause of the failure
 */
private void handleFailedSlotRequest(SlotID slotId, AllocationID allocationId, Throwable cause) {
	final PendingSlotRequest failedRequest = pendingSlotRequests.get(allocationId);

	LOG.debug("Slot request with allocation id {} failed for slot {}.", allocationId, slotId, cause);

	// nothing to retry if the request is no longer pending
	if (failedRequest == null) {
		LOG.debug("There was not pending slot request with allocation id {}. Probably the request has been fulfilled or cancelled.", allocationId);
		return;
	}

	failedRequest.setRequestFuture(null);

	try {
		internalRequestSlot(failedRequest);
	} catch (ResourceManagerException retryFailure) {
		// the retry failed as well: give up on the request and report the failure
		pendingSlotRequests.remove(allocationId);
		resourceActions.notifyAllocationFailure(failedRequest.getJobId(), allocationId, retryFailure);
	}
}
 
Example #2
Source File: MesosResourceManager.java    From Flink-CEPplus with Apache License 2.0 6 votes vote down vote up
/**
 * Asynchronously recovers the workers recorded in the worker store by a prior incarnation
 * of the RM.
 *
 * <p>Workers that are still in state {@code New} are removed from the worker store. The list
 * obtained from {@code recoverWorkers()} is returned as the result of the future. Any
 * exception raised while doing so is wrapped in a {@link ResourceManagerException} and
 * rethrown as a {@link CompletionException}.
 *
 * @return future holding the workers recovered from the worker store
 */
private CompletableFuture<List<MesosWorkerStore.Worker>> getWorkersAsync() {
	return CompletableFuture.supplyAsync(() -> {
		try {
			// if this resource manager is recovering from failure,
			// then some worker tasks are most likely still alive and we can re-obtain them
			final List<MesosWorkerStore.Worker> recoveredWorkers = workerStore.recoverWorkers();
			for (final MesosWorkerStore.Worker recovered : recoveredWorkers) {
				if (recovered.state() != MesosWorkerStore.WorkerState.New) {
					continue;
				}
				// remove new workers because allocation requests are transient
				workerStore.removeWorker(recovered.taskID());
			}
			return recoveredWorkers;
		} catch (final Exception recoveryFailure) {
			throw new CompletionException(new ResourceManagerException(recoveryFailure));
		}
	}, getRpcService().getExecutor());
}
 
Example #3
Source File: SlotManagerFailUnfulfillableTest.java    From flink with Apache License 2.0 6 votes vote down vote up
/**
 * Checks that a slot request whose profile differs from the only registered free slot
 * (second resource value 200 instead of 100) is rejected with an exception caused by an
 * {@link UnfulfillableSlotRequestException}, without recording an allocation failure or
 * leaving a pending slot request behind.
 */
@Test
public void testUnfulfillableRequestsFailWhenOn() {
	// setup
	final ResourceProfile registeredProfile = ResourceProfile.fromResources(2.0, 100);
	final ResourceProfile requestedProfile = ResourceProfile.fromResources(2.0, 200);

	final List<Tuple3<JobID, AllocationID, Exception>> allocationFailures = new ArrayList<>();
	final SlotManager slotManager = createSlotManagerNotStartingNewTMs(allocationFailures);
	registerFreeSlot(slotManager, registeredProfile);

	// test
	try {
		slotManager.registerSlotRequest(slotRequest(requestedProfile));
		fail("this should cause an exception");
	} catch (ResourceManagerException expected) {
		final boolean causedByUnfulfillableRequest =
			ExceptionUtils.findThrowable(expected, UnfulfillableSlotRequestException.class).isPresent();
		assertTrue(causedByUnfulfillableRequest);
	}

	// assert
	assertEquals(0, allocationFailures.size());
	assertEquals(0, slotManager.getNumberPendingSlotRequests());
}
 
Example #4
Source File: ResourceManagerJobMasterTest.java    From flink with Apache License 2.0 6 votes vote down vote up
/**
 * Verifies that registering a job master for a job id unknown to the HA services fails:
 * the registration future must complete with a {@link ResourceManagerException}.
 */
@Test
public void testRegisterJobMasterWithFailureLeaderListener() throws Exception {
	final JobID jobIdUnknownToHaServices = new JobID();

	// registering a job leader listener for an unknown job id is expected to fail
	final CompletableFuture<RegistrationResponse> registration = resourceManagerGateway.registerJobManager(
		jobMasterGateway.getFencingToken(),
		jobMasterResourceId,
		jobMasterGateway.getAddress(),
		jobIdUnknownToHaServices,
		TIMEOUT);

	try {
		registration.get(TIMEOUT.toMilliseconds(), TimeUnit.MILLISECONDS);
		fail("Expected to fail with a ResourceManagerException.");
	} catch (ExecutionException executionFailure) {
		assertTrue(ExceptionUtils.stripExecutionException(executionFailure) instanceof ResourceManagerException);
	}

	// the failure is also reported to the fatal error handler; discard it
	testingFatalErrorHandler.clearError();
}
 
Example #5
Source File: MesosResourceManager.java    From Flink-CEPplus with Apache License 2.0 6 votes vote down vote up
/**
 * Creates a new Mesos worker for the given resource profile, persists it, and asks the task
 * monitor and the launch coordinator to act on it.
 *
 * <p>Any exception raised along the way is reported via {@code onFatalError}, in which case
 * an empty collection is returned.
 *
 * @param resourceProfile profile the new worker is created with
 * @return {@code slotsPerWorker} on success, an empty collection on failure
 */
@Override
public Collection<ResourceProfile> startNewWorker(ResourceProfile resourceProfile) {
	LOG.info("Starting a new worker.");
	try {
		// generate new workers into persistent state and launch associated actors
		final MesosWorkerStore.Worker newWorker =
			MesosWorkerStore.Worker.newWorker(workerStore.newTaskID(), resourceProfile);
		workerStore.putWorker(newWorker);
		workersInNew.put(extractResourceID(newWorker.taskID()), newWorker);

		final LaunchableMesosWorker launchableWorker = createLaunchableMesosWorker(newWorker.taskID());

		LOG.info("Scheduling Mesos task {} with ({} MB, {} cpus).",
			launchableWorker.taskID().getValue(),
			launchableWorker.taskRequest().getMemory(),
			launchableWorker.taskRequest().getCPUs());

		// tell the task monitor about the new plans
		final TaskMonitor.TaskGoalStateUpdated goalStateUpdate =
			new TaskMonitor.TaskGoalStateUpdated(extractGoalState(newWorker));
		taskMonitor.tell(goalStateUpdate, selfActor);

		// tell the launch coordinator to launch the new tasks
		final LaunchCoordinator.Launch launchRequest =
			new LaunchCoordinator.Launch(Collections.singletonList(launchableWorker));
		launchCoordinator.tell(launchRequest, selfActor);

		return slotsPerWorker;
	} catch (Exception failure) {
		onFatalError(new ResourceManagerException("Unable to request new workers.", failure));
		return Collections.emptyList();
	}
}
 
Example #6
Source File: SlotManager.java    From Flink-CEPplus with Apache License 2.0 6 votes vote down vote up
/**
 * Tries to serve the given slot request.
 *
 * <p>A matching slot, if one is found, is allocated right away. Otherwise a free matching
 * pending task manager slot is looked up, and if there is none, new resources are requested
 * via {@code allocateResource}. When either step yields a pending task manager slot, it is
 * assigned to the request; otherwise nothing further happens here.
 *
 * @param pendingSlotRequest to allocate a slot for
 * @throws ResourceManagerException if requesting additional resources fails
 */
private void internalRequestSlot(PendingSlotRequest pendingSlotRequest) throws ResourceManagerException {
	final ResourceProfile requestedProfile = pendingSlotRequest.getResourceProfile();
	final TaskManagerSlot matchingSlot = findMatchingSlot(requestedProfile);

	if (matchingSlot != null) {
		allocateSlot(matchingSlot, pendingSlotRequest);
		return;
	}

	Optional<PendingTaskManagerSlot> pendingSlot = findFreeMatchingPendingTaskManagerSlot(requestedProfile);
	if (!pendingSlot.isPresent()) {
		pendingSlot = allocateResource(requestedProfile);
	}

	if (pendingSlot.isPresent()) {
		assignPendingTaskManagerSlot(pendingSlotRequest, pendingSlot.get());
	}
}
 
Example #7
Source File: MesosResourceManager.java    From flink with Apache License 2.0 6 votes vote down vote up
/**
 * Asynchronously fetches the worker information persisted by a prior incarnation of the RM.
 *
 * <p>Every recovered worker in state {@code New} is removed from the worker store. The
 * future yields the list that {@code recoverWorkers()} returned. If anything throws, the
 * exception is wrapped in a {@link ResourceManagerException} inside a
 * {@link CompletionException}.
 *
 * @return future holding the workers recovered from the worker store
 */
private CompletableFuture<List<MesosWorkerStore.Worker>> getWorkersAsync() {
	return CompletableFuture.supplyAsync(() -> {
		try {
			// if this resource manager is recovering from failure,
			// then some worker tasks are most likely still alive and we can re-obtain them
			final List<MesosWorkerStore.Worker> previousWorkers = workerStore.recoverWorkers();
			for (final MesosWorkerStore.Worker previous : previousWorkers) {
				final boolean isNew = previous.state() == MesosWorkerStore.WorkerState.New;
				if (!isNew) {
					continue;
				}
				// remove new workers because allocation requests are transient
				workerStore.removeWorker(previous.taskID());
			}
			return previousWorkers;
		} catch (final Exception failure) {
			throw new CompletionException(new ResourceManagerException(failure));
		}
	}, getRpcService().getExecutor());
}
 
Example #8
Source File: SlotManager.java    From Flink-CEPplus with Apache License 2.0 6 votes vote down vote up
/**
 * Requests new resources for the given profile and registers a pending task manager slot
 * for every resource profile returned by {@code resourceActions.allocateResource}.
 *
 * @param resourceProfile profile to request resources for
 * @return the pending slot created for the first returned profile, or an empty optional if
 *     no profiles were returned
 * @throws ResourceManagerException if the resource request fails
 */
private Optional<PendingTaskManagerSlot> allocateResource(ResourceProfile resourceProfile) throws ResourceManagerException {
	final Collection<ResourceProfile> requestedSlots = resourceActions.allocateResource(resourceProfile);

	if (requestedSlots.isEmpty()) {
		return Optional.empty();
	}

	// register every requested slot as pending, remembering the first one for the caller
	PendingTaskManagerSlot firstPendingSlot = null;
	for (ResourceProfile requestedSlot : requestedSlots) {
		final PendingTaskManagerSlot pendingSlot = new PendingTaskManagerSlot(requestedSlot);
		pendingSlots.put(pendingSlot.getTaskManagerSlotId(), pendingSlot);

		if (firstPendingSlot == null) {
			firstPendingSlot = pendingSlot;
		}
	}

	return Optional.of(firstPendingSlot);
}
 
Example #9
Source File: SlotManager.java    From Flink-CEPplus with Apache License 2.0 6 votes vote down vote up
/**
 * Retries a slot request after its allocation on a slot has failed.
 *
 * <p>A request that is still pending gets its request future cleared and is passed to
 * {@code internalRequestSlot} once more. If this throws a {@link ResourceManagerException},
 * the request is removed from the pending requests and {@code resourceActions} is notified
 * about the allocation failure.
 *
 * @param slotId identifying the slot which was assigned to the slot request before
 * @param allocationId identifying the failed slot request
 * @param cause of the failure
 */
private void handleFailedSlotRequest(SlotID slotId, AllocationID allocationId, Throwable cause) {
	final PendingSlotRequest requestToRetry = pendingSlotRequests.get(allocationId);

	LOG.debug("Slot request with allocation id {} failed for slot {}.", allocationId, slotId, cause);

	if (requestToRetry == null) {
		// no such pending request, so there is nothing left to retry
		LOG.debug("There was not pending slot request with allocation id {}. Probably the request has been fulfilled or cancelled.", allocationId);
		return;
	}

	requestToRetry.setRequestFuture(null);

	try {
		internalRequestSlot(requestToRetry);
	} catch (ResourceManagerException retryFailure) {
		pendingSlotRequests.remove(allocationId);

		resourceActions.notifyAllocationFailure(requestToRetry.getJobId(), allocationId, retryFailure);
	}
}
 
Example #10
Source File: ResourceManagerJobMasterTest.java    From Flink-CEPplus with Apache License 2.0 6 votes vote down vote up
/**
 * Registers a job master for a job id that the HA services do not know and expects the
 * returned registration future to fail with a {@link ResourceManagerException}.
 */
@Test
public void testRegisterJobMasterWithFailureLeaderListener() throws Exception {
	final JobID unknownJobId = new JobID();

	// this should fail because we try to register a job leader listener for an unknown job id
	final CompletableFuture<RegistrationResponse> pendingRegistration = resourceManagerGateway.registerJobManager(
		jobMasterGateway.getFencingToken(),
		jobMasterResourceId,
		jobMasterGateway.getAddress(),
		unknownJobId,
		TIMEOUT);

	try {
		pendingRegistration.get(TIMEOUT.toMilliseconds(), TimeUnit.MILLISECONDS);
		fail("Expected to fail with a ResourceManagerException.");
	} catch (ExecutionException wrapped) {
		assertTrue(ExceptionUtils.stripExecutionException(wrapped) instanceof ResourceManagerException);
	}

	// ignore the reported error
	testingFatalErrorHandler.clearError();
}
 
Example #11
Source File: SlotManagerFailUnfulfillableTest.java    From flink with Apache License 2.0 6 votes vote down vote up
/**
 * Checks that a slot request whose profile differs from the only registered free slot
 * (second constructor argument 200 instead of 100) is rejected with an exception caused by
 * an {@link UnfulfillableSlotRequestException}, without recording an allocation failure or
 * leaving a pending slot request behind.
 */
@Test
public void testUnfulfillableRequestsFailWhenOn() {
	// setup
	final ResourceProfile registeredProfile = new ResourceProfile(2.0, 100);
	final ResourceProfile requestedProfile = new ResourceProfile(2.0, 200);

	final List<Tuple3<JobID, AllocationID, Exception>> allocationFailures = new ArrayList<>();
	final SlotManager slotManager = createSlotManagerNotStartingNewTMs(allocationFailures);
	registerFreeSlot(slotManager, registeredProfile);

	// test
	try {
		slotManager.registerSlotRequest(slotRequest(requestedProfile));
		fail("this should cause an exception");
	} catch (ResourceManagerException expected) {
		final boolean causedByUnfulfillableRequest =
			ExceptionUtils.findThrowable(expected, UnfulfillableSlotRequestException.class).isPresent();
		assertTrue(causedByUnfulfillableRequest);
	}

	// assert
	assertEquals(0, allocationFailures.size());
	assertEquals(0, slotManager.getNumberPendingSlotRequests());
}
 
Example #12
Source File: ResourceManagerJobMasterTest.java    From flink with Apache License 2.0 6 votes vote down vote up
/**
 * Ensures that a job master registration for a job id unknown to the HA services is
 * answered with a future that fails with a {@link ResourceManagerException}.
 */
@Test
public void testRegisterJobMasterWithFailureLeaderListener() throws Exception {
	final JobID jobIdWithoutLeader = new JobID();

	// this should fail because we try to register a job leader listener for an unknown job id
	final CompletableFuture<RegistrationResponse> registrationResult = resourceManagerGateway.registerJobManager(
		jobMasterGateway.getFencingToken(),
		jobMasterResourceId,
		jobMasterGateway.getAddress(),
		jobIdWithoutLeader,
		TIMEOUT);

	try {
		registrationResult.get(TIMEOUT.toMilliseconds(), TimeUnit.MILLISECONDS);
		fail("Expected to fail with a ResourceManagerException.");
	} catch (ExecutionException registrationFailure) {
		assertTrue(ExceptionUtils.stripExecutionException(registrationFailure) instanceof ResourceManagerException);
	}

	// ignore the reported error
	testingFatalErrorHandler.clearError();
}
 
Example #13
Source File: SlotManagerImpl.java    From flink with Apache License 2.0 6 votes vote down vote up
/**
 * Handles a slot request that failed on the slot it had been assigned to by trying to
 * fulfil it again.
 *
 * <p>For a request that is still pending, the request future is reset before
 * {@code internalRequestSlot} is invoked. A {@link ResourceManagerException} thrown by that
 * call causes the request to be removed from the pending requests and the allocation
 * failure to be reported via {@code resourceActions}.
 *
 * @param slotId identifying the slot which was assigned to the slot request before
 * @param allocationId identifying the failed slot request
 * @param cause of the failure
 */
private void handleFailedSlotRequest(SlotID slotId, AllocationID allocationId, Throwable cause) {
	final PendingSlotRequest stillPending = pendingSlotRequests.get(allocationId);

	LOG.debug("Slot request with allocation id {} failed for slot {}.", allocationId, slotId, cause);

	final boolean requestIsGone = stillPending == null;
	if (requestIsGone) {
		LOG.debug("There was not pending slot request with allocation id {}. Probably the request has been fulfilled or cancelled.", allocationId);
		return;
	}

	stillPending.setRequestFuture(null);

	try {
		internalRequestSlot(stillPending);
	} catch (ResourceManagerException retryFailure) {
		// retrying did not work either: drop the request and tell the requester
		pendingSlotRequests.remove(allocationId);
		resourceActions.notifyAllocationFailure(
			stillPending.getJobId(),
			allocationId,
			retryFailure);
	}
}
 
Example #14
Source File: MesosResourceManager.java    From flink with Apache License 2.0 6 votes vote down vote up
/**
 * Asynchronously loads the workers that a prior incarnation of the RM persisted in the
 * worker store.
 *
 * <p>Recovered workers in state {@code New} are removed from the store. The future
 * completes with the list returned by {@code recoverWorkers()}; exceptions are wrapped in a
 * {@link ResourceManagerException} and surfaced through a {@link CompletionException}.
 *
 * @return future holding the workers recovered from the worker store
 */
private CompletableFuture<List<MesosWorkerStore.Worker>> getWorkersAsync() {
	return CompletableFuture.supplyAsync(() -> {
		try {
			// if this resource manager is recovering from failure,
			// then some worker tasks are most likely still alive and we can re-obtain them
			final List<MesosWorkerStore.Worker> persistedWorkers = workerStore.recoverWorkers();
			for (final MesosWorkerStore.Worker persisted : persistedWorkers) {
				if (persisted.state() != MesosWorkerStore.WorkerState.New) {
					continue;
				}
				// remove new workers because allocation requests are transient
				workerStore.removeWorker(persisted.taskID());
			}
			return persistedWorkers;
		} catch (final Exception storeFailure) {
			throw new CompletionException(new ResourceManagerException(storeFailure));
		}
	}, getRpcService().getExecutor());
}
 
Example #15
Source File: SlotManagerImpl.java    From flink with Apache License 2.0 6 votes vote down vote up
/**
 * Tries to back the given slot request with a pending task manager slot.
 *
 * <p>A free matching pending slot is preferred; if there is none, new resources are
 * requested via {@code allocateResource}. When a pending slot is obtained it is assigned to
 * the request. When none is obtained and {@code failUnfulfillableRequest} is set, the
 * request is rejected unless registered or pending slots could fulfil its profile.
 *
 * @param pendingSlotRequest request to find a pending task manager slot for
 * @throws ResourceManagerException if requesting resources fails or the request is
 *     considered unfulfillable
 */
private void fulfillPendingSlotRequestWithPendingTaskManagerSlot(PendingSlotRequest pendingSlotRequest) throws ResourceManagerException {
	final ResourceProfile requestedProfile = pendingSlotRequest.getResourceProfile();

	Optional<PendingTaskManagerSlot> pendingSlot = findFreeMatchingPendingTaskManagerSlot(requestedProfile);
	if (!pendingSlot.isPresent()) {
		pendingSlot = allocateResource(requestedProfile);
	}

	if (pendingSlot.isPresent()) {
		assignPendingTaskManagerSlot(pendingSlotRequest, pendingSlot.get());
		return;
	}

	// request can not be fulfilled by any free slot or pending slot that can be allocated,
	// check whether it can be fulfilled by allocated slots
	if (failUnfulfillableRequest && !isFulfillableByRegisteredOrPendingSlots(pendingSlotRequest.getResourceProfile())) {
		throw new UnfulfillableSlotRequestException(pendingSlotRequest.getAllocationId(), pendingSlotRequest.getResourceProfile());
	}
}
 
Example #16
Source File: SlotManagerImpl.java    From flink with Apache License 2.0 6 votes vote down vote up
/**
 * Tries to serve the given slot request.
 *
 * <p>The lookup order is: a matching slot (allocated immediately), then a free matching
 * pending task manager slot, then newly requested resources via {@code allocateResource}.
 * If none of these yields a slot and {@code failUnfulfillableRequest} is set, the request is
 * rejected unless the registered slots could fulfil its profile.
 *
 * @param pendingSlotRequest to allocate a slot for
 * @throws ResourceManagerException if the slot request failed or is unfulfillable
 */
private void internalRequestSlot(PendingSlotRequest pendingSlotRequest) throws ResourceManagerException {
	final ResourceProfile requestedProfile = pendingSlotRequest.getResourceProfile();
	final TaskManagerSlot matchingSlot = findMatchingSlot(requestedProfile);

	if (matchingSlot != null) {
		allocateSlot(matchingSlot, pendingSlotRequest);
		return;
	}

	Optional<PendingTaskManagerSlot> pendingSlot = findFreeMatchingPendingTaskManagerSlot(requestedProfile);
	if (!pendingSlot.isPresent()) {
		pendingSlot = allocateResource(requestedProfile);
	}

	if (pendingSlot.isPresent()) {
		assignPendingTaskManagerSlot(pendingSlotRequest, pendingSlot.get());
		return;
	}

	// request can not be fulfilled by any free slot or pending slot that can be allocated,
	// check whether it can be fulfilled by allocated slots
	final boolean rejectRequest =
		failUnfulfillableRequest && !isFulfillableByRegisteredSlots(pendingSlotRequest.getResourceProfile());
	if (rejectRequest) {
		throw new UnfulfillableSlotRequestException(pendingSlotRequest.getAllocationId(), pendingSlotRequest.getResourceProfile());
	}
}
 
Example #17
Source File: SlotManagerImplTest.java    From flink with Apache License 2.0 6 votes vote down vote up
/**
 * Tests that registering a slot request throws a {@link ResourceManagerException} when the
 * resource actions' allocate-resource function reports {@code false}.
 */
@Test
public void testSlotRequestWithResourceAllocationFailure() throws Exception {
	final ResourceManagerId rmId = ResourceManagerId.generate();
	final ResourceProfile requestedProfile = ResourceProfile.fromResources(42.0, 1337);
	final SlotRequest request = new SlotRequest(
		new JobID(),
		new AllocationID(),
		requestedProfile,
		"localhost");

	// resource actions whose allocate-resource function always yields false
	final ResourceActions failingResourceActions = new TestingResourceActionsBuilder()
		.setAllocateResourceFunction(value -> false)
		.build();

	try (SlotManager slotManager = createSlotManager(rmId, failingResourceActions)) {
		slotManager.registerSlotRequest(request);

		fail("The slot request should have failed with a ResourceManagerException.");
	} catch (ResourceManagerException expected) {
		// expected exception
	}
}
 
Example #18
Source File: ResourceManager.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Cleans up the application and shuts down the cluster.
 *
 * <p>A {@link ResourceManagerException} thrown by {@code internalDeregisterApplication} is
 * only logged as a warning; the returned future is completed with an acknowledgement either
 * way.
 *
 * @param finalStatus of the Flink application
 * @param diagnostics diagnostics message for the Flink application or {@code null}
 * @return an already completed future carrying the acknowledgement
 */
@Override
public CompletableFuture<Acknowledge> deregisterApplication(
		final ApplicationStatus finalStatus,
		@Nullable final String diagnostics) {
	log.info("Shut down cluster because application is in {}, diagnostics {}.", finalStatus, diagnostics);

	try {
		internalDeregisterApplication(finalStatus, diagnostics);
	} catch (ResourceManagerException deregistrationFailure) {
		// best effort: the caller is acknowledged even if the deregistration failed
		log.warn("Could not properly shutdown the application.", deregistrationFailure);
	}

	final Acknowledge acknowledgement = Acknowledge.get();
	return CompletableFuture.completedFuture(acknowledgement);
}
 
Example #19
Source File: MesosResourceManager.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Accept offers as advised by the launch coordinator.
 *
 * <p>Acceptance is routed through the RM to update the persistent state before
 * forwarding the message to Mesos. For every task of a {@code LAUNCH} operation the worker
 * is moved from {@code workersInNew} to {@code workersInLaunch} and stored in its launched
 * state; afterwards the task monitor is informed and the acceptance is sent to the
 * scheduler driver.
 *
 * <p>Any exception, including a task for which no worker in state New is tracked, is
 * reported via {@code onFatalError}.
 *
 * @param msg the offers to accept together with the operations to perform on them
 */
public void acceptOffers(AcceptOffers msg) {
	try {
		List<TaskMonitor.TaskGoalStateUpdated> toMonitor = new ArrayList<>(msg.operations().size());

		// transition the persistent state of some tasks to Launched
		for (Protos.Offer.Operation op : msg.operations()) {
			if (op.getType() != Protos.Offer.Operation.Type.LAUNCH) {
				continue;
			}

			for (Protos.TaskInfo info : op.getLaunch().getTaskInfosList()) {
				final ResourceID resourceId = extractResourceID(info.getTaskId());
				MesosWorkerStore.Worker worker = workersInNew.remove(resourceId);
				if (worker == null) {
					// An explicit check instead of an assert: assertions are disabled by default,
					// which would turn this into an uninformative NullPointerException, and an
					// AssertionError would bypass the catch block below.
					throw new IllegalStateException(
						"No worker in state New is tracked for resource id " + resourceId + '.');
				}

				worker = worker.launchWorker(info.getSlaveId(), msg.hostname());
				workerStore.putWorker(worker);
				workersInLaunch.put(extractResourceID(worker.taskID()), worker);

				LOG.info("Launching Mesos task {} on host {}.",
					worker.taskID().getValue(), worker.hostname().get());

				toMonitor.add(new TaskMonitor.TaskGoalStateUpdated(extractGoalState(worker)));
			}
		}

		// tell the task monitor about the new plans
		for (TaskMonitor.TaskGoalStateUpdated update : toMonitor) {
			taskMonitor.tell(update, selfActor);
		}

		// send the acceptance message to Mesos
		schedulerDriver.acceptOffers(msg.offerIds(), msg.operations(), msg.filters());
	} catch (Exception ex) {
		onFatalError(new ResourceManagerException("unable to accept offers", ex));
	}
}
 
Example #20
Source File: KubernetesResourceManager.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Re-registers the pods found for this cluster's task manager labels as worker nodes and
 * advances the attempt counter.
 *
 * <p>{@code currentMaxAttemptId} is raised to the highest attempt seen among the recovered
 * workers and then incremented by one.
 *
 * @throws ResourceManagerException declared by the signature; presumably raised by the pod
 *     lookup, which is not visible here
 */
private void recoverWorkerNodesFromPreviousAttempts() throws ResourceManagerException {
	final List<KubernetesPod> recoveredPods = kubeClient.getPodsWithLabels(KubernetesUtils.getTaskManagerLabels(clusterId));
	for (KubernetesPod recoveredPod : recoveredPods) {
		final KubernetesWorkerNode recoveredWorker = new KubernetesWorkerNode(new ResourceID(recoveredPod.getName()));
		workerNodes.put(recoveredWorker.getResourceID(), recoveredWorker);

		final long recoveredAttempt = recoveredWorker.getAttempt();
		if (recoveredAttempt > currentMaxAttemptId) {
			currentMaxAttemptId = recoveredAttempt;
		}
	}

	// this attempt gets the id following the highest one seen so far
	++currentMaxAttemptId;

	log.info("Recovered {} pods from previous attempts, current attempt id is {}.",
		workerNodes.size(),
		currentMaxAttemptId);
}
 
Example #21
Source File: MesosResourceManager.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Handles a termination notification from a task monitor.
 *
 * <p>The worker is removed from the worker store first; a failure to do so is reported as a
 * fatal error. A worker that was being returned is only logged, any other worker is treated
 * as failed and a replacement is requested via {@code startNewWorker}. In both cases the
 * connection to the corresponding task manager is closed afterwards.
 *
 * @param message termination notice carrying the task id and its status
 */
public void taskTerminated(TaskMonitor.TaskTerminated message) {
	final Protos.TaskID terminatedTask = message.taskID();
	final Protos.TaskStatus terminalStatus = message.status();

	// note: this callback occurs for failed containers and for released containers alike
	final ResourceID resourceId = extractResourceID(terminatedTask);

	boolean wasStored;
	try {
		wasStored = workerStore.removeWorker(terminatedTask);
	} catch (Exception removalFailure) {
		onFatalError(new ResourceManagerException("unable to remove worker", removalFailure));
		return;
	}

	if (!wasStored) {
		LOG.info("Received a termination notice for an unrecognized worker: {}", resourceId);
		return;
	}

	// check if this is a failed task or a released task
	assert(!workersInNew.containsKey(resourceId));
	final boolean wasReleased = workersBeingReturned.remove(resourceId) != null;
	if (wasReleased) {
		// regular finished worker that we released
		LOG.info("Worker {} finished successfully with message: {}",
			resourceId, terminalStatus.getMessage());
	} else {
		// failed worker, either at startup, or running
		final MesosWorkerStore.Worker failedWorker = workersInLaunch.remove(resourceId);
		assert(failedWorker != null);
		LOG.info("Worker {} failed with status: {}, reason: {}, message: {}.",
			resourceId, terminalStatus.getState(), terminalStatus.getReason(), terminalStatus.getMessage());
		startNewWorker(failedWorker.workerResourceSpec());
	}

	closeTaskManagerConnection(resourceId, new Exception(terminalStatus.getMessage()));
}
 
Example #22
Source File: SlotManagerTest.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Tests that registering a slot request fails with a {@link ResourceManagerException} when
 * the resource actions' allocate-resource function throws one.
 */
@Test
public void testSlotRequestWithResourceAllocationFailure() throws Exception {
	final ResourceManagerId rmId = ResourceManagerId.generate();
	final ResourceProfile requestedProfile = new ResourceProfile(42.0, 1337);
	final SlotRequest request = new SlotRequest(
		new JobID(),
		new AllocationID(),
		requestedProfile,
		"localhost");

	// resource actions that fail every attempt to allocate a resource
	final ResourceActions failingResourceActions = new TestingResourceActionsBuilder()
		.setAllocateResourceFunction(value -> {
			throw new ResourceManagerException("Test exception");
		})
		.build();

	try (SlotManager slotManager = createSlotManager(rmId, failingResourceActions)) {
		slotManager.registerSlotRequest(request);

		fail("The slot request should have failed with a ResourceManagerException.");
	} catch (ResourceManagerException expected) {
		// expected exception
	}
}
 
Example #23
Source File: ResourceManager.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Accepts the initial slot report of a task manager.
 *
 * <p>The report is forwarded to the slot manager only if a task executor is registered
 * under the given resource id and its instance id equals the given registration id. If the
 * slot manager reports the task manager as newly registered, {@code onTaskManagerRegistration}
 * is invoked.
 *
 * @param taskManagerResourceId resource id of the reporting task manager
 * @param taskManagerRegistrationId instance id the task manager was registered with
 * @param slotReport slots reported by the task manager
 * @param timeout timeout of the call; not used by this implementation
 * @return a completed acknowledgement, or a future failed with a
 *     {@link ResourceManagerException} if the resource id or registration id is unknown
 */
@Override
public CompletableFuture<Acknowledge> sendSlotReport(ResourceID taskManagerResourceId, InstanceID taskManagerRegistrationId, SlotReport slotReport, Time timeout) {
	final WorkerRegistration<WorkerType> workerTypeWorkerRegistration = taskExecutors.get(taskManagerResourceId);

	// The lookup yields no registration for a task manager that is not (or no longer)
	// registered; answer with a failed future instead of a NullPointerException.
	if (workerTypeWorkerRegistration == null) {
		return FutureUtils.completedExceptionally(new ResourceManagerException(String.format("Unknown TaskManager resource id %s.", taskManagerResourceId)));
	}

	if (workerTypeWorkerRegistration.getInstanceID().equals(taskManagerRegistrationId)) {
		if (slotManager.registerTaskManager(workerTypeWorkerRegistration, slotReport)) {
			onTaskManagerRegistration(workerTypeWorkerRegistration);
		}
		return CompletableFuture.completedFuture(Acknowledge.get());
	} else {
		return FutureUtils.completedExceptionally(new ResourceManagerException(String.format("Unknown TaskManager registration id %s.", taskManagerRegistrationId)));
	}
}
 
Example #24
Source File: ResourceManager.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Registers a slot request with the slot manager on behalf of a job master.
 *
 * <p>The request is accepted only if a job manager is registered for the request's job and
 * the given job master id equals the registered one.
 *
 * @param jobMasterId id of the job master issuing the request
 * @param slotRequest the slot request to register
 * @param timeout timeout of the call; not used by this implementation
 * @return a completed acknowledgement, or a future failed with a
 *     {@link ResourceManagerException} if the job manager is unknown, the job master id
 *     does not match, or the slot manager rejects the request
 */
@Override
public CompletableFuture<Acknowledge> requestSlot(
		JobMasterId jobMasterId,
		SlotRequest slotRequest,
		final Time timeout) {

	final JobID jobId = slotRequest.getJobId();
	final JobManagerRegistration registration = jobManagerRegistrations.get(jobId);

	if (registration == null) {
		return FutureUtils.completedExceptionally(new ResourceManagerException("Could not find registered job manager for job " + jobId + '.'));
	}

	if (!Objects.equals(jobMasterId, registration.getJobMasterId())) {
		return FutureUtils.completedExceptionally(new ResourceManagerException("The job leader's id " +
			registration.getJobMasterId() + " does not match the received id " + jobMasterId + '.'));
	}

	log.info("Request slot with profile {} for job {} with allocation id {}.",
		slotRequest.getResourceProfile(),
		slotRequest.getJobId(),
		slotRequest.getAllocationId());

	try {
		slotManager.registerSlotRequest(slotRequest);
	} catch (ResourceManagerException registrationFailure) {
		return FutureUtils.completedExceptionally(registrationFailure);
	}

	return CompletableFuture.completedFuture(Acknowledge.get());
}
 
Example #25
Source File: ResourceManager.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Clears the job manager and task executor registrations as well as the job leader id
 * service, then starts the asynchronous clean-up and keeps its future in
 * {@code clearStateFuture}.
 *
 * <p>A failure to clear the job leader id service is reported via {@code onFatalError}; the
 * asynchronous clean-up is started regardless.
 */
private void clearStateInternal() {
	// forget all registrations
	jobManagerRegistrations.clear();
	jmResourceIdRegistrations.clear();
	taskExecutors.clear();

	try {
		jobLeaderIdService.clear();
	} catch (Exception clearFailure) {
		final ResourceManagerException fatalError =
			new ResourceManagerException("Could not properly clear the job leader id service.", clearFailure);
		onFatalError(fatalError);
	}

	clearStateFuture = clearStateAsync();
}
 
Example #26
Source File: ResourceManager.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Resets the internal state: all job manager and task executor registrations are dropped,
 * the job leader id service is cleared, and the asynchronous clean-up is triggered with its
 * future stored in {@code clearStateFuture}.
 *
 * <p>If clearing the job leader id service throws, the exception is passed to
 * {@code onFatalError} and the method continues with the asynchronous clean-up.
 */
private void clearStateInternal() {
	jobManagerRegistrations.clear();
	jmResourceIdRegistrations.clear();
	taskExecutors.clear();

	try {
		jobLeaderIdService.clear();
	} catch (Exception serviceFailure) {
		// wrap the cause so the fatal error carries a description of what failed
		final String description = "Could not properly clear the job leader id service.";
		onFatalError(new ResourceManagerException(description, serviceFailure));
	}

	clearStateFuture = clearStateAsync();
}
 
Example #27
Source File: ResourceManager.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Cleanup application and shut down cluster.
 *
 * <p>The actual work is delegated to {@code internalDeregisterApplication}. Should it throw
 * a {@link ResourceManagerException}, a warning is logged and the call is acknowledged
 * nonetheless.
 *
 * @param finalStatus of the Flink application
 * @param diagnostics diagnostics message for the Flink application or {@code null}
 * @return an already completed future carrying the acknowledgement
 */
@Override
public CompletableFuture<Acknowledge> deregisterApplication(
		final ApplicationStatus finalStatus,
		@Nullable final String diagnostics) {
	log.info("Shut down cluster because application is in {}, diagnostics {}.", finalStatus, diagnostics);

	try {
		internalDeregisterApplication(finalStatus, diagnostics);
	} catch (ResourceManagerException shutdownFailure) {
		// not propagated: the caller receives an acknowledgement in any case
		log.warn("Could not properly shutdown the application.", shutdownFailure);
	}

	final Acknowledge ack = Acknowledge.get();
	return CompletableFuture.completedFuture(ack);
}
 
Example #28
Source File: ResourceManager.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Handles a slot request sent by a job master.
 *
 * <p>The request is passed to the slot manager only when a job manager registration exists
 * for the request's job and its job master id equals the given one; every other case, as
 * well as a rejection by the slot manager, results in an exceptionally completed future.
 *
 * @param jobMasterId id of the job master issuing the request
 * @param slotRequest the slot request to register
 * @param timeout timeout of the call; not used by this implementation
 * @return a completed acknowledgement, or a future failed with a
 *     {@link ResourceManagerException}
 */
@Override
public CompletableFuture<Acknowledge> requestSlot(
		JobMasterId jobMasterId,
		SlotRequest slotRequest,
		final Time timeout) {

	final JobID jobId = slotRequest.getJobId();
	final JobManagerRegistration knownJobManager = jobManagerRegistrations.get(jobId);

	// unknown job: no job manager has registered for it
	if (knownJobManager == null) {
		return FutureUtils.completedExceptionally(new ResourceManagerException("Could not find registered job manager for job " + jobId + '.'));
	}

	// the sender is not the job master that is registered for this job
	final boolean senderIsRegisteredLeader = Objects.equals(jobMasterId, knownJobManager.getJobMasterId());
	if (!senderIsRegisteredLeader) {
		return FutureUtils.completedExceptionally(new ResourceManagerException("The job leader's id " +
			knownJobManager.getJobMasterId() + " does not match the received id " + jobMasterId + '.'));
	}

	log.info("Request slot with profile {} for job {} with allocation id {}.",
		slotRequest.getResourceProfile(),
		slotRequest.getJobId(),
		slotRequest.getAllocationId());

	try {
		slotManager.registerSlotRequest(slotRequest);
	} catch (ResourceManagerException rejected) {
		return FutureUtils.completedExceptionally(rejected);
	}

	return CompletableFuture.completedFuture(Acknowledge.get());
}
 
Example #29
Source File: ResourceManager.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Starts the resource manager services.
 *
 * <p>If starting them throws, the cause is wrapped in a {@link ResourceManagerException}
 * that is both reported via {@code onFatalError} and rethrown to the caller.
 *
 * @throws Exception the wrapping {@link ResourceManagerException} if the services could not
 *     be started
 */
@Override
public void onStart() throws Exception {
	try {
		startResourceManagerServices();
	} catch (Exception startupFailure) {
		final String description = String.format("Could not start the ResourceManager %s", getAddress());
		final ResourceManagerException fatalError = new ResourceManagerException(description, startupFailure);
		onFatalError(fatalError);
		throw fatalError;
	}
}
 
Example #30
Source File: SlotManagerTest.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Adapts a function yielding a number of slots into a function yielding a collection of
 * resource profiles.
 *
 * <p>The returned function applies {@code function} to its argument and returns a list that
 * contains the argument profile once per reported slot.
 *
 * @param function function computing the number of slots for a resource profile
 * @return function producing a list with that many copies of the given profile
 */
private static FunctionWithException<ResourceProfile, Collection<ResourceProfile>, ResourceManagerException> convert(FunctionWithException<ResourceProfile, Integer, ResourceManagerException> function) {
	return (ResourceProfile requestedProfile) -> {
		final int slotCount = function.apply(requestedProfile);

		final ArrayList<ResourceProfile> profiles = new ArrayList<>(slotCount);
		int remaining = slotCount;
		while (remaining > 0) {
			profiles.add(requestedProfile);
			remaining--;
		}

		return profiles;
	};
}