org.apache.flink.mesos.scheduler.TaskMonitor Java Examples

The following examples show how to use org.apache.flink.mesos.scheduler.TaskMonitor. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: MesosResourceManagerTest.java    From flink with Apache License 2.0 6 votes vote down vote up
/**
 * Test helper that requests one worker from the resource manager and waits for the request
 * to become visible.
 *
 * <p>The mocked worker store is primed to hand out {@code taskID}, the allocation is submitted
 * through {@code resourceManager.callAsync}, and the resulting probe messages are consumed.
 *
 * @param taskID the task ID the mocked worker store returns for the new worker.
 * @param resourceProfile the resource profile passed to {@code allocateResource}.
 * @return the worker built by {@code newWorker(taskID, resourceProfile)}, which is the instance
 *         expected in the store and in {@code workersInNew}.
 * @throws Exception if a wait times out or the allocation call fails.
 */
public MesosWorkerStore.Worker allocateWorker(Protos.TaskID taskID, ResourceProfile resourceProfile) throws Exception {
	final long timeoutMillis = timeout.toMilliseconds();

	when(rmServices.workerStore.newTaskID()).thenReturn(taskID);
	rmServices.slotManagerStarted.get(timeoutMillis, TimeUnit.MILLISECONDS);

	CompletableFuture<Void> pendingAllocation = resourceManager.callAsync(
		() -> {
			rmServices.rmActions.allocateResource(resourceProfile);
			return null;
		},
		timeout);
	MesosWorkerStore.Worker expectedWorker = MesosWorkerStore.Worker.newWorker(taskID, resourceProfile);

	// surfaces any exception thrown by the allocation call
	pendingAllocation.get(timeoutMillis, TimeUnit.MILLISECONDS);

	// the worker must have been persisted and tracked as "new"
	verify(rmServices.workerStore, Mockito.timeout(timeoutMillis)).putWorker(expectedWorker);
	assertThat(resourceManager.workersInNew, hasEntry(extractResourceID(taskID), expectedWorker));

	// consume the messages sent to the probes
	resourceManager.taskRouter.expectMsgClass(TaskMonitor.TaskGoalStateUpdated.class);
	resourceManager.launchCoordinator.expectMsgClass(LaunchCoordinator.Launch.class);
	return expectedWorker;
}
 
Example #2
Source File: MesosResourceManager.java    From Flink-CEPplus with Apache License 2.0 6 votes vote down vote up
/**
 * Persists a new worker, tracks it in {@code workersInNew} and asks the task monitor and the
 * launch coordinator to act on it.
 *
 * @param resourceProfile the resource profile recorded with the new worker.
 * @return {@code slotsPerWorker} on success, or an empty list after a failure was reported
 *         through {@code onFatalError}.
 */
@Override
public Collection<ResourceProfile> startNewWorker(ResourceProfile resourceProfile) {
	LOG.info("Starting a new worker.");
	try {
		// write the worker to the store before tracking it in memory
		final MesosWorkerStore.Worker pendingWorker =
			MesosWorkerStore.Worker.newWorker(workerStore.newTaskID(), resourceProfile);
		workerStore.putWorker(pendingWorker);
		workersInNew.put(extractResourceID(pendingWorker.taskID()), pendingWorker);

		final LaunchableMesosWorker launchableWorker = createLaunchableMesosWorker(pendingWorker.taskID());

		LOG.info(
			"Scheduling Mesos task {} with ({} MB, {} cpus).",
			launchableWorker.taskID().getValue(),
			launchableWorker.taskRequest().getMemory(),
			launchableWorker.taskRequest().getCPUs());

		// announce the new goal state to the task monitor
		taskMonitor.tell(new TaskMonitor.TaskGoalStateUpdated(extractGoalState(pendingWorker)), selfActor);

		// hand the launchable task to the launch coordinator
		launchCoordinator.tell(new LaunchCoordinator.Launch(Collections.singletonList(launchableWorker)), selfActor);

		return slotsPerWorker;
	} catch (Exception ex) {
		onFatalError(new ResourceManagerException("Unable to request new workers.", ex));
		return Collections.emptyList();
	}
}
 
Example #3
Source File: MesosResourceManagerTest.java    From flink with Apache License 2.0 6 votes vote down vote up
/**
 * Tests the unplanned failure of a launched worker: the worker is removed from the store,
 * a replacement shows up in {@code workersInNew}, and the task manager connection is closed.
 */
@Test
public void testWorkerFailed() throws Exception {
	new Context() {{
		// recover a single launched worker from the mocked persistent state
		final MesosWorkerStore.Worker launchedWorker =
			MesosWorkerStore.Worker.newWorker(task1, workerResourceSpec).launchWorker(slave1, slave1host);
		when(rmServices.workerStore.getFrameworkID()).thenReturn(Option.apply(framework1));
		when(rmServices.workerStore.recoverWorkers()).thenReturn(singletonList(launchedWorker));
		when(rmServices.workerStore.newTaskID()).thenReturn(task2);
		startResourceManager();

		// report TASK_FAILED for the recovered task
		when(rmServices.workerStore.removeWorker(task1)).thenReturn(true);
		final Protos.TaskStatus failedStatus = Protos.TaskStatus.newBuilder()
			.setTaskId(task1)
			.setSlaveId(slave1)
			.setState(Protos.TaskState.TASK_FAILED)
			.build();
		resourceManager.taskTerminated(new TaskMonitor.TaskTerminated(task1, failedStatus));

		// the failed worker is gone and a replacement (task2) is tracked as "new"
		verify(rmServices.workerStore).removeWorker(task1);
		assertThat(resourceManager.workersInLaunch.entrySet(), empty());
		assertThat(resourceManager.workersBeingReturned.entrySet(), empty());
		assertThat(resourceManager.workersInNew, hasKey(extractResourceID(task2)));

		// `closeTaskManagerConnection` must have been invoked for the failed worker
		assertThat(resourceManager.closedTaskManagerConnections, hasItem(extractResourceID(task1)));
	}};
}
 
Example #4
Source File: MesosResourceManagerTest.java    From flink with Apache License 2.0 6 votes vote down vote up
/**
 * Test helper that requests one worker from the resource manager and waits for the request
 * to become visible.
 *
 * <p>The mocked worker store is primed to hand out {@code taskID}, the allocation is submitted
 * through {@code resourceManager.callAsync}, and the resulting probe messages are consumed.
 *
 * @param taskID the task ID the mocked worker store returns for the new worker.
 * @param workerResourceSpec the resource spec passed to {@code allocateResource}.
 * @return the worker built by {@code newWorker(taskID, workerResourceSpec)}, which is the
 *         instance expected in the store and in {@code workersInNew}.
 * @throws Exception if a wait times out or the allocation call fails.
 */
public MesosWorkerStore.Worker allocateWorker(Protos.TaskID taskID, WorkerResourceSpec workerResourceSpec) throws Exception {
	final long timeoutMillis = timeout.toMilliseconds();

	when(rmServices.workerStore.newTaskID()).thenReturn(taskID);
	rmServices.slotManagerStarted.get(timeoutMillis, TimeUnit.MILLISECONDS);

	CompletableFuture<Void> pendingAllocation = resourceManager.callAsync(
		() -> {
			rmServices.rmActions.allocateResource(workerResourceSpec);
			return null;
		},
		timeout);
	MesosWorkerStore.Worker expectedWorker = MesosWorkerStore.Worker.newWorker(taskID, workerResourceSpec);

	// surfaces any exception thrown by the allocation call
	pendingAllocation.get(timeoutMillis, TimeUnit.MILLISECONDS);

	// the worker must have been persisted and tracked as "new"
	verify(rmServices.workerStore, Mockito.timeout(timeoutMillis)).putWorker(expectedWorker);
	assertThat(resourceManager.workersInNew, hasEntry(extractResourceID(taskID), expectedWorker));

	// consume the messages sent to the probes
	resourceManager.taskRouter.expectMsgClass(TaskMonitor.TaskGoalStateUpdated.class);
	resourceManager.launchCoordinator.expectMsgClass(LaunchCoordinator.Launch.class);
	return expectedWorker;
}
 
Example #5
Source File: MesosResourceManagerTest.java    From flink with Apache License 2.0 6 votes vote down vote up
/**
 * Tests the unplanned failure of a launched worker: the worker is removed from the store,
 * a replacement shows up in {@code workersInNew}, and the task manager connection is closed.
 */
@Test
public void testWorkerFailed() throws Exception {
	new Context() {{
		// recover a single launched worker from the mocked persistent state
		final MesosWorkerStore.Worker launchedWorker =
			MesosWorkerStore.Worker.newWorker(task1).launchWorker(slave1, slave1host);
		when(rmServices.workerStore.getFrameworkID()).thenReturn(Option.apply(framework1));
		when(rmServices.workerStore.recoverWorkers()).thenReturn(singletonList(launchedWorker));
		when(rmServices.workerStore.newTaskID()).thenReturn(task2);
		startResourceManager();

		// report TASK_FAILED for the recovered task
		when(rmServices.workerStore.removeWorker(task1)).thenReturn(true);
		final Protos.TaskStatus failedStatus = Protos.TaskStatus.newBuilder()
			.setTaskId(task1)
			.setSlaveId(slave1)
			.setState(Protos.TaskState.TASK_FAILED)
			.build();
		resourceManager.taskTerminated(new TaskMonitor.TaskTerminated(task1, failedStatus));

		// the failed worker is gone and a replacement (task2) is tracked as "new"
		verify(rmServices.workerStore).removeWorker(task1);
		assertThat(resourceManager.workersInLaunch.entrySet(), empty());
		assertThat(resourceManager.workersBeingReturned.entrySet(), empty());
		assertThat(resourceManager.workersInNew, hasKey(extractResourceID(task2)));

		// `closeTaskManagerConnection` must have been invoked for the failed worker
		assertThat(resourceManager.closedTaskManagerConnections, hasItem(extractResourceID(task1)));
	}};
}
 
Example #6
Source File: MesosResourceManagerTest.java    From Flink-CEPplus with Apache License 2.0 6 votes vote down vote up
/**
 * Test helper that requests one worker from the resource manager and waits for the request
 * to become visible.
 *
 * <p>The mocked worker store is primed to hand out {@code taskID}, the allocation is submitted
 * through {@code resourceManager.callAsync}, and the resulting probe messages are consumed.
 *
 * @param taskID the task ID the mocked worker store returns for the new worker.
 * @param resourceProfile the resource profile passed to {@code allocateResource}.
 * @return the worker built by {@code newWorker(taskID, resourceProfile)}, which is the instance
 *         expected in the store and in {@code workersInNew}.
 * @throws Exception if a wait times out or the allocation call fails.
 */
public MesosWorkerStore.Worker allocateWorker(Protos.TaskID taskID, ResourceProfile resourceProfile) throws Exception {
	final long timeoutMillis = timeout.toMilliseconds();

	when(rmServices.workerStore.newTaskID()).thenReturn(taskID);
	rmServices.slotManagerStarted.get(timeoutMillis, TimeUnit.MILLISECONDS);

	CompletableFuture<Void> pendingAllocation = resourceManager.callAsync(
		() -> {
			rmServices.rmActions.allocateResource(resourceProfile);
			return null;
		},
		timeout);
	MesosWorkerStore.Worker expectedWorker = MesosWorkerStore.Worker.newWorker(taskID, resourceProfile);

	// surfaces any exception thrown by the allocation call
	pendingAllocation.get(timeoutMillis, TimeUnit.MILLISECONDS);

	// the worker must have been persisted and tracked as "new"
	verify(rmServices.workerStore, Mockito.timeout(timeoutMillis)).putWorker(expectedWorker);
	assertThat(resourceManager.workersInNew, hasEntry(extractResourceID(taskID), expectedWorker));

	// consume the messages sent to the probes
	resourceManager.taskRouter.expectMsgClass(TaskMonitor.TaskGoalStateUpdated.class);
	resourceManager.launchCoordinator.expectMsgClass(LaunchCoordinator.Launch.class);
	return expectedWorker;
}
 
Example #7
Source File: MesosResourceManagerTest.java    From Flink-CEPplus with Apache License 2.0 6 votes vote down vote up
/**
 * Tests the unplanned failure of a launched worker: the worker is removed from the store,
 * a replacement shows up in {@code workersInNew}, and the task manager connection is closed.
 */
@Test
public void testWorkerFailed() throws Exception {
	new Context() {{
		// recover a single launched worker from the mocked persistent state
		final MesosWorkerStore.Worker launchedWorker =
			MesosWorkerStore.Worker.newWorker(task1).launchWorker(slave1, slave1host);
		when(rmServices.workerStore.getFrameworkID()).thenReturn(Option.apply(framework1));
		when(rmServices.workerStore.recoverWorkers()).thenReturn(singletonList(launchedWorker));
		when(rmServices.workerStore.newTaskID()).thenReturn(task2);
		startResourceManager();

		// report TASK_FAILED for the recovered task
		when(rmServices.workerStore.removeWorker(task1)).thenReturn(true);
		final Protos.TaskStatus failedStatus = Protos.TaskStatus.newBuilder()
			.setTaskId(task1)
			.setSlaveId(slave1)
			.setState(Protos.TaskState.TASK_FAILED)
			.build();
		resourceManager.taskTerminated(new TaskMonitor.TaskTerminated(task1, failedStatus));

		// the failed worker is gone and a replacement (task2) is tracked as "new"
		verify(rmServices.workerStore).removeWorker(task1);
		assertThat(resourceManager.workersInLaunch.entrySet(), empty());
		assertThat(resourceManager.workersBeingReturned.entrySet(), empty());
		assertThat(resourceManager.workersInNew, hasKey(extractResourceID(task2)));

		// `closeTaskManagerConnection` must have been invoked for the failed worker
		assertThat(resourceManager.closedTaskManagerConnections, hasItem(extractResourceID(task1)));
	}};
}
 
Example #8
Source File: MesosResourceManager.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Handles a termination notification from a task monitor.
 *
 * <p>The worker is first removed from the worker store. A worker found in
 * {@code workersBeingReturned} is treated as a planned release; otherwise it is taken from
 * {@code workersInLaunch}, logged as failed and replaced through {@code startNewWorker}.
 * In both cases the task manager connection is closed afterwards.
 *
 * @param message the termination notice carrying the task ID and the final task status.
 */
public void taskTerminated(TaskMonitor.TaskTerminated message) {
	final Protos.TaskID terminatedTask = message.taskID();
	final Protos.TaskStatus finalStatus = message.status();

	// this callback fires for failed containers as well as for released ones
	final ResourceID id = extractResourceID(terminatedTask);

	boolean knownToStore;
	try {
		knownToStore = workerStore.removeWorker(terminatedTask);
	} catch (Exception ex) {
		onFatalError(new ResourceManagerException("unable to remove worker", ex));
		return;
	}

	if (!knownToStore) {
		LOG.info("Received a termination notice for an unrecognized worker: {}", id);
		return;
	}

	// distinguish a released task from a failed one
	assert !workersInNew.containsKey(id);
	final boolean wasReleased = workersBeingReturned.remove(id) != null;
	if (wasReleased) {
		// planned shutdown of a worker we released ourselves
		LOG.info("Worker {} finished successfully with message: {}",
			id, finalStatus.getMessage());
	} else {
		// the worker failed, either during startup or while running
		final MesosWorkerStore.Worker failedWorker = workersInLaunch.remove(id);
		// NOTE(review): this assert is the only null guard; with assertions disabled a missing
		// entry would surface as a NullPointerException at workerResourceSpec() below.
		assert failedWorker != null;
		LOG.info("Worker {} failed with status: {}, reason: {}, message: {}.",
			id, finalStatus.getState(), finalStatus.getReason(), finalStatus.getMessage());
		startNewWorker(failedWorker.workerResourceSpec());
	}

	closeTaskManagerConnection(id, new Exception(finalStatus.getMessage()));
}
 
Example #9
Source File: MesosResourceManagerTest.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Tests recovery of persisted workers: launched and released workers are restored into the
 * matching maps, "new" workers are not tracked, and the launched worker is re-assigned.
 */
@Test
public void testRecoverWorkers() throws Exception {
	new Context() {{
		// one worker per persisted state: new, launched, released
		final MesosWorkerStore.Worker pendingWorker = MesosWorkerStore.Worker.newWorker(task1);
		final MesosWorkerStore.Worker launchedWorker =
			MesosWorkerStore.Worker.newWorker(task2).launchWorker(slave1, slave1host);
		final MesosWorkerStore.Worker releasedWorker =
			MesosWorkerStore.Worker.newWorker(task3).launchWorker(slave1, slave1host).releaseWorker();
		when(rmServices.workerStore.getFrameworkID()).thenReturn(Option.apply(framework1));
		when(rmServices.workerStore.recoverWorkers())
			.thenReturn(Arrays.asList(pendingWorker, launchedWorker, releasedWorker));
		startResourceManager();

		// internal state: the "new" worker is dropped, the other two are restored
		assertThat(resourceManager.workersInNew.entrySet(), empty());
		assertThat(resourceManager.workersInLaunch, hasEntry(extractResourceID(task2), launchedWorker));
		assertThat(resourceManager.workersBeingReturned, hasEntry(extractResourceID(task3), releasedWorker));

		// the task router was notified
		resourceManager.taskRouter.expectMsgClass(TaskMonitor.TaskGoalStateUpdated.class);

		// exactly one assignment (task2 on slave1host) reached the launch coordinator
		LaunchCoordinator.Assign assignMsg =
			resourceManager.launchCoordinator.expectMsgClass(LaunchCoordinator.Assign.class);
		assertThat(assignMsg.tasks(), hasSize(1));
		assertThat(assignMsg.tasks().get(0).f0.getId(), equalTo(task2.getValue()));
		assertThat(assignMsg.tasks().get(0).f1, equalTo(slave1host));
		resourceManager.launchCoordinator.expectNoMsg();
	}};
}
 
Example #10
Source File: MesosResourceManagerTest.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Tests a request for a new worker: the worker is persisted, tracked in {@code workersInNew},
 * and both the task router and the launch coordinator are notified.
 */
@Test
public void testRequestNewWorkers() throws Exception {
	new Context() {{
		startResourceManager();

		final long timeoutMillis = timeout.toMilliseconds();

		// the store hands out task1 once; any further ID request fails the test
		when(rmServices.workerStore.newTaskID()).thenReturn(task1).thenThrow(new AssertionFailedError());
		rmServices.slotManagerStarted.get(timeoutMillis, TimeUnit.MILLISECONDS);

		CompletableFuture<Void> pendingAllocation = resourceManager.callAsync(
			() -> {
				rmServices.rmActions.allocateResource(resourceProfile1);
				return null;
			},
			timeout);

		// surfaces any exception thrown by the allocation call
		pendingAllocation.get(timeoutMillis, TimeUnit.MILLISECONDS);

		// the new worker was persisted and is tracked as "new"
		MesosWorkerStore.Worker expectedWorker = MesosWorkerStore.Worker.newWorker(task1, resourceProfile1);
		verify(rmServices.workerStore, Mockito.timeout(timeoutMillis)).putWorker(expectedWorker);
		assertThat(resourceManager.workersInNew, hasEntry(extractResourceID(task1), expectedWorker));

		// the task router and the launch coordinator both received their message
		resourceManager.taskRouter.expectMsgClass(TaskMonitor.TaskGoalStateUpdated.class);
		resourceManager.launchCoordinator.expectMsgClass(LaunchCoordinator.Launch.class);
	}};
}
 
Example #11
Source File: MesosResourceManagerTest.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Tests offer acceptance: the worker moves from {@code workersInNew} to
 * {@code workersInLaunch}, is persisted in its launched form, and the acceptance is
 * forwarded to the scheduler driver.
 */
@Test
public void testAcceptOffers() throws Exception {
	new Context() {{
		startResourceManager();

		// start from one freshly allocated worker
		final MesosWorkerStore.Worker pendingWorker = allocateWorker(task1, resourceProfile1);

		// build the AcceptOffers message the LaunchCoordinator would send
		// to launch task1 onto slave1 with offer1
		final Protos.TaskInfo launchInfo = Protos.TaskInfo.newBuilder()
			.setTaskId(task1)
			.setName("")
			.setSlaveId(slave1)
			.build();
		final AcceptOffers acceptance =
			new AcceptOffers(slave1host, singletonList(offer1), singletonList(launch(launchInfo)));
		resourceManager.acceptOffers(acceptance);

		// persisted state and in-memory maps reflect the launch
		final MesosWorkerStore.Worker launchedWorker = pendingWorker.launchWorker(slave1, slave1host);
		verify(rmServices.workerStore).putWorker(launchedWorker);
		assertThat(resourceManager.workersInNew.entrySet(), empty());
		assertThat(resourceManager.workersInLaunch, hasEntry(extractResourceID(task1), launchedWorker));

		// the task router got the launched goal state and Mesos got the acceptance
		resourceManager.taskRouter.expectMsg(
			new TaskMonitor.TaskGoalStateUpdated(extractGoalState(launchedWorker)));
		verify(rmServices.schedulerDriver)
			.acceptOffers(acceptance.offerIds(), acceptance.operations(), acceptance.filters());
	}};
}
 
Example #12
Source File: MesosResourceManagerTest.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Tests the planned stop of a launched worker: the worker is persisted as released, moved to
 * {@code workersBeingReturned}, and the monitor and launch coordinator are informed.
 */
@Test
public void testStopWorker() throws Exception {
	new Context() {{
		// recover a single launched worker from the mocked persistent state
		final MesosWorkerStore.Worker launchedWorker =
			MesosWorkerStore.Worker.newWorker(task1).launchWorker(slave1, slave1host);
		when(rmServices.workerStore.getFrameworkID()).thenReturn(Option.apply(framework1));
		when(rmServices.workerStore.recoverWorkers()).thenReturn(singletonList(launchedWorker));
		startResourceManager();

		// consume the Assign message produced by the recovery
		resourceManager.launchCoordinator.expectMsgClass(LaunchCoordinator.Assign.class);

		// ask the RM to stop the worker
		resourceManager.stopWorker(new RegisteredMesosWorkerNode(launchedWorker));

		// the released form was persisted and the in-memory maps were updated
		final MesosWorkerStore.Worker releasedWorker = launchedWorker.releaseWorker();
		verify(rmServices.workerStore).putWorker(releasedWorker);
		assertThat(resourceManager.workersInLaunch.entrySet(), empty());
		assertThat(resourceManager.workersBeingReturned, hasEntry(extractResourceID(task1), releasedWorker));

		// the monitor and the launch coordinator were both notified
		resourceManager.taskRouter.expectMsgClass(TaskMonitor.TaskGoalStateUpdated.class);
		resourceManager.launchCoordinator.expectMsgClass(LaunchCoordinator.Unassign.class);
	}};
}
 
Example #13
Source File: MesosResourceManager.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Restores the in-memory worker maps from the workers found in persistent state.
 *
 * <p>Launched workers go to {@code workersInLaunch} and are reported to the launch
 * coordinator as prior assignments; released workers go to {@code workersBeingReturned}.
 * Workers in any other state are not tracked. The task monitor receives a goal state update
 * for every worker in the list.
 *
 * @param tasksFromPreviousAttempts the workers read from the worker store.
 * @see #getWorkersAsync()
 */
private void recoverWorkers(final List<MesosWorkerStore.Worker> tasksFromPreviousAttempts) {
	assert workersInNew.isEmpty();
	assert workersInLaunch.isEmpty();
	assert workersBeingReturned.isEmpty();

	if (tasksFromPreviousAttempts.isEmpty()) {
		return;
	}

	LOG.info("Retrieved {} TaskManagers from previous attempt", tasksFromPreviousAttempts.size());

	final List<Tuple2<TaskRequest, String>> priorAssignments = new ArrayList<>(tasksFromPreviousAttempts.size());

	for (final MesosWorkerStore.Worker recovered : tasksFromPreviousAttempts) {
		switch (recovered.state()) {
			case Launched:
				workersInLaunch.put(extractResourceID(recovered.taskID()), recovered);
				final LaunchableMesosWorker launchable = createLaunchableMesosWorker(recovered.taskID());
				priorAssignments.add(new Tuple2<>(launchable.taskRequest(), recovered.hostname().get()));
				break;
			case Released:
				workersBeingReturned.put(extractResourceID(recovered.taskID()), recovered);
				break;
			default:
				// other states are not restored into any map
				break;
		}
		taskMonitor.tell(new TaskMonitor.TaskGoalStateUpdated(extractGoalState(recovered)), selfActor);
	}

	// let the launch coordinator know which tasks already sit on which host
	if (!priorAssignments.isEmpty()) {
		launchCoordinator.tell(new LaunchCoordinator.Assign(priorAssignments), selfActor);
	}
}
 
Example #14
Source File: MesosResourceManager.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Persists a new worker, tracks it in {@code workersInNew} and asks the task monitor and the
 * launch coordinator to act on it.
 *
 * @param workerResourceSpec the requested spec; it must equal the spec derived from the
 *        configured task executor process spec, otherwise {@code checkArgument} rejects it.
 * @return {@code true} on success, {@code false} after a failure was reported through
 *         {@code onFatalError}.
 */
@Override
public boolean startNewWorker(WorkerResourceSpec workerResourceSpec) {
	final WorkerResourceSpec configuredSpec = WorkerResourceSpec.fromTaskExecutorProcessSpec(
		taskManagerParameters.containeredParameters().getTaskExecutorProcessSpec());
	Preconditions.checkArgument(Objects.equals(workerResourceSpec, configuredSpec));

	LOG.info("Starting a new worker.");
	try {
		// write the worker to the store before tracking it in memory
		final MesosWorkerStore.Worker pendingWorker =
			MesosWorkerStore.Worker.newWorker(workerStore.newTaskID(), workerResourceSpec);
		workerStore.putWorker(pendingWorker);
		workersInNew.put(extractResourceID(pendingWorker.taskID()), pendingWorker);

		final LaunchableMesosWorker launchableWorker = createLaunchableMesosWorker(pendingWorker.taskID());

		LOG.info(
			"Scheduling Mesos task {} with ({} MB, {} cpus, {} gpus, {} disk MB, {} Mbps).",
			launchableWorker.taskID().getValue(),
			launchableWorker.taskRequest().getMemory(),
			launchableWorker.taskRequest().getCPUs(),
			launchableWorker.taskRequest().getScalarRequests().get("gpus"),
			launchableWorker.taskRequest().getDisk(),
			launchableWorker.taskRequest().getNetworkMbps());

		// announce the new goal state to the task monitor
		taskMonitor.tell(new TaskMonitor.TaskGoalStateUpdated(extractGoalState(pendingWorker)), selfActor);

		// hand the launchable task to the launch coordinator
		launchCoordinator.tell(new LaunchCoordinator.Launch(Collections.singletonList(launchableWorker)), selfActor);

		return true;
	} catch (Exception ex) {
		onFatalError(new ResourceManagerException("Unable to request new workers.", ex));
		return false;
	}
}
 
Example #15
Source File: MesosResourceManager.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Releases a launched worker: its released state is persisted, it moves from
 * {@code workersInLaunch} to {@code workersBeingReturned}, and the task monitor and launch
 * coordinator are informed. Workers already being returned or unknown are only logged.
 *
 * @param workerNode the registered worker to stop.
 * @return always {@code true}, including when a failure was reported through
 *         {@code onFatalError}.
 */
@Override
public boolean stopWorker(RegisteredMesosWorkerNode workerNode) {
	LOG.info("Stopping worker {}.", workerNode.getResourceID());
	try {
		if (workersInLaunch.containsKey(workerNode.getResourceID())) {
			// persist the Released state, then track the worker as being returned
			final MesosWorkerStore.Worker released =
				workersInLaunch.remove(workerNode.getResourceID()).releaseWorker();
			workerStore.putWorker(released);
			workersBeingReturned.put(extractResourceID(released.taskID()), released);

			taskMonitor.tell(new TaskMonitor.TaskGoalStateUpdated(extractGoalState(released)), selfActor);

			if (released.hostname().isDefined()) {
				// the launch coordinator tracks host assignments for planning purposes
				launchCoordinator.tell(
					new LaunchCoordinator.Unassign(released.taskID(), released.hostname().get()),
					selfActor);
			}
		} else if (workersBeingReturned.containsKey(workerNode.getResourceID())) {
			LOG.info("Ignoring request to stop worker {} because it is already being stopped.", workerNode.getResourceID());
		} else {
			LOG.warn("Unrecognized worker {}.", workerNode.getResourceID());
		}
	} catch (Exception e) {
		onFatalError(new ResourceManagerException("Unable to release a worker.", e));
	}

	return true;
}
 
Example #16
Source File: MesosResourceManager.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Accepts offers as advised by the launch coordinator.
 *
 * <p>Acceptance is routed through the RM so that each launched task is persisted and moved
 * from {@code workersInNew} to {@code workersInLaunch} before the message is forwarded to
 * Mesos. The task monitor is told about the new goal states only after all tasks in the
 * message were persisted.
 *
 * @param msg the acceptance message holding the offers, operations, filters and hostname.
 */
public void acceptOffers(AcceptOffers msg) {
	try {
		final List<TaskMonitor.TaskGoalStateUpdated> goalUpdates = new ArrayList<>(msg.operations().size());

		// move every task of every LAUNCH operation to the Launched state
		for (Protos.Offer.Operation operation : msg.operations()) {
			if (operation.getType() != Protos.Offer.Operation.Type.LAUNCH) {
				continue;
			}

			for (Protos.TaskInfo taskInfo : operation.getLaunch().getTaskInfosList()) {
				final MesosWorkerStore.Worker pending = workersInNew.remove(extractResourceID(taskInfo.getTaskId()));
				assert pending != null;

				final MesosWorkerStore.Worker launched = pending.launchWorker(taskInfo.getSlaveId(), msg.hostname());
				workerStore.putWorker(launched);
				workersInLaunch.put(extractResourceID(launched.taskID()), launched);

				LOG.info("Launching Mesos task {} on host {}.",
					launched.taskID().getValue(), launched.hostname().get());

				goalUpdates.add(new TaskMonitor.TaskGoalStateUpdated(extractGoalState(launched)));
			}
		}

		// notify the task monitor once all state transitions went through
		for (TaskMonitor.TaskGoalStateUpdated goalUpdate : goalUpdates) {
			taskMonitor.tell(goalUpdate, selfActor);
		}

		// finally forward the acceptance to Mesos
		schedulerDriver.acceptOffers(msg.offerIds(), msg.operations(), msg.filters());
	} catch (Exception ex) {
		onFatalError(new ResourceManagerException("unable to accept offers", ex));
	}
}
 
Example #17
Source File: MesosResourceManager.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Dispatches an incoming message to its handler via {@code runAsync}.
 *
 * <p>Reconcile, task-terminated and accept-offers messages are handled; anything else is
 * logged as an error and dropped.
 *
 * @param message the received message.
 */
@Override
public void onReceive(final Object message) throws Exception {
	if (message instanceof ReconciliationCoordinator.Reconcile) {
		runAsync(() -> reconcile((ReconciliationCoordinator.Reconcile) message));
		return;
	}

	if (message instanceof TaskMonitor.TaskTerminated) {
		runAsync(() -> taskTerminated((TaskMonitor.TaskTerminated) message));
		return;
	}

	if (message instanceof AcceptOffers) {
		runAsync(() -> acceptOffers((AcceptOffers) message));
		return;
	}

	MesosResourceManager.LOG.error("unrecognized message: " + message);
}
 
Example #18
Source File: MesosResourceManager.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Maps the persisted state of a worker to the matching Mesos task goal state.
 *
 * @param worker the persistent worker information.
 * @return goal state information for the {@link TaskMonitor}.
 * @throws IllegalArgumentException if the worker is in a state other than New, Launched or Released.
 */
static TaskMonitor.TaskGoalState extractGoalState(MesosWorkerStore.Worker worker) {
	final TaskMonitor.TaskGoalState goalState;
	switch (worker.state()) {
		case New:
			goalState = new TaskMonitor.New(worker.taskID());
			break;
		case Launched:
			goalState = new TaskMonitor.Launched(worker.taskID(), worker.slaveID().get());
			break;
		case Released:
			goalState = new TaskMonitor.Released(worker.taskID(), worker.slaveID().get());
			break;
		default:
			throw new IllegalArgumentException("unsupported worker state");
	}
	return goalState;
}
 
Example #19
Source File: MesosResourceManager.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Dispatches an incoming message to its handler via {@code runAsync}.
 *
 * <p>Reconcile, task-terminated and accept-offers messages are handled; anything else is
 * logged as an error and dropped.
 *
 * @param message the received message.
 */
@Override
public void onReceive(final Object message) throws Exception {
	if (message instanceof ReconciliationCoordinator.Reconcile) {
		runAsync(() -> reconcile((ReconciliationCoordinator.Reconcile) message));
		return;
	}

	if (message instanceof TaskMonitor.TaskTerminated) {
		runAsync(() -> taskTerminated((TaskMonitor.TaskTerminated) message));
		return;
	}

	if (message instanceof AcceptOffers) {
		runAsync(() -> acceptOffers((AcceptOffers) message));
		return;
	}

	MesosResourceManager.LOG.error("unrecognized message: " + message);
}
 
Example #20
Source File: MesosResourceManagerTest.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Tests recovery of persisted workers: launched and released workers are restored into the
 * matching maps, "new" workers are not tracked, and the launched worker is re-assigned.
 */
@Test
public void testRecoverWorkers() throws Exception {
	new Context() {{
		// one worker per persisted state: new, launched, released
		final MesosWorkerStore.Worker pendingWorker =
			MesosWorkerStore.Worker.newWorker(task1, workerResourceSpec);
		final MesosWorkerStore.Worker launchedWorker =
			MesosWorkerStore.Worker.newWorker(task2, workerResourceSpec).launchWorker(slave1, slave1host);
		final MesosWorkerStore.Worker releasedWorker =
			MesosWorkerStore.Worker.newWorker(task3, workerResourceSpec).launchWorker(slave1, slave1host).releaseWorker();
		when(rmServices.workerStore.getFrameworkID()).thenReturn(Option.apply(framework1));
		when(rmServices.workerStore.recoverWorkers())
			.thenReturn(Arrays.asList(pendingWorker, launchedWorker, releasedWorker));
		startResourceManager();

		// internal state: the "new" worker is dropped, the other two are restored
		assertThat(resourceManager.workersInNew.entrySet(), empty());
		assertThat(resourceManager.workersInLaunch, hasEntry(extractResourceID(task2), launchedWorker));
		assertThat(resourceManager.workersBeingReturned, hasEntry(extractResourceID(task3), releasedWorker));

		// the task router was notified
		resourceManager.taskRouter.expectMsgClass(TaskMonitor.TaskGoalStateUpdated.class);

		// exactly one assignment (task2 on slave1host) reached the launch coordinator
		LaunchCoordinator.Assign assignMsg =
			resourceManager.launchCoordinator.expectMsgClass(LaunchCoordinator.Assign.class);
		assertThat(assignMsg.tasks(), hasSize(1));
		assertThat(assignMsg.tasks().get(0).f0.getId(), equalTo(task2.getValue()));
		assertThat(assignMsg.tasks().get(0).f1, equalTo(slave1host));
		resourceManager.launchCoordinator.expectNoMsg();
	}};
}
 
Example #21
Source File: MesosResourceManagerTest.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Tests a request for a new worker: the worker is persisted, tracked in {@code workersInNew},
 * and both the task router and the launch coordinator are notified.
 */
@Test
public void testRequestNewWorkers() throws Exception {
	new Context() {{
		startResourceManager();

		final long timeoutMillis = timeout.toMilliseconds();

		// the store hands out task1 once; any further ID request fails the test
		when(rmServices.workerStore.newTaskID()).thenReturn(task1).thenThrow(new AssertionFailedError());
		rmServices.slotManagerStarted.get(timeoutMillis, TimeUnit.MILLISECONDS);

		CompletableFuture<Void> pendingAllocation = resourceManager.callAsync(
			() -> {
				rmServices.rmActions.allocateResource(workerResourceSpec);
				return null;
			},
			timeout);

		// surfaces any exception thrown by the allocation call
		pendingAllocation.get(timeoutMillis, TimeUnit.MILLISECONDS);

		// the new worker was persisted and is tracked as "new"
		MesosWorkerStore.Worker expectedWorker = MesosWorkerStore.Worker.newWorker(task1, workerResourceSpec);
		verify(rmServices.workerStore, Mockito.timeout(timeoutMillis)).putWorker(expectedWorker);
		assertThat(resourceManager.workersInNew, hasEntry(extractResourceID(task1), expectedWorker));

		// the task router and the launch coordinator both received their message
		resourceManager.taskRouter.expectMsgClass(TaskMonitor.TaskGoalStateUpdated.class);
		resourceManager.launchCoordinator.expectMsgClass(LaunchCoordinator.Launch.class);
	}};
}
 
Example #22
Source File: MesosResourceManagerTest.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Tests offer acceptance: the worker moves from {@code workersInNew} to
 * {@code workersInLaunch}, is persisted in its launched form, and the acceptance is
 * forwarded to the scheduler driver.
 */
@Test
public void testAcceptOffers() throws Exception {
	new Context() {{
		startResourceManager();

		// start from one freshly allocated worker
		final MesosWorkerStore.Worker pendingWorker = allocateWorker(task1, workerResourceSpec);

		// build the AcceptOffers message the LaunchCoordinator would send
		// to launch task1 onto slave1 with offer1
		final Protos.TaskInfo launchInfo = Protos.TaskInfo.newBuilder()
			.setTaskId(task1)
			.setName("")
			.setSlaveId(slave1)
			.build();
		final AcceptOffers acceptance =
			new AcceptOffers(slave1host, singletonList(offer1), singletonList(launch(launchInfo)));
		resourceManager.acceptOffers(acceptance);

		// persisted state and in-memory maps reflect the launch
		final MesosWorkerStore.Worker launchedWorker = pendingWorker.launchWorker(slave1, slave1host);
		verify(rmServices.workerStore).putWorker(launchedWorker);
		assertThat(resourceManager.workersInNew.entrySet(), empty());
		assertThat(resourceManager.workersInLaunch, hasEntry(extractResourceID(task1), launchedWorker));

		// the task router got the launched goal state and Mesos got the acceptance
		resourceManager.taskRouter.expectMsg(
			new TaskMonitor.TaskGoalStateUpdated(extractGoalState(launchedWorker)));
		verify(rmServices.schedulerDriver)
			.acceptOffers(acceptance.offerIds(), acceptance.operations(), acceptance.filters());
	}};
}
 
Example #23
Source File: MesosResourceManagerTest.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Tests the planned stop of a launched worker: the worker is persisted as released, moved to
 * {@code workersBeingReturned}, and the monitor and launch coordinator are informed.
 */
@Test
public void testStopWorker() throws Exception {
	new Context() {{
		// recover a single launched worker from the mocked persistent state
		final MesosWorkerStore.Worker launchedWorker =
			MesosWorkerStore.Worker.newWorker(task1, workerResourceSpec).launchWorker(slave1, slave1host);
		when(rmServices.workerStore.getFrameworkID()).thenReturn(Option.apply(framework1));
		when(rmServices.workerStore.recoverWorkers()).thenReturn(singletonList(launchedWorker));
		startResourceManager();

		// consume the Assign message produced by the recovery
		resourceManager.launchCoordinator.expectMsgClass(LaunchCoordinator.Assign.class);

		// ask the RM to stop the worker
		resourceManager.stopWorker(new RegisteredMesosWorkerNode(launchedWorker));

		// the released form was persisted and the in-memory maps were updated
		final MesosWorkerStore.Worker releasedWorker = launchedWorker.releaseWorker();
		verify(rmServices.workerStore).putWorker(releasedWorker);
		assertThat(resourceManager.workersInLaunch.entrySet(), empty());
		assertThat(resourceManager.workersBeingReturned, hasEntry(extractResourceID(task1), releasedWorker));

		// the monitor and the launch coordinator were both notified
		resourceManager.taskRouter.expectMsgClass(TaskMonitor.TaskGoalStateUpdated.class);
		resourceManager.launchCoordinator.expectMsgClass(LaunchCoordinator.Unassign.class);
	}};
}
 
Example #24
Source File: MesosResourceManagerTest.java    From Flink-CEPplus with Apache License 2.0 5 votes vote down vote up
/**
 * Tests offer acceptance: the worker moves from {@code workersInNew} to
 * {@code workersInLaunch}, is persisted in its launched form, and the acceptance is
 * forwarded to the scheduler driver.
 */
@Test
public void testAcceptOffers() throws Exception {
	new Context() {{
		startResourceManager();

		// start from one freshly allocated worker
		final MesosWorkerStore.Worker pendingWorker = allocateWorker(task1, resourceProfile1);

		// build the AcceptOffers message the LaunchCoordinator would send
		// to launch task1 onto slave1 with offer1
		final Protos.TaskInfo launchInfo = Protos.TaskInfo.newBuilder()
			.setTaskId(task1)
			.setName("")
			.setSlaveId(slave1)
			.build();
		final AcceptOffers acceptance =
			new AcceptOffers(slave1host, singletonList(offer1), singletonList(launch(launchInfo)));
		resourceManager.acceptOffers(acceptance);

		// persisted state and in-memory maps reflect the launch
		final MesosWorkerStore.Worker launchedWorker = pendingWorker.launchWorker(slave1, slave1host);
		verify(rmServices.workerStore).putWorker(launchedWorker);
		assertThat(resourceManager.workersInNew.entrySet(), empty());
		assertThat(resourceManager.workersInLaunch, hasEntry(extractResourceID(task1), launchedWorker));

		// the task router got the launched goal state and Mesos got the acceptance
		resourceManager.taskRouter.expectMsg(
			new TaskMonitor.TaskGoalStateUpdated(extractGoalState(launchedWorker)));
		verify(rmServices.schedulerDriver)
			.acceptOffers(acceptance.offerIds(), acceptance.operations(), acceptance.filters());
	}};
}
 
Example #25
Source File: MesosResourceManager.java    From Flink-CEPplus with Apache License 2.0 5 votes vote down vote up
/**
 * Rebuilds the in-memory worker bookkeeping from workers of a previous attempt.
 *
 * <p>Launched workers are recorded in {@code workersInLaunch} and collected into a single
 * {@link LaunchCoordinator.Assign} message. Released workers are recorded in
 * {@code workersBeingReturned}. The task monitor is told the goal state of every
 * recovered worker, regardless of its state.
 *
 * @param tasksFromPreviousAttempts the workers retrieved from the previous attempt.
 * @see #getWorkersAsync()
 */
private void recoverWorkers(final List<MesosWorkerStore.Worker> tasksFromPreviousAttempts) {
	// recovery is expected to start from empty bookkeeping
	assert(workersInNew.isEmpty());
	assert(workersInLaunch.isEmpty());
	assert(workersBeingReturned.isEmpty());

	if (tasksFromPreviousAttempts.isEmpty()) {
		// nothing to recover
		return;
	}

	LOG.info("Retrieved {} TaskManagers from previous attempt", tasksFromPreviousAttempts.size());

	final List<Tuple2<TaskRequest, String>> priorAssignments =
		new ArrayList<>(tasksFromPreviousAttempts.size());

	for (final MesosWorkerStore.Worker recovered : tasksFromPreviousAttempts) {
		switch (recovered.state()) {
			case Launched:
				workersInLaunch.put(extractResourceID(recovered.taskID()), recovered);
				// remember which host the task was assigned to
				final LaunchableMesosWorker launchableWorker = createLaunchableMesosWorker(recovered.taskID());
				priorAssignments.add(
					new Tuple2<>(launchableWorker.taskRequest(), recovered.hostname().get()));
				break;
			case Released:
				workersBeingReturned.put(extractResourceID(recovered.taskID()), recovered);
				break;
		}

		// every recovered worker is reported to the task monitor, whatever its state
		taskMonitor.tell(new TaskMonitor.TaskGoalStateUpdated(extractGoalState(recovered)), selfActor);
	}

	// inform the launch coordinator about assignments made by the previous attempt
	if (!priorAssignments.isEmpty()) {
		launchCoordinator.tell(new LaunchCoordinator.Assign(priorAssignments), selfActor);
	}
}
 
Example #26
Source File: MesosResourceManager.java    From Flink-CEPplus with Apache License 2.0 5 votes vote down vote up
/**
 * Stops the given worker if it is currently tracked as launched.
 *
 * <p>A launched worker is moved to the released state, persisted in the worker store and
 * tracked in {@code workersBeingReturned}. The task monitor is told the new goal state and,
 * when the worker has a hostname, the launch coordinator is told about the unassignment.
 * A worker that is already being returned, or that is unknown, is only logged.
 * Any exception is reported through {@code onFatalError}.
 *
 * @param workerNode the worker to stop.
 * @return always {@code true}.
 */
@Override
public boolean stopWorker(RegisteredMesosWorkerNode workerNode) {
	final ResourceID resourceID = workerNode.getResourceID();
	LOG.info("Stopping worker {}.", resourceID);

	try {
		final MesosWorkerStore.Worker launched = workersInLaunch.remove(resourceID);

		if (launched != null) {
			// record the Released state in the worker store, then track the worker as being returned
			final MesosWorkerStore.Worker released = launched.releaseWorker();
			workerStore.putWorker(released);
			workersBeingReturned.put(extractResourceID(released.taskID()), released);

			taskMonitor.tell(new TaskMonitor.TaskGoalStateUpdated(extractGoalState(released)), selfActor);

			if (released.hostname().isDefined()) {
				// the launch coordinator needs to know the task is leaving its host, for planning purposes
				launchCoordinator.tell(
					new LaunchCoordinator.Unassign(released.taskID(), released.hostname().get()),
					selfActor);
			}
		} else if (workersBeingReturned.containsKey(resourceID)) {
			LOG.info("Ignoring request to stop worker {} because it is already being stopped.", resourceID);
		} else {
			LOG.warn("Unrecognized worker {}.", resourceID);
		}
	} catch (Exception e) {
		onFatalError(new ResourceManagerException("Unable to release a worker.", e));
	}

	return true;
}
 
Example #27
Source File: MesosResourceManager.java    From Flink-CEPplus with Apache License 2.0 5 votes vote down vote up
/**
 * Accepts offers as advised by the launch coordinator.
 *
 * <p>The acceptance passes through the RM so that the persistent state is updated before
 * the message is forwarded to Mesos. For every task in a LAUNCH operation, the worker is
 * moved from {@code workersInNew} to {@code workersInLaunch} and persisted as launched.
 * The task monitor is then told the new goal states, and finally the scheduler driver is
 * asked to accept the offers. Any exception is reported through {@code onFatalError}.
 *
 * @param msg the offers to accept together with the operations to perform on them.
 */
public void acceptOffers(AcceptOffers msg) {
	try {
		final List<TaskMonitor.TaskGoalStateUpdated> goalStateUpdates =
			new ArrayList<>(msg.operations().size());

		// move the affected workers to the Launched state, in the store and in memory
		for (Protos.Offer.Operation operation : msg.operations()) {
			if (operation.getType() != Protos.Offer.Operation.Type.LAUNCH) {
				// only launch operations affect worker state
				continue;
			}

			for (Protos.TaskInfo taskInfo : operation.getLaunch().getTaskInfosList()) {
				final MesosWorkerStore.Worker pending = workersInNew.remove(extractResourceID(taskInfo.getTaskId()));
				assert (pending != null);

				final MesosWorkerStore.Worker launched = pending.launchWorker(taskInfo.getSlaveId(), msg.hostname());
				workerStore.putWorker(launched);
				workersInLaunch.put(extractResourceID(launched.taskID()), launched);

				LOG.info("Launching Mesos task {} on host {}.",
					launched.taskID().getValue(), launched.hostname().get());

				goalStateUpdates.add(new TaskMonitor.TaskGoalStateUpdated(extractGoalState(launched)));
			}
		}

		// notify the task monitor only after all state transitions succeeded
		for (TaskMonitor.TaskGoalStateUpdated goalStateUpdate : goalStateUpdates) {
			taskMonitor.tell(goalStateUpdate, selfActor);
		}

		// finally hand the acceptance over to Mesos
		schedulerDriver.acceptOffers(msg.offerIds(), msg.operations(), msg.filters());
	} catch (Exception ex) {
		onFatalError(new ResourceManagerException("unable to accept offers", ex));
	}
}
 
Example #28
Source File: MesosResourceManager.java    From Flink-CEPplus with Apache License 2.0 5 votes vote down vote up
/**
 * Handles a termination notification from a task monitor.
 *
 * <p>The worker is first removed from the worker store. If the store did not know it, the
 * notification is only logged. Otherwise the worker is dropped from the in-memory
 * bookkeeping: a worker that was being returned is treated as a regular finish, while any
 * other worker is treated as a failure and a replacement is requested with the same
 * profile. In both cases the TaskManager connection is closed afterwards.
 *
 * @param message the termination notice carrying the task ID and its final status.
 */
public void taskTerminated(TaskMonitor.TaskTerminated message) {
	Protos.TaskID taskID = message.taskID();
	Protos.TaskStatus status = message.status();

	// note: this callback occurs for failed containers and for released containers alike
	final ResourceID id = extractResourceID(taskID);

	boolean existed;
	try {
		existed = workerStore.removeWorker(taskID);
	} catch (Exception ex) {
		onFatalError(new ResourceManagerException("unable to remove worker", ex));
		return;
	}

	if (!existed) {
		LOG.info("Received a termination notice for an unrecognized worker: {}", id);
		return;
	}

	// check if this is a failed task or a released task
	assert(!workersInNew.containsKey(id));
	if (workersBeingReturned.remove(id) != null) {
		// regular finished worker that we released
		LOG.info("Worker {} finished successfully with message: {}",
			id, status.getMessage());
	} else {
		// failed worker, either at startup, or running
		final MesosWorkerStore.Worker launched = workersInLaunch.remove(id);
		assert(launched != null);
		LOG.info("Worker {} failed with status: {}, reason: {}, message: {}.",
			id, status.getState(), status.getReason(), status.getMessage());

		// Assertions are disabled by default at runtime, so the assert above does not
		// protect the dereference below. Guard explicitly: without the launched worker
		// there is no profile to request a replacement with.
		if (launched != null) {
			startNewWorker(launched.profile());
		} else {
			LOG.warn("Worker {} was present in the worker store but was not tracked as launched " +
				"or being returned; no replacement worker is started.", id);
		}
	}

	closeTaskManagerConnection(id, new Exception(status.getMessage()));
}
 
Example #29
Source File: MesosResourceManager.java    From Flink-CEPplus with Apache License 2.0 5 votes vote down vote up
/**
 * Derives the Mesos task goal state from persisted worker information.
 *
 * <p>The slave ID is read only for launched and released workers; a new worker's goal
 * state carries the task ID alone.
 *
 * @param worker the persistent worker information.
 * @return goal state information for the {@link TaskMonitor}.
 * @throws IllegalArgumentException if the worker is in a state other than new, launched or released.
 */
static TaskMonitor.TaskGoalState extractGoalState(MesosWorkerStore.Worker worker) {
	final TaskMonitor.TaskGoalState goalState;

	switch (worker.state()) {
		case New:
			goalState = new TaskMonitor.New(worker.taskID());
			break;
		case Launched:
			goalState = new TaskMonitor.Launched(worker.taskID(), worker.slaveID().get());
			break;
		case Released:
			goalState = new TaskMonitor.Released(worker.taskID(), worker.slaveID().get());
			break;
		default:
			throw new IllegalArgumentException("unsupported worker state");
	}

	return goalState;
}
 
Example #30
Source File: MesosResourceManager.java    From Flink-CEPplus with Apache License 2.0 5 votes vote down vote up
/**
 * Dispatches an incoming message to the matching handler.
 *
 * <p>Reconcile, task-terminated and accept-offers messages are each passed to their handler
 * through {@code runAsync}. Any other message is logged as an error.
 *
 * @param message the received message.
 */
@Override
public void onReceive(final Object message) throws Exception {
	if (message instanceof ReconciliationCoordinator.Reconcile) {
		final ReconciliationCoordinator.Reconcile reconcileMessage =
			(ReconciliationCoordinator.Reconcile) message;
		runAsync(() -> reconcile(reconcileMessage));
		return;
	}

	if (message instanceof TaskMonitor.TaskTerminated) {
		final TaskMonitor.TaskTerminated terminatedMessage = (TaskMonitor.TaskTerminated) message;
		runAsync(() -> taskTerminated(terminatedMessage));
		return;
	}

	if (message instanceof AcceptOffers) {
		final AcceptOffers acceptMessage = (AcceptOffers) message;
		runAsync(() -> acceptOffers(acceptMessage));
		return;
	}

	// none of the known message types matched
	MesosResourceManager.LOG.error("unrecognized message: " + message);
}