org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint Java Examples

Source File: FailoverRegionTest.java From flink with Apache License 2.0

6 votes

/**
 * Makes the checkpoint coordinator receive acknowledgements on behalf of the given execution
 * vertices so that the expected checkpoint can complete.
 *
 * <p>For each vertex taken from the iterator, one acknowledge message is sent per parallel
 * subtask of that vertex's job vertex. Every message carries a newly generated operator
 * state handle wrapped in a {@link TaskStateSnapshot}.
 *
 * @param checkpointCoordinator coordinator that receives the acknowledge messages
 * @param executionVertexes vertices to acknowledge for; the iterator is consumed completely
 * @throws IOException propagated from the calls made while building the state
 * @throws CheckpointException propagated from the coordinator while handling an acknowledgement
 */
private void acknowledgeAllCheckpoints(CheckpointCoordinator checkpointCoordinator, Iterator<ExecutionVertex> executionVertexes) throws IOException, CheckpointException {
	while (executionVertexes.hasNext()) {
		final ExecutionVertex vertex = executionVertexes.next();

		int subtask = 0;
		while (subtask < vertex.getJobVertex().getParallelism()) {
			final JobVertexID vertexId = vertex.getJobvertexId();

			// build a snapshot that holds a single operator state for this subtask
			final OperatorStateHandle operatorState =
				CheckpointCoordinatorTest.generatePartitionableStateHandle(vertexId, subtask, 2, 8, false);
			final OperatorSubtaskState subtaskState =
				new OperatorSubtaskState(operatorState, null, null, null);
			final TaskStateSnapshot snapshot = new TaskStateSnapshot();
			snapshot.putSubtaskStateByOperatorID(OperatorID.fromJobVertexID(vertexId), subtaskState);

			// checkpointId is not a parameter of this method; it is taken from the enclosing scope
			checkpointCoordinator.receiveAcknowledgeMessage(
				new AcknowledgeCheckpoint(
					vertex.getJobId(),
					vertex.getJobVertex().getTaskVertices()[subtask].getCurrentExecutionAttempt().getAttemptId(),
					checkpointId,
					new CheckpointMetrics(),
					snapshot),
				"Unknown location");

			subtask++;
		}
	}
}

Source File: ActorGatewayCheckpointResponder.java From Flink-CEPplus with Apache License 2.0

5 votes

/**
 * Wraps the given arguments into an {@link AcknowledgeCheckpoint} message and hands it to
 * the actor gateway via {@code tell}.
 *
 * @param jobID job the checkpoint belongs to
 * @param executionAttemptID execution attempt that acknowledges the checkpoint
 * @param checkpointId id of the acknowledged checkpoint
 * @param checkpointMetrics metrics reported together with the acknowledgement
 * @param checkpointStateHandles state snapshot attached to the acknowledgement
 */
@Override
public void acknowledgeCheckpoint(
		JobID jobID,
		ExecutionAttemptID executionAttemptID,
		long checkpointId,
		CheckpointMetrics checkpointMetrics,
		TaskStateSnapshot checkpointStateHandles) {

	actorGateway.tell(
		new AcknowledgeCheckpoint(
			jobID,
			executionAttemptID,
			checkpointId,
			checkpointMetrics,
			checkpointStateHandles));
}

Source File: SchedulerTestingUtils.java From flink with Apache License 2.0

5 votes

/**
 * Acknowledges the given checkpoint at the scheduler's checkpoint coordinator once for every
 * current execution attempt of the scheduler.
 *
 * @param scheduler scheduler whose coordinator and execution attempts are used
 * @param checkpointId id of the checkpoint to acknowledge
 * @throws CheckpointException propagated from the coordinator while handling an acknowledgement
 */
public static void acknowledgePendingCheckpoint(final DefaultScheduler scheduler, final long checkpointId) throws CheckpointException {
	final CheckpointCoordinator coordinator = getCheckpointCoordinator(scheduler);
	final JobID jobId = scheduler.getJobId();

	for (ExecutionAttemptID currentAttempt : getAllCurrentExecutionAttempts(scheduler)) {
		coordinator.receiveAcknowledgeMessage(
			new AcknowledgeCheckpoint(jobId, currentAttempt, checkpointId),
			"Unknown location");
	}
}

Source File: CheckpointMessagesTest.java From flink with Apache License 2.0

5 votes

/**
 * Runs the serializability/equals/hashCode check helper on two {@link AcknowledgeCheckpoint}
 * messages: one without state and one carrying a populated {@link TaskStateSnapshot}.
 *
 * <p>Unexpected exceptions propagate to the test runner instead of being converted with
 * {@code fail(e.getMessage())}, so the failure keeps its cause and full stack trace.
 *
 * @throws Exception if building the state or the check helper fails
 */
@Test
public void testConfirmTaskCheckpointed() throws Exception {
	final Random rnd = new Random();

	AcknowledgeCheckpoint noState = new AcknowledgeCheckpoint(
			new JobID(), new ExecutionAttemptID(), 569345L);

	KeyGroupRange keyGroupRange = KeyGroupRange.of(42, 42);

	// snapshot with one operator entry; the second and fourth constructor arguments stay null
	TaskStateSnapshot checkpointStateHandles = new TaskStateSnapshot();
	checkpointStateHandles.putSubtaskStateByOperatorID(
		new OperatorID(),
		new OperatorSubtaskState(
			CheckpointCoordinatorTestingUtils.generatePartitionableStateHandle(new JobVertexID(), 0, 2, 8, false),
			null,
			CheckpointCoordinatorTestingUtils.generateKeyGroupState(keyGroupRange, Collections.singletonList(new MyHandle())),
			null,
			singleton(createNewInputChannelStateHandle(10, rnd)),
			singleton(createNewResultSubpartitionStateHandle(10, rnd))
		)
	);

	AcknowledgeCheckpoint withState = new AcknowledgeCheckpoint(
			new JobID(),
			new ExecutionAttemptID(),
			87658976143L,
			new CheckpointMetrics(),
			checkpointStateHandles);

	testSerializabilityEqualsHashCode(noState);
	testSerializabilityEqualsHashCode(withState);
}

Source File: SchedulerBase.java From flink with Apache License 2.0

5 votes

/**
 * Forwards a checkpoint acknowledgement to the execution graph's checkpoint coordinator.
 *
 * <p>Must be called from the main thread (asserted first). The coordinator call itself is
 * handed to {@code ioExecutor}; any {@link Throwable} it raises is logged as a warning. If
 * the execution graph has no coordinator, the situation is only logged: at error level while
 * the job is {@code RUNNING}, at debug level otherwise.
 */
@Override
public void acknowledgeCheckpoint(final JobID jobID, final ExecutionAttemptID executionAttemptID, final long checkpointId, final CheckpointMetrics checkpointMetrics, final TaskStateSnapshot checkpointState) {
	mainThreadExecutor.assertRunningInMainThread();

	final CheckpointCoordinator coordinator = executionGraph.getCheckpointCoordinator();
	final AcknowledgeCheckpoint acknowledgement = new AcknowledgeCheckpoint(
		jobID,
		executionAttemptID,
		checkpointId,
		checkpointMetrics,
		checkpointState);

	// resolved before the null check, exactly as before, so the lookup always happens
	final String locationInfo = retrieveTaskManagerLocation(executionAttemptID);

	if (coordinator == null) {
		final String errorMessage = "Received AcknowledgeCheckpoint message for job {} with no CheckpointCoordinator";
		if (executionGraph.getState() == JobStatus.RUNNING) {
			log.error(errorMessage, jobGraph.getJobID());
		} else {
			log.debug(errorMessage, jobGraph.getJobID());
		}
		return;
	}

	ioExecutor.execute(() -> {
		try {
			coordinator.receiveAcknowledgeMessage(acknowledgement, locationInfo);
		} catch (Throwable t) {
			log.warn("Error while processing checkpoint acknowledgement message", t);
		}
	});
}

Source File: AdaptedRestartPipelinedRegionStrategyNGAbortPendingCheckpointsTest.java From flink with Apache License 2.0

5 votes

/**
 * Checks that a pending checkpoint, already acknowledged by one vertex, is gone after that
 * vertex fails and the scheduled tasks of the manual main-thread executor have been run.
 */
@Test
public void abortPendingCheckpointsWhenRestartingTasks() throws Exception {
	final JobGraph streamingJobGraph = createStreamingJobGraph();
	final ExecutionGraph graph = createExecutionGraph(streamingJobGraph);

	final Iterator<ExecutionVertex> vertices = graph.getAllExecutionVertices().iterator();
	final ExecutionVertex ackingVertex = vertices.next();
	final ExecutionVertex otherVertex = vertices.next();

	setTasksRunning(graph, ackingVertex, otherVertex);

	final CheckpointCoordinator coordinator = graph.getCheckpointCoordinator();
	checkState(coordinator != null);

	coordinator.triggerCheckpoint(System.currentTimeMillis(), false);
	assertEquals(1, coordinator.getNumberOfPendingCheckpoints());
	final long pendingCheckpointId = coordinator.getPendingCheckpoints().keySet().iterator().next();

	// the first vertex acknowledges the checkpoint and is failed afterwards;
	// the failover strategy is then expected to cancel all pending checkpoints on restart
	coordinator.receiveAcknowledgeMessage(
		new AcknowledgeCheckpoint(
			streamingJobGraph.getJobID(),
			ackingVertex.getCurrentExecutionAttempt().getAttemptId(),
			pendingCheckpointId),
		"Unknown location");
	assertEquals(1, coordinator.getNumberOfPendingCheckpoints());

	// failing the vertex alone does not yet remove the pending checkpoint ...
	failVertex(ackingVertex);
	assertEquals(1, coordinator.getNumberOfPendingCheckpoints());

	// ... it is gone only after the scheduled main-thread tasks have run
	manualMainThreadExecutor.triggerScheduledTasks();

	assertNoPendingCheckpoints(coordinator);
}

Source File: CheckpointMessagesTest.java From flink with Apache License 2.0

5 votes

/**
 * Runs the serializability/equals/hashCode check helper on two {@link AcknowledgeCheckpoint}
 * messages: one without state and one carrying a populated {@link TaskStateSnapshot}.
 *
 * <p>Unexpected exceptions propagate to the test runner instead of being converted with
 * {@code fail(e.getMessage())}, so the failure keeps its cause and full stack trace.
 *
 * @throws Exception if building the state or the check helper fails
 */
@Test
public void testConfirmTaskCheckpointed() throws Exception {
	AcknowledgeCheckpoint noState = new AcknowledgeCheckpoint(
			new JobID(), new ExecutionAttemptID(), 569345L);

	KeyGroupRange keyGroupRange = KeyGroupRange.of(42, 42);

	// snapshot with one operator entry; the second and fourth constructor arguments stay null
	TaskStateSnapshot checkpointStateHandles = new TaskStateSnapshot();
	checkpointStateHandles.putSubtaskStateByOperatorID(
		new OperatorID(),
		new OperatorSubtaskState(
			CheckpointCoordinatorTest.generatePartitionableStateHandle(new JobVertexID(), 0, 2, 8, false),
			null,
			CheckpointCoordinatorTest.generateKeyGroupState(keyGroupRange, Collections.singletonList(new MyHandle())),
			null
		)
	);

	AcknowledgeCheckpoint withState = new AcknowledgeCheckpoint(
			new JobID(),
			new ExecutionAttemptID(),
			87658976143L,
			new CheckpointMetrics(),
			checkpointStateHandles);

	testSerializabilityEqualsHashCode(noState);
	testSerializabilityEqualsHashCode(withState);
}

Source File: LegacyScheduler.java From flink with Apache License 2.0

5 votes

/**
 * Forwards a checkpoint acknowledgement to the execution graph's checkpoint coordinator.
 *
 * <p>Must be called from the main thread (asserted first). The coordinator call itself is
 * handed to {@code ioExecutor}; any {@link Throwable} it raises is logged as a warning. If
 * the execution graph has no coordinator, the situation is only logged: at error level while
 * the job is {@code RUNNING}, at debug level otherwise.
 */
@Override
public void acknowledgeCheckpoint(final JobID jobID, final ExecutionAttemptID executionAttemptID, final long checkpointId, final CheckpointMetrics checkpointMetrics, final TaskStateSnapshot checkpointState) {
	mainThreadExecutor.assertRunningInMainThread();

	final CheckpointCoordinator coordinator = executionGraph.getCheckpointCoordinator();
	final AcknowledgeCheckpoint acknowledgement = new AcknowledgeCheckpoint(
		jobID,
		executionAttemptID,
		checkpointId,
		checkpointMetrics,
		checkpointState);

	// resolved before the null check, exactly as before, so the lookup always happens
	final String locationInfo = retrieveTaskManagerLocation(executionAttemptID);

	if (coordinator == null) {
		final String errorMessage = "Received AcknowledgeCheckpoint message for job {} with no CheckpointCoordinator";
		if (executionGraph.getState() == JobStatus.RUNNING) {
			log.error(errorMessage, jobGraph.getJobID());
		} else {
			log.debug(errorMessage, jobGraph.getJobID());
		}
		return;
	}

	ioExecutor.execute(() -> {
		try {
			coordinator.receiveAcknowledgeMessage(acknowledgement, locationInfo);
		} catch (Throwable t) {
			log.warn("Error while processing checkpoint acknowledgement message", t);
		}
	});
}

Source File: CheckpointMessagesTest.java From Flink-CEPplus with Apache License 2.0

5 votes

/**
 * Runs the serializability/equals/hashCode check helper on two {@link AcknowledgeCheckpoint}
 * messages: one without state and one carrying a populated {@link TaskStateSnapshot}.
 *
 * <p>Unexpected exceptions propagate to the test runner instead of being converted with
 * {@code fail(e.getMessage())}, so the failure keeps its cause and full stack trace.
 *
 * @throws Exception if building the state or the check helper fails
 */
@Test
public void testConfirmTaskCheckpointed() throws Exception {
	AcknowledgeCheckpoint noState = new AcknowledgeCheckpoint(
			new JobID(), new ExecutionAttemptID(), 569345L);

	KeyGroupRange keyGroupRange = KeyGroupRange.of(42, 42);

	// snapshot with one operator entry; the second and fourth constructor arguments stay null
	TaskStateSnapshot checkpointStateHandles = new TaskStateSnapshot();
	checkpointStateHandles.putSubtaskStateByOperatorID(
		new OperatorID(),
		new OperatorSubtaskState(
			CheckpointCoordinatorTest.generatePartitionableStateHandle(new JobVertexID(), 0, 2, 8, false),
			null,
			CheckpointCoordinatorTest.generateKeyGroupState(keyGroupRange, Collections.singletonList(new MyHandle())),
			null
		)
	);

	AcknowledgeCheckpoint withState = new AcknowledgeCheckpoint(
			new JobID(),
			new ExecutionAttemptID(),
			87658976143L,
			new CheckpointMetrics(),
			checkpointStateHandles);

	testSerializabilityEqualsHashCode(noState);
	testSerializabilityEqualsHashCode(withState);
}

Source File: JobMaster.java From Flink-CEPplus with Apache License 2.0

5 votes

/**
 * Forwards a checkpoint acknowledgement to the execution graph's checkpoint coordinator.
 *
 * <p>The coordinator call is submitted through {@code getRpcService().execute(...)}; any
 * {@link Throwable} it raises is logged as a warning. If the execution graph has no
 * coordinator, the situation is only logged: at error level while the job is
 * {@code RUNNING}, at debug level otherwise.
 */
@Override
public void acknowledgeCheckpoint(
		final JobID jobID,
		final ExecutionAttemptID executionAttemptID,
		final long checkpointId,
		final CheckpointMetrics checkpointMetrics,
		final TaskStateSnapshot checkpointState) {

	final CheckpointCoordinator coordinator = executionGraph.getCheckpointCoordinator();
	final AcknowledgeCheckpoint acknowledgement = new AcknowledgeCheckpoint(
		jobID,
		executionAttemptID,
		checkpointId,
		checkpointMetrics,
		checkpointState);

	if (coordinator == null) {
		final String errorMessage = "Received AcknowledgeCheckpoint message for job {} with no CheckpointCoordinator";
		if (executionGraph.getState() == JobStatus.RUNNING) {
			log.error(errorMessage, jobGraph.getJobID());
		} else {
			log.debug(errorMessage, jobGraph.getJobID());
		}
		return;
	}

	getRpcService().execute(() -> {
		try {
			coordinator.receiveAcknowledgeMessage(acknowledgement);
		} catch (Throwable t) {
			log.warn("Error while processing checkpoint acknowledgement message", t);
		}
	});
}

Source File: CheckpointCoordinatorTest.java From Flink-CEPplus with Apache License 2.0

4 votes

/**
 * Triggers a savepoint and two checkpoints. The second checkpoint completes
 * and subsumes the first checkpoint, but not the first savepoint. Then we
 * trigger another checkpoint and savepoint. The 2nd savepoint completes and
 * subsumes the last checkpoint, but not the first savepoint.
 *
 * <p>Finally the first savepoint is acknowledged as well, after which no pending
 * checkpoint remains and three completed checkpoints are retained.
 */
@Test
public void testSavepointsAreNotSubsumed() throws Exception {
	final JobID jid = new JobID();
	final long timestamp = System.currentTimeMillis();

	// create some mock Execution vertices that receive the checkpoint trigger messages
	final ExecutionAttemptID attemptID1 = new ExecutionAttemptID();
	final ExecutionAttemptID attemptID2 = new ExecutionAttemptID();
	ExecutionVertex vertex1 = mockExecutionVertex(attemptID1);
	ExecutionVertex vertex2 = mockExecutionVertex(attemptID2);

	// the counter is kept in a local so the ids of triggered checkpoints can be read via getLast()
	StandaloneCheckpointIDCounter counter = new StandaloneCheckpointIDCounter();

	// set up the coordinator and validate the initial state
	// NOTE(review): the three identical vertex arrays are presumably the tasks to trigger,
	// to wait for, and to commit to - confirm against the constructor signature
	CheckpointCoordinator coord = new CheckpointCoordinator(
		jid,
		600000,
		600000,
		0,
		Integer.MAX_VALUE,
		CheckpointRetentionPolicy.NEVER_RETAIN_AFTER_TERMINATION,
		new ExecutionVertex[] { vertex1, vertex2 },
		new ExecutionVertex[] { vertex1, vertex2 },
		new ExecutionVertex[] { vertex1, vertex2 },
		counter,
		new StandaloneCompletedCheckpointStore(10),
		new MemoryStateBackend(),
		Executors.directExecutor(),
		SharedStateRegistry.DEFAULT_FACTORY);

	String savepointDir = tmpFolder.newFolder().getAbsolutePath();

	// Trigger savepoint and checkpoint
	CompletableFuture<CompletedCheckpoint> savepointFuture1 = coord.triggerSavepoint(timestamp, savepointDir);
	long savepointId1 = counter.getLast();
	assertEquals(1, coord.getNumberOfPendingCheckpoints());

	assertTrue(coord.triggerCheckpoint(timestamp + 1, false));
	assertEquals(2, coord.getNumberOfPendingCheckpoints());

	assertTrue(coord.triggerCheckpoint(timestamp + 2, false));
	long checkpointId2 = counter.getLast();
	assertEquals(3, coord.getNumberOfPendingCheckpoints());

	// 2nd checkpoint should subsume the 1st checkpoint, but not the savepoint
	coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, attemptID1, checkpointId2));
	coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, attemptID2, checkpointId2));

	// only the first savepoint is still pending; the 2nd checkpoint is the single retained one
	assertEquals(1, coord.getNumberOfPendingCheckpoints());
	assertEquals(1, coord.getNumberOfRetainedSuccessfulCheckpoints());

	assertFalse(coord.getPendingCheckpoints().get(savepointId1).isDiscarded());
	assertFalse(savepointFuture1.isDone());

	assertTrue(coord.triggerCheckpoint(timestamp + 3, false));
	assertEquals(2, coord.getNumberOfPendingCheckpoints());

	CompletableFuture<CompletedCheckpoint> savepointFuture2 = coord.triggerSavepoint(timestamp + 4, savepointDir);
	long savepointId2 = counter.getLast();
	assertEquals(3, coord.getNumberOfPendingCheckpoints());

	// 2nd savepoint should subsume the last checkpoint, but not the 1st savepoint
	coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, attemptID1, savepointId2));
	coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, attemptID2, savepointId2));

	assertEquals(1, coord.getNumberOfPendingCheckpoints());
	assertEquals(2, coord.getNumberOfRetainedSuccessfulCheckpoints());
	assertFalse(coord.getPendingCheckpoints().get(savepointId1).isDiscarded());

	assertFalse(savepointFuture1.isDone());
	assertTrue(savepointFuture2.isDone());

	// Ack first savepoint
	coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, attemptID1, savepointId1));
	coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, attemptID2, savepointId1));

	// nothing is pending any more and all three completed checkpoints are retained
	assertEquals(0, coord.getNumberOfPendingCheckpoints());
	assertEquals(3, coord.getNumberOfRetainedSuccessfulCheckpoints());
	assertTrue(savepointFuture1.isDone());
}

Source File: CheckpointCoordinator.java From Flink-CEPplus with Apache License 2.0

4 votes

/**
 * Receives an AcknowledgeCheckpoint message and returns whether the
 * message was associated with a pending checkpoint.
 *
 * <p>The message is ignored ({@code false} is returned) when the coordinator is shut down,
 * the message is {@code null}, or the message belongs to a different job. If a non-discarded
 * pending checkpoint exists for the message's checkpoint id, the task is acknowledged on it
 * and {@code true} is returned regardless of the acknowledge result; a fully acknowledged
 * checkpoint is completed. If no pending checkpoint exists, the subtask state carried by the
 * message is discarded and the result tells whether the id is among the recently pending
 * checkpoints.
 *
 * @param message Checkpoint ack from the task manager
 *
 * @return Flag indicating whether the ack'd checkpoint was associated
 * with a pending checkpoint.
 *
 * @throws CheckpointException If the checkpoint cannot be added to the completed checkpoint store.
 * @throws IllegalStateException If a checkpoint is found that is discarded but still registered as pending.
 */
public boolean receiveAcknowledgeMessage(AcknowledgeCheckpoint message) throws CheckpointException {
	// cheap pre-check outside the lock; the shutdown flag is re-checked under the lock below
	if (shutdown || message == null) {
		return false;
	}

	if (!job.equals(message.getJob())) {
		LOG.error("Received wrong AcknowledgeCheckpoint message for job {}: {}", job, message);
		return false;
	}

	final long checkpointId = message.getCheckpointId();

	synchronized (lock) {
		// we need to check inside the lock for being shutdown as well, otherwise we
		// get races and invalid error log messages
		if (shutdown) {
			return false;
		}

		final PendingCheckpoint checkpoint = pendingCheckpoints.get(checkpointId);

		if (checkpoint != null && !checkpoint.isDiscarded()) {

			// the pending checkpoint reports how it handled this task's acknowledgement
			switch (checkpoint.acknowledgeTask(message.getTaskExecutionId(), message.getSubtaskState(), message.getCheckpointMetrics())) {
				case SUCCESS:
					LOG.debug("Received acknowledge message for checkpoint {} from task {} of job {}.",
						checkpointId, message.getTaskExecutionId(), message.getJob());

					// the last missing acknowledgement completes the checkpoint
					if (checkpoint.isFullyAcknowledged()) {
						completePendingCheckpoint(checkpoint);
					}
					break;
				case DUPLICATE:
					LOG.debug("Received a duplicate acknowledge message for checkpoint {}, task {}, job {}.",
						message.getCheckpointId(), message.getTaskExecutionId(), message.getJob());
					break;
				case UNKNOWN:
					LOG.warn("Could not acknowledge the checkpoint {} for task {} of job {}, " +
							"because the task's execution attempt id was unknown. Discarding " +
							"the state handle to avoid lingering state.", message.getCheckpointId(),
						message.getTaskExecutionId(), message.getJob());

					discardSubtaskState(message.getJob(), message.getTaskExecutionId(), message.getCheckpointId(), message.getSubtaskState());

					break;
				case DISCARDED:
					// NOTE(review): the log text below contains the typo "tp" (meant: "to")
					LOG.warn("Could not acknowledge the checkpoint {} for task {} of job {}, " +
						"because the pending checkpoint had been discarded. Discarding the " +
							"state handle tp avoid lingering state.",
						message.getCheckpointId(), message.getTaskExecutionId(), message.getJob());

					discardSubtaskState(message.getJob(), message.getTaskExecutionId(), message.getCheckpointId(), message.getSubtaskState());
					// last case of the switch, so no break is needed here
			}

			// a pending checkpoint existed for the id, whatever the acknowledge result was
			return true;
		}
		else if (checkpoint != null) {
			// this should not happen
			throw new IllegalStateException(
					"Received message for discarded but non-removed checkpoint " + checkpointId);
		}
		else {
			boolean wasPendingCheckpoint;

			// message is for an unknown checkpoint, or comes too late (checkpoint disposed)
			if (recentPendingCheckpoints.contains(checkpointId)) {
				wasPendingCheckpoint = true;
				LOG.warn("Received late message for now expired checkpoint attempt {} from " +
					"{} of job {}.", checkpointId, message.getTaskExecutionId(), message.getJob());
			}
			else {
				LOG.debug("Received message for an unknown checkpoint {} from {} of job {}.",
					checkpointId, message.getTaskExecutionId(), message.getJob());
				wasPendingCheckpoint = false;
			}

			// try to discard the state so that we don't have lingering state lying around
			discardSubtaskState(message.getJob(), message.getTaskExecutionId(), message.getCheckpointId(), message.getSubtaskState());

			return wasPendingCheckpoint;
		}
	}
}

Source File: CheckpointCoordinatorMasterHooksTest.java From flink with Apache License 2.0

4 votes

/**
 * Registers two stateful master hooks and one hook without a stubbed result, triggers a
 * checkpoint and acknowledges it, then checks that every hook was triggered exactly once and
 * that the completed checkpoint contains exactly the two serialized hook states.
 */
@Test
public void testHooksAreCalledOnTrigger() throws Exception {
	final String id1 = "id1";
	final String id2 = "id2";

	// expected serialized forms of the two hook states, compared against the master states below
	final String state1 = "the-test-string-state";
	final byte[] state1serialized = new StringSerializer().serialize(state1);

	final long state2 = 987654321L;
	final byte[] state2serialized = new LongSerializer().serialize(state2);

	final MasterTriggerRestoreHook<String> statefulHook1 = mockGeneric(MasterTriggerRestoreHook.class);
	when(statefulHook1.getIdentifier()).thenReturn(id1);
	when(statefulHook1.createCheckpointDataSerializer()).thenReturn(new StringSerializer());
	when(statefulHook1.triggerCheckpoint(anyLong(), anyLong(), any(Executor.class)))
			.thenReturn(CompletableFuture.completedFuture(state1));

	final MasterTriggerRestoreHook<Long> statefulHook2 = mockGeneric(MasterTriggerRestoreHook.class);
	when(statefulHook2.getIdentifier()).thenReturn(id2);
	when(statefulHook2.createCheckpointDataSerializer()).thenReturn(new LongSerializer());
	when(statefulHook2.triggerCheckpoint(anyLong(), anyLong(), any(Executor.class)))
			.thenReturn(CompletableFuture.completedFuture(state2));

	// only the identifier is stubbed for this hook; triggerCheckpoint is left unstubbed
	final MasterTriggerRestoreHook<Void> statelessHook = mockGeneric(MasterTriggerRestoreHook.class);
	when(statelessHook.getIdentifier()).thenReturn("some-id");

	// create the checkpoint coordinator
	final JobID jid = new JobID();
	final ExecutionAttemptID execId = new ExecutionAttemptID();
	final ExecutionVertex ackVertex = mockExecutionVertex(execId);
	final ManuallyTriggeredScheduledExecutor manuallyTriggeredScheduledExecutor =
		new ManuallyTriggeredScheduledExecutor();
	final CheckpointCoordinator cc = instantiateCheckpointCoordinator(
		jid, manuallyTriggeredScheduledExecutor, ackVertex);

	cc.addMasterHook(statefulHook1);
	cc.addMasterHook(statelessHook);
	cc.addMasterHook(statefulHook2);

	// trigger a checkpoint
	final CompletableFuture<CompletedCheckpoint> checkpointFuture = cc.triggerCheckpoint(false);
	// run everything queued on the manually triggered executor before asserting
	manuallyTriggeredScheduledExecutor.triggerAll();
	assertFalse(checkpointFuture.isCompletedExceptionally());
	assertEquals(1, cc.getNumberOfPendingCheckpoints());

	// every registered hook must have been triggered exactly once
	verify(statefulHook1, times(1)).triggerCheckpoint(anyLong(), anyLong(), any(Executor.class));
	verify(statefulHook2, times(1)).triggerCheckpoint(anyLong(), anyLong(), any(Executor.class));
	verify(statelessHook, times(1)).triggerCheckpoint(anyLong(), anyLong(), any(Executor.class));

	// acknowledging for the single vertex leaves no pending checkpoint
	final long checkpointId = cc.getPendingCheckpoints().values().iterator().next().getCheckpointId();
	cc.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, execId, checkpointId), "Unknown location");
	assertEquals(0, cc.getNumberOfPendingCheckpoints());

	assertEquals(1, cc.getNumberOfRetainedSuccessfulCheckpoints());
	final CompletedCheckpoint chk = cc.getCheckpointStore().getLatestCheckpoint(false);

	// exactly two master states are expected, one per stateful hook
	final Collection<MasterState> masterStates = chk.getMasterHookStates();
	assertEquals(2, masterStates.size());

	for (MasterState ms : masterStates) {
		if (ms.name().equals(id1)) {
			assertArrayEquals(state1serialized, ms.bytes());
			assertEquals(StringSerializer.VERSION, ms.version());
		}
		else if (ms.name().equals(id2)) {
			assertArrayEquals(state2serialized, ms.bytes());
			assertEquals(LongSerializer.VERSION, ms.version());
		}
		else {
			fail("unrecognized state name: " + ms.name());
		}
	}
}

Source File: CheckpointCoordinatorFailureTest.java From flink with Apache License 2.0

4 votes

/**
 * Tests that a failure while storing a completed checkpoint in the completed checkpoint store
 * will properly fail the originating pending checkpoint and clean up the completed checkpoint.
 *
 * <p>The coordinator is built with a {@code FailingCompletedCheckpointStore}; acknowledging the
 * only pending checkpoint is therefore expected to raise a {@link CheckpointException}, after
 * which the pending checkpoint and all acknowledged state handles must have been discarded.
 */
@Test
public void testFailingCompletedCheckpointStoreAdd() throws Exception {
	JobID jid = new JobID();

	final ManuallyTriggeredScheduledExecutor manuallyTriggeredScheduledExecutor =
		new ManuallyTriggeredScheduledExecutor();

	final ExecutionAttemptID executionAttemptId = new ExecutionAttemptID();
	final ExecutionVertex vertex = CheckpointCoordinatorTestingUtils.mockExecutionVertex(executionAttemptId);

	// set up the coordinator and validate the initial state
	CheckpointCoordinator coord =
		new CheckpointCoordinatorBuilder()
			.setJobId(jid)
			.setTasks(new ExecutionVertex[] { vertex })
			.setCompletedCheckpointStore(new FailingCompletedCheckpointStore())
			.setTimer(manuallyTriggeredScheduledExecutor)
			.build();

	coord.triggerCheckpoint(false);

	// run everything queued on the manually triggered executor before asserting
	manuallyTriggeredScheduledExecutor.triggerAll();

	assertEquals(1, coord.getNumberOfPendingCheckpoints());

	// keep a reference to the pending checkpoint so its discarded flag can be checked at the end
	PendingCheckpoint pendingCheckpoint = coord.getPendingCheckpoints().values().iterator().next();

	assertFalse(pendingCheckpoint.isDiscarded());

	final long checkpointId = coord.getPendingCheckpoints().keySet().iterator().next();

	// mocked / hand-built state handles whose discardState() calls are verified below
	KeyedStateHandle managedKeyedHandle = mock(KeyedStateHandle.class);
	KeyedStateHandle rawKeyedHandle = mock(KeyedStateHandle.class);
	OperatorStateHandle managedOpHandle = mock(OperatorStreamStateHandle.class);
	OperatorStateHandle rawOpHandle = mock(OperatorStreamStateHandle.class);
	InputChannelStateHandle inputChannelStateHandle = new InputChannelStateHandle(new InputChannelInfo(0, 1), mock(StreamStateHandle.class), Collections.singletonList(1L));
	ResultSubpartitionStateHandle resultSubpartitionStateHandle = new ResultSubpartitionStateHandle(new ResultSubpartitionInfo(0, 1), mock(StreamStateHandle.class), Collections.singletonList(1L));

	// spy on the real object so that discardState() on it can be verified
	final OperatorSubtaskState operatorSubtaskState = spy(new OperatorSubtaskState(
		managedOpHandle,
		rawOpHandle,
		managedKeyedHandle,
		rawKeyedHandle,
		StateObjectCollection.singleton(inputChannelStateHandle),
		StateObjectCollection.singleton(resultSubpartitionStateHandle)));

	TaskStateSnapshot subtaskState = spy(new TaskStateSnapshot());
	subtaskState.putSubtaskStateByOperatorID(new OperatorID(), operatorSubtaskState);

	// the state was stored under a fresh OperatorID above, so the lookup by the vertex-derived
	// operator id is stubbed to return the same state
	when(subtaskState.getSubtaskStateByOperatorID(OperatorID.fromJobVertexID(vertex.getJobvertexId()))).thenReturn(operatorSubtaskState);

	AcknowledgeCheckpoint acknowledgeMessage = new AcknowledgeCheckpoint(jid, executionAttemptId, checkpointId, new CheckpointMetrics(), subtaskState);

	try {
		coord.receiveAcknowledgeMessage(acknowledgeMessage, "Unknown location");
		fail("Expected a checkpoint exception because the completed checkpoint store could not " +
			"store the completed checkpoint.");
	} catch (CheckpointException e) {
		// ignore because we expected this exception
	}

	// make sure that the pending checkpoint has been discarded after we could not complete it
	assertTrue(pendingCheckpoint.isDiscarded());

	// make sure that the subtask state has been discarded after we could not complete it.
	verify(operatorSubtaskState).discardState();
	verify(operatorSubtaskState.getManagedOperatorState().iterator().next()).discardState();
	verify(operatorSubtaskState.getRawOperatorState().iterator().next()).discardState();
	verify(operatorSubtaskState.getManagedKeyedState().iterator().next()).discardState();
	verify(operatorSubtaskState.getRawKeyedState().iterator().next()).discardState();
	verify(operatorSubtaskState.getInputChannelState().iterator().next().getDelegate()).discardState();
	verify(operatorSubtaskState.getResultSubpartitionState().iterator().next().getDelegate()).discardState();
}

Source File: CheckpointCoordinatorTest.java From flink with Apache License 2.0

4 votes

/**
 * Triggers one checkpoint on the given coordinator and acknowledges it for every subtask of
 * {@code jobVertex1}, each time with an incremental keyed state handle.
 *
 * <p>Each handle carries one private state entry and a shared state entry named after
 * {@code cpSequenceNumber}. For every sequence number above zero it additionally carries a
 * placeholder entry named after the previous sequence number.
 *
 * @param jid job id put into the acknowledge messages
 * @param coord coordinator that triggers and receives the checkpoint
 * @param jobVertex1 job vertex whose subtasks acknowledge the checkpoint
 * @param keyGroupPartitions1 key-group range per subtask, addressed by subtask index
 * @param cpSequenceNumber sequence number used to name the shared state entries
 * @throws Exception propagated from the coordinator or the state construction
 */
private void performIncrementalCheckpoint(
	JobID jid,
	CheckpointCoordinator coord,
	ExecutionJobVertex jobVertex1,
	List<KeyGroupRange> keyGroupPartitions1,
	int cpSequenceNumber) throws Exception {

	// trigger the checkpoint and run everything queued on the manual executor
	coord.triggerCheckpoint(false);
	manuallyTriggeredScheduledExecutor.triggerAll();

	assertEquals(1, coord.getPendingCheckpoints().size());
	final long pendingCheckpointId = Iterables.getOnlyElement(coord.getPendingCheckpoints().keySet());

	for (int subtask = 0; subtask < jobVertex1.getParallelism(); subtask++) {

		final KeyGroupRange range = keyGroupPartitions1.get(subtask);

		final Map<StateHandleID, StreamStateHandle> privateHandles = new HashMap<>();
		privateHandles.put(
			new StateHandleID("private-1"),
			spy(new ByteStreamStateHandle("private-1", new byte[]{'p'})));

		final Map<StateHandleID, StreamStateHandle> sharedHandles = new HashMap<>();

		// all checkpoints except the first one overlap with their predecessor by one shared state
		final boolean hasPredecessor = cpSequenceNumber > 0;
		if (hasPredecessor) {
			sharedHandles.put(
				new StateHandleID("shared-" + (cpSequenceNumber - 1)),
				spy(new PlaceholderStreamStateHandle()));
		}

		sharedHandles.put(
			new StateHandleID("shared-" + cpSequenceNumber),
			spy(new ByteStreamStateHandle("shared-" + cpSequenceNumber + "-" + range, new byte[]{'s'})));

		final IncrementalRemoteKeyedStateHandle incrementalHandle =
			spy(new IncrementalRemoteKeyedStateHandle(
				new UUID(42L, 42L),
				range,
				pendingCheckpointId,
				sharedHandles,
				privateHandles,
				spy(new ByteStreamStateHandle("meta", new byte[]{'m'}))));

		// only the third constructor argument carries state; the other three are empty
		final OperatorSubtaskState subtaskState =
			spy(new OperatorSubtaskState(
				StateObjectCollection.empty(),
				StateObjectCollection.empty(),
				StateObjectCollection.singleton(incrementalHandle),
				StateObjectCollection.empty()));

		final Map<OperatorID, OperatorSubtaskState> statesByOperator = new HashMap<>();
		statesByOperator.put(jobVertex1.getOperatorIDs().get(0).getGeneratedOperatorID(), subtaskState);

		final TaskStateSnapshot snapshot = new TaskStateSnapshot(statesByOperator);

		coord.receiveAcknowledgeMessage(
			new AcknowledgeCheckpoint(
				jid,
				jobVertex1.getTaskVertices()[subtask].getCurrentExecutionAttempt().getAttemptId(),
				pendingCheckpointId,
				new CheckpointMetrics(),
				snapshot),
			TASK_MANAGER_LOCATION_INFO);
	}
}

Source File: CheckpointCoordinatorTest.java From flink with Apache License 2.0

4 votes

/**
 * Checks that acknowledging the newer of two concurrently pending checkpoints frees capacity
 * for two further checkpoints.
 *
 * <p>With a limit of two concurrent checkpoints, checkpoints 1 and 2 are triggered first.
 * After checkpoint 2 has been acknowledged, the scheduler is driven until two checkpoints are
 * pending again; these are expected to be checkpoints 3 and 4.
 *
 * <p>Unexpected exceptions propagate to the test runner instead of being converted with
 * {@code fail(e.getMessage())}, so the failure keeps its cause and full stack trace.
 *
 * @throws Exception if the coordinator raises an unexpected error
 */
@Test
public void testMaxConcurrentAttempsWithSubsumption() throws Exception {
	final int maxConcurrentAttempts = 2;
	final JobID jid = new JobID();

	// create some mock execution vertices and trigger some checkpoint
	final ExecutionAttemptID triggerAttemptID = new ExecutionAttemptID();
	final ExecutionAttemptID ackAttemptID = new ExecutionAttemptID();
	final ExecutionAttemptID commitAttemptID = new ExecutionAttemptID();

	ExecutionVertex triggerVertex = mockExecutionVertex(triggerAttemptID);
	ExecutionVertex ackVertex = mockExecutionVertex(ackAttemptID);
	ExecutionVertex commitVertex = mockExecutionVertex(commitAttemptID);

	CheckpointCoordinatorConfiguration chkConfig =
		new CheckpointCoordinatorConfigurationBuilder()
			.setCheckpointInterval(10) // periodic interval is 10 ms
			.setCheckpointTimeout(200000) // timeout is very long (200 s)
			.setMinPauseBetweenCheckpoints(0L) // no extra delay
			.setMaxConcurrentCheckpoints(maxConcurrentAttempts)
			.build();
	CheckpointCoordinator coord =
		new CheckpointCoordinatorBuilder()
			.setJobId(jid)
			.setCheckpointCoordinatorConfiguration(chkConfig)
			.setTasksToTrigger(new ExecutionVertex[] { triggerVertex })
			.setTasksToWaitFor(new ExecutionVertex[] { ackVertex })
			.setTasksToCommitTo(new ExecutionVertex[] { commitVertex })
			.setCompletedCheckpointStore(new StandaloneCompletedCheckpointStore(2))
			.setTimer(manuallyTriggeredScheduledExecutor)
			.build();

	coord.startCheckpointScheduler();

	// drive the manual executor until the maximum number of checkpoints is pending
	do {
		manuallyTriggeredScheduledExecutor.triggerPeriodicScheduledTasks();
		manuallyTriggeredScheduledExecutor.triggerAll();
	}
	while (coord.getNumberOfPendingCheckpoints() < maxConcurrentAttempts);

	// validate that the pending checkpoints are there
	assertEquals(maxConcurrentAttempts, coord.getNumberOfPendingCheckpoints());
	assertNotNull(coord.getPendingCheckpoints().get(1L));
	assertNotNull(coord.getPendingCheckpoints().get(2L));

	// now we acknowledge the second checkpoint, which should subsume the first checkpoint
	// and allow two more checkpoints to be triggered
	coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, ackAttemptID, 2L), TASK_MANAGER_LOCATION_INFO);

	// after a while, there should be the new checkpoints
	do {
		manuallyTriggeredScheduledExecutor.triggerPeriodicScheduledTasks();
		manuallyTriggeredScheduledExecutor.triggerAll();
	}
	while (coord.getNumberOfPendingCheckpoints() < maxConcurrentAttempts);

	// do the final check
	assertEquals(maxConcurrentAttempts, coord.getNumberOfPendingCheckpoints());
	assertNotNull(coord.getPendingCheckpoints().get(3L));
	assertNotNull(coord.getPendingCheckpoints().get(4L));

	coord.shutdown(JobStatus.FINISHED);
}

Source File: CheckpointCoordinatorTest.java From flink with Apache License 2.0

4 votes

/**
 * Checks that the periodic scheduler stops triggering once {@code maxConcurrentAttempts}
 * checkpoints have been triggered, and that acknowledging one of them allows exactly one
 * further checkpoint to be triggered.
 *
 * <p>{@code numCalls} counts every {@code triggerCheckpoint} and {@code notifyCheckpointComplete}
 * invocation on the trigger vertex's current execution attempt.
 *
 * @param maxConcurrentAttempts the maximum number of concurrent checkpoints to configure
 */
private void testMaxConcurrentAttempts(int maxConcurrentAttempts) {
	try {
		final JobID jid = new JobID();

		// create some mock execution vertices and trigger some checkpoint
		final ExecutionAttemptID triggerAttemptID = new ExecutionAttemptID();
		final ExecutionAttemptID ackAttemptID = new ExecutionAttemptID();
		final ExecutionAttemptID commitAttemptID = new ExecutionAttemptID();

		ExecutionVertex triggerVertex = mockExecutionVertex(triggerAttemptID);
		ExecutionVertex ackVertex = mockExecutionVertex(ackAttemptID);
		ExecutionVertex commitVertex = mockExecutionVertex(commitAttemptID);

		final AtomicInteger numCalls = new AtomicInteger();

		final Execution execution = triggerVertex.getCurrentExecutionAttempt();

		// count every trigger / completion notification that reaches the trigger vertex
		doAnswer(invocation -> {
			numCalls.incrementAndGet();
			return null;
		}).when(execution).triggerCheckpoint(anyLong(), anyLong(), any(CheckpointOptions.class));

		doAnswer(invocation -> {
			numCalls.incrementAndGet();
			return null;
		}).when(execution).notifyCheckpointComplete(anyLong(), anyLong());

		CheckpointCoordinatorConfiguration chkConfig =
			new CheckpointCoordinatorConfigurationBuilder()
				.setCheckpointInterval(10) // periodic interval is 10 ms
				.setCheckpointTimeout(200000) // timeout is very long (200 s)
				.setMinPauseBetweenCheckpoints(0L) // no extra delay
				.setMaxConcurrentCheckpoints(maxConcurrentAttempts)
				.build();
		CheckpointCoordinator coord =
			new CheckpointCoordinatorBuilder()
				.setJobId(jid)
				.setCheckpointCoordinatorConfiguration(chkConfig)
				.setTasksToTrigger(new ExecutionVertex[] { triggerVertex })
				.setTasksToWaitFor(new ExecutionVertex[] { ackVertex })
				.setTasksToCommitTo(new ExecutionVertex[] { commitVertex })
				.setCompletedCheckpointStore(new StandaloneCompletedCheckpointStore(2))
				.setTimer(manuallyTriggeredScheduledExecutor)
				.build();

		coord.startCheckpointScheduler();

		// drive the manually triggered timer once per permitted concurrent checkpoint
		for (int i = 0; i < maxConcurrentAttempts; i++) {
			manuallyTriggeredScheduledExecutor.triggerPeriodicScheduledTasks();
			manuallyTriggeredScheduledExecutor.triggerAll();
		}

		assertEquals(maxConcurrentAttempts, numCalls.get());

		verify(triggerVertex.getCurrentExecutionAttempt(), times(maxConcurrentAttempts))
				.triggerCheckpoint(anyLong(), anyLong(), any(CheckpointOptions.class));

		// now, once we acknowledge one checkpoint, it should trigger the next one
		coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, ackAttemptID, 1L), TASK_MANAGER_LOCATION_INFO);

		// exactly one periodic task must be registered with the timer
		final Collection<ScheduledFuture<?>> periodicScheduledTasks =
			manuallyTriggeredScheduledExecutor.getPeriodicScheduledTask();
		assertEquals(1, periodicScheduledTasks.size());

		manuallyTriggeredScheduledExecutor.triggerPeriodicScheduledTasks();
		manuallyTriggeredScheduledExecutor.triggerAll();

		assertEquals(maxConcurrentAttempts + 1, numCalls.get());

		// no further checkpoints should happen
		manuallyTriggeredScheduledExecutor.triggerPeriodicScheduledTasks();
		manuallyTriggeredScheduledExecutor.triggerAll();
		assertEquals(maxConcurrentAttempts + 1, numCalls.get());

		coord.shutdown(JobStatus.FINISHED);
	}
	catch (Exception e) {
		e.printStackTrace();
		fail(e.getMessage());
	}
}

Source File: CheckpointCoordinatorTest.java From flink with Apache License 2.0

4 votes

/**
 * Checks that completing a newer checkpoint or savepoint never subsumes a pending savepoint.
 *
 * <p>Sequence: a savepoint and two checkpoints are triggered; completing the second checkpoint
 * leaves only the first savepoint pending. Then another checkpoint and a second savepoint are
 * triggered; completing the second savepoint again leaves only the first savepoint pending.
 * Finally the first savepoint itself is acknowledged and completes.
 */
@Test
public void testSavepointsAreNotSubsumed() throws Exception {
	final JobID jobId = new JobID();

	// two mocked execution vertices; both have to acknowledge every checkpoint
	final ExecutionAttemptID firstAttempt = new ExecutionAttemptID();
	final ExecutionAttemptID secondAttempt = new ExecutionAttemptID();
	final ExecutionVertex firstVertex = mockExecutionVertex(firstAttempt);
	final ExecutionVertex secondVertex = mockExecutionVertex(secondAttempt);
	final ExecutionAttemptID[] ackOrder = { firstAttempt, secondAttempt };

	final StandaloneCheckpointIDCounter idCounter = new StandaloneCheckpointIDCounter();

	// coordinator without a practical limit on concurrent checkpoints
	final CheckpointCoordinatorConfiguration unboundedConcurrency =
		CheckpointCoordinatorConfiguration.builder().setMaxConcurrentCheckpoints(Integer.MAX_VALUE).build();
	final CheckpointCoordinator coord =
		new CheckpointCoordinatorBuilder()
			.setJobId(jobId)
			.setCheckpointCoordinatorConfiguration(unboundedConcurrency)
			.setTasks(new ExecutionVertex[]{ firstVertex, secondVertex })
			.setCheckpointIDCounter(idCounter)
			.setCompletedCheckpointStore(new StandaloneCompletedCheckpointStore(10))
			.setTimer(manuallyTriggeredScheduledExecutor)
			.build();

	final String savepointTarget = tmpFolder.newFolder().getAbsolutePath();

	// --- first savepoint ---
	final CompletableFuture<CompletedCheckpoint> firstSavepoint = coord.triggerSavepoint(savepointTarget);

	manuallyTriggeredScheduledExecutor.triggerAll();
	final long firstSavepointId = idCounter.getLast();
	assertEquals(1, coord.getNumberOfPendingCheckpoints());

	// --- first checkpoint ---
	final CompletableFuture<CompletedCheckpoint> firstCheckpoint = coord.triggerCheckpoint(false);
	manuallyTriggeredScheduledExecutor.triggerAll();
	assertEquals(2, coord.getNumberOfPendingCheckpoints());
	assertFalse(firstCheckpoint.isCompletedExceptionally());

	// --- second checkpoint ---
	final CompletableFuture<CompletedCheckpoint> secondCheckpoint = coord.triggerCheckpoint(false);
	manuallyTriggeredScheduledExecutor.triggerAll();
	assertFalse(secondCheckpoint.isCompletedExceptionally());
	final long secondCheckpointId = idCounter.getLast();
	assertEquals(3, coord.getNumberOfPendingCheckpoints());

	// completing the second checkpoint must subsume the first checkpoint only
	for (ExecutionAttemptID attempt : ackOrder) {
		coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jobId, attempt, secondCheckpointId), TASK_MANAGER_LOCATION_INFO);
	}

	assertEquals(1, coord.getNumberOfPendingCheckpoints());
	assertEquals(1, coord.getNumberOfRetainedSuccessfulCheckpoints());

	assertFalse(coord.getPendingCheckpoints().get(firstSavepointId).isDiscarded());
	assertFalse(firstSavepoint.isDone());

	// --- third checkpoint ---
	final CompletableFuture<CompletedCheckpoint> thirdCheckpoint = coord.triggerCheckpoint(false);
	manuallyTriggeredScheduledExecutor.triggerAll();
	assertFalse(thirdCheckpoint.isCompletedExceptionally());
	assertEquals(2, coord.getNumberOfPendingCheckpoints());

	// --- second savepoint ---
	final CompletableFuture<CompletedCheckpoint> secondSavepoint = coord.triggerSavepoint(savepointTarget);
	manuallyTriggeredScheduledExecutor.triggerAll();
	final long secondSavepointId = idCounter.getLast();
	assertFalse(secondSavepoint.isCompletedExceptionally());
	assertEquals(3, coord.getNumberOfPendingCheckpoints());

	// completing the second savepoint must subsume the third checkpoint, not the first savepoint
	for (ExecutionAttemptID attempt : ackOrder) {
		coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jobId, attempt, secondSavepointId), TASK_MANAGER_LOCATION_INFO);
	}

	assertEquals(1, coord.getNumberOfPendingCheckpoints());
	assertEquals(2, coord.getNumberOfRetainedSuccessfulCheckpoints());
	assertFalse(coord.getPendingCheckpoints().get(firstSavepointId).isDiscarded());

	assertFalse(firstSavepoint.isDone());
	assertNotNull(secondSavepoint.get());

	// finally complete the first savepoint as well
	for (ExecutionAttemptID attempt : ackOrder) {
		coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jobId, attempt, firstSavepointId), TASK_MANAGER_LOCATION_INFO);
	}

	assertEquals(0, coord.getNumberOfPendingCheckpoints());
	assertEquals(3, coord.getNumberOfRetainedSuccessfulCheckpoints());
	assertNotNull(firstSavepoint.get());
}

Source File: CheckpointCoordinatorTest.java From flink with Apache License 2.0

4 votes

/**
 * Sends acknowledge messages that do not match the pending checkpoint (wrong job, unknown
 * checkpoint ID, unknown execution attempt) and checks that none of them makes the
 * coordinator throw.
 */
@Test
public void testHandleMessagesForNonExistingCheckpoints() {
	try {
		final JobID jid = new JobID();

		// create some mock execution vertices and trigger some checkpoint

		final ExecutionAttemptID triggerAttemptID = new ExecutionAttemptID();
		final ExecutionAttemptID ackAttemptID1 = new ExecutionAttemptID();
		final ExecutionAttemptID ackAttemptID2 = new ExecutionAttemptID();
		final ExecutionAttemptID commitAttemptID = new ExecutionAttemptID();

		ExecutionVertex triggerVertex = mockExecutionVertex(triggerAttemptID);
		ExecutionVertex ackVertex1 = mockExecutionVertex(ackAttemptID1);
		ExecutionVertex ackVertex2 = mockExecutionVertex(ackAttemptID2);
		ExecutionVertex commitVertex = mockExecutionVertex(commitAttemptID);

		CheckpointCoordinator coord =
			new CheckpointCoordinatorBuilder()
				.setJobId(jid)
				.setTasksToTrigger(new ExecutionVertex[] { triggerVertex })
				.setTasksToWaitFor(new ExecutionVertex[] { ackVertex1, ackVertex2 })
				.setTasksToCommitTo(new ExecutionVertex[] { commitVertex })
				.setCompletedCheckpointStore(new StandaloneCompletedCheckpointStore(2))
				.setTimer(manuallyTriggeredScheduledExecutor)
				.build();

		final CompletableFuture<CompletedCheckpoint> checkpointFuture = coord.triggerCheckpoint(false);
		manuallyTriggeredScheduledExecutor.triggerAll();
		assertFalse(checkpointFuture.isCompletedExceptionally());

		long checkpointId = coord.getPendingCheckpoints().keySet().iterator().next();

		// Only one checkpoint has been triggered, so the next ID cannot belong to a pending
		// checkpoint. A hard-coded literal could coincide with the ID that was handed out.
		final long unknownCheckpointId = checkpointId + 1;

		// send some messages that do not belong to either the job or any
		// of the vertices that need to be acknowledged.
		// none of the messages should throw an exception

		// wrong job id
		coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(new JobID(), ackAttemptID1, checkpointId), TASK_MANAGER_LOCATION_INFO);

		// unknown checkpoint
		coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, ackAttemptID1, unknownCheckpointId), TASK_MANAGER_LOCATION_INFO);

		// unknown ack vertex
		coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, new ExecutionAttemptID(), checkpointId), TASK_MANAGER_LOCATION_INFO);

		coord.shutdown(JobStatus.FINISHED);
	}
	catch (Exception e) {
		e.printStackTrace();
		fail(e.getMessage());
	}
}

Source File: CheckpointCoordinatorTest.java From flink with Apache License 2.0

4 votes

/**
 * Triggers a single checkpoint, acknowledges it from only one of the two tasks that must
 * acknowledge, then runs the scheduled (non-periodic) tasks of the manual executor and checks
 * that the checkpoint was discarded together with the state it had already received.
 */
@Test
public void testCheckpointTimeoutIsolated() {
	try {
		final JobID jobId = new JobID();

		// only the first acknowledging attempt is referenced again later on
		final ExecutionAttemptID firstAckAttempt = new ExecutionAttemptID();

		final ExecutionVertex triggerTask = mockExecutionVertex(new ExecutionAttemptID());
		final ExecutionVertex firstAckTask = mockExecutionVertex(firstAckAttempt);
		final ExecutionVertex secondAckTask = mockExecutionVertex(new ExecutionAttemptID());
		final ExecutionVertex commitTask = mockExecutionVertex(new ExecutionAttemptID());

		final CheckpointCoordinator coord =
			new CheckpointCoordinatorBuilder()
				.setJobId(jobId)
				.setTasksToTrigger(new ExecutionVertex[] { triggerTask })
				.setTasksToWaitFor(new ExecutionVertex[] { firstAckTask, secondAckTask })
				.setTasksToCommitTo(new ExecutionVertex[] { commitTask })
				.setCompletedCheckpointStore(new StandaloneCompletedCheckpointStore(2))
				.setTimer(manuallyTriggeredScheduledExecutor)
				.build();

		// start one checkpoint; it will only ever be partially acknowledged
		final CompletableFuture<CompletedCheckpoint> triggered = coord.triggerCheckpoint(false);
		manuallyTriggeredScheduledExecutor.triggerAll();
		assertFalse(triggered.isCompletedExceptionally());
		assertEquals(1, coord.getNumberOfPendingCheckpoints());

		final PendingCheckpoint pending = coord.getPendingCheckpoints().values().iterator().next();
		assertFalse(pending.isDiscarded());

		// state reported by the first acknowledging task
		final OperatorSubtaskState reportedState = mock(OperatorSubtaskState.class);
		final TaskStateSnapshot snapshot = spy(new TaskStateSnapshot());
		snapshot.putSubtaskStateByOperatorID(
			OperatorID.fromJobVertexID(firstAckTask.getJobvertexId()),
			reportedState);

		final AcknowledgeCheckpoint partialAck = new AcknowledgeCheckpoint(
			jobId,
			firstAckAttempt,
			pending.getCheckpointId(),
			new CheckpointMetrics(),
			snapshot);
		coord.receiveAcknowledgeMessage(partialAck, TASK_MANAGER_LOCATION_INFO);

		// running the scheduled tasks fires the cancellation of the pending checkpoint
		manuallyTriggeredScheduledExecutor.triggerScheduledTasks();
		assertTrue("Checkpoint was not canceled by the timeout", pending.isDiscarded());
		assertEquals(0, coord.getNumberOfPendingCheckpoints());
		assertEquals(0, coord.getNumberOfRetainedSuccessfulCheckpoints());

		// the state that had already been received must have been discarded exactly once
		verify(reportedState, times(1)).discardState();

		// the commit task must never have been told about a completed checkpoint
		verify(commitTask.getCurrentExecutionAttempt(), times(0)).notifyCheckpointComplete(anyLong(), anyLong());

		coord.shutdown(JobStatus.FINISHED);
	}
	catch (Exception e) {
		e.printStackTrace();
		fail(e.getMessage());
	}
}

Source File: CheckpointCoordinatorTest.java From flink with Apache License 2.0

4 votes

/**
 * Triggers a checkpoint, acknowledges it from one task and then declines it from the other.
 * The test expects the checkpoint to be discarded, its scheduled canceller to be removed and
 * no pending checkpoint to remain; further decline messages must leave that state untouched.
 */
@Test
public void testTriggerAndDeclineCheckpointSimple() {
	try {
		final JobID jobId = new JobID();

		// two mocked execution vertices that receive the checkpoint trigger messages
		final ExecutionAttemptID decliningAttempt = new ExecutionAttemptID();
		final ExecutionAttemptID acknowledgingAttempt = new ExecutionAttemptID();
		final ExecutionVertex decliningVertex = mockExecutionVertex(decliningAttempt);
		final ExecutionVertex acknowledgingVertex = mockExecutionVertex(acknowledgingAttempt);

		// a fresh coordinator starts without pending or retained checkpoints
		final CheckpointCoordinator coord = getCheckpointCoordinator(jobId, decliningVertex, acknowledgingVertex);

		assertEquals(0, coord.getNumberOfPendingCheckpoints());
		assertEquals(0, coord.getNumberOfRetainedSuccessfulCheckpoints());

		// triggering the first checkpoint is expected to succeed
		final CompletableFuture<CompletedCheckpoint> triggered = coord.triggerCheckpoint(false);
		manuallyTriggeredScheduledExecutor.triggerAll();
		assertFalse(triggered.isCompletedExceptionally());

		// exactly one checkpoint is pending now
		assertEquals(1, coord.getNumberOfPendingCheckpoints());
		assertEquals(0, coord.getNumberOfRetainedSuccessfulCheckpoints());

		// one scheduled task exists: the canceller that fires after the timeout
		assertEquals(1, manuallyTriggeredScheduledExecutor.getScheduledTasks().size());

		final long pendingId = coord.getPendingCheckpoints().keySet().iterator().next();
		final PendingCheckpoint pending = coord.getPendingCheckpoints().get(pendingId);

		assertNotNull(pending);
		assertEquals(pendingId, pending.getCheckpointId());
		assertEquals(jobId, pending.getJobId());
		assertEquals(2, pending.getNumberOfNonAcknowledgedTasks());
		assertEquals(0, pending.getNumberOfAcknowledgedTasks());
		assertEquals(0, pending.getOperatorStates().size());
		assertFalse(pending.isDiscarded());
		assertFalse(pending.areTasksFullyAcknowledged());

		// both vertices must have been asked to take the checkpoint
		for (ExecutionVertex vertex : new ExecutionVertex[] { decliningVertex, acknowledgingVertex }) {
			verify(vertex.getCurrentExecutionAttempt()).triggerCheckpoint(
				pendingId,
				pending.getCheckpointTimestamp(),
				CheckpointOptions.forCheckpointWithDefaultLocation());
		}

		// first acknowledgement from one task
		coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jobId, acknowledgingAttempt, pendingId), "Unknown location");
		assertEquals(1, pending.getNumberOfAcknowledgedTasks());
		assertEquals(1, pending.getNumberOfNonAcknowledgedTasks());
		assertFalse(pending.isDiscarded());
		assertFalse(pending.areTasksFullyAcknowledged());

		// a repeated acknowledgement from the same task must not change anything
		coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jobId, acknowledgingAttempt, pendingId), "Unknown location");
		assertFalse(pending.isDiscarded());
		assertFalse(pending.areTasksFullyAcknowledged());

		// the other task declines, which must discard the checkpoint
		coord.receiveDeclineMessage(new DeclineCheckpoint(jobId, decliningAttempt, pendingId), TASK_MANAGER_LOCATION_INFO);
		assertTrue(pending.isDiscarded());

		// the timeout canceller is gone as well
		assertEquals(0, manuallyTriggeredScheduledExecutor.getScheduledTasks().size());

		// nothing is pending and nothing was retained
		assertEquals(0, coord.getNumberOfPendingCheckpoints());
		assertEquals(0, coord.getNumberOfRetainedSuccessfulCheckpoints());

		// further declines, from either task, must have no effect
		coord.receiveDeclineMessage(new DeclineCheckpoint(jobId, decliningAttempt, pendingId), TASK_MANAGER_LOCATION_INFO);
		coord.receiveDeclineMessage(new DeclineCheckpoint(jobId, acknowledgingAttempt, pendingId), TASK_MANAGER_LOCATION_INFO);
		assertTrue(pending.isDiscarded());

		coord.shutdown(JobStatus.FINISHED);
	}
	catch (Exception e) {
		e.printStackTrace();
		fail(e.getMessage());
	}
}

Source File: CheckpointCoordinatorTest.java From Flink-CEPplus with Apache License 2.0

4 votes

/**
 * Triggers a single checkpoint with a short timeout, acknowledges it from only one of the
 * two tasks that must acknowledge, and then polls until the checkpoint has been discarded and
 * removed from the pending checkpoints. Afterwards the state that had already been received
 * must have been discarded and no completion notification must have been sent.
 */
@Test
public void testCheckpointTimeoutIsolated() {
	try {
		final JobID jid = new JobID();
		final long timestamp = System.currentTimeMillis();

		// create some mock execution vertices

		final ExecutionAttemptID triggerAttemptID = new ExecutionAttemptID();

		final ExecutionAttemptID ackAttemptID1 = new ExecutionAttemptID();
		final ExecutionAttemptID ackAttemptID2 = new ExecutionAttemptID();

		final ExecutionAttemptID commitAttemptID = new ExecutionAttemptID();

		ExecutionVertex triggerVertex = mockExecutionVertex(triggerAttemptID);

		ExecutionVertex ackVertex1 = mockExecutionVertex(ackAttemptID1);
		ExecutionVertex ackVertex2 = mockExecutionVertex(ackAttemptID2);

		ExecutionVertex commitVertex = mockExecutionVertex(commitAttemptID);

		// set up the coordinator
		// the timeout for the checkpoint is 200 milliseconds

		CheckpointCoordinator coord = new CheckpointCoordinator(
			jid,
			600000,
			200,
			0,
			Integer.MAX_VALUE,
			CheckpointRetentionPolicy.NEVER_RETAIN_AFTER_TERMINATION,
			new ExecutionVertex[] { triggerVertex },
			new ExecutionVertex[] { ackVertex1, ackVertex2 },
			new ExecutionVertex[] { commitVertex },
			new StandaloneCheckpointIDCounter(),
			new StandaloneCompletedCheckpointStore(2),
			new MemoryStateBackend(),
			Executors.directExecutor(),
			SharedStateRegistry.DEFAULT_FACTORY);

		// trigger a checkpoint, partially acknowledged
		assertTrue(coord.triggerCheckpoint(timestamp, false));
		assertEquals(1, coord.getNumberOfPendingCheckpoints());

		PendingCheckpoint checkpoint = coord.getPendingCheckpoints().values().iterator().next();
		assertFalse(checkpoint.isDiscarded());

		OperatorID opID1 = OperatorID.fromJobVertexID(ackVertex1.getJobvertexId());

		TaskStateSnapshot taskOperatorSubtaskStates1 = spy(new TaskStateSnapshot());
		OperatorSubtaskState subtaskState1 = mock(OperatorSubtaskState.class);
		taskOperatorSubtaskStates1.putSubtaskStateByOperatorID(opID1, subtaskState1);

		coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, ackAttemptID1, checkpoint.getCheckpointId(), new CheckpointMetrics(), taskOperatorSubtaskStates1));

		// wait until the checkpoint must have expired.
		// we check every 250 msecs conservatively for 5 seconds
		// to give even slow build servers a very good chance of completing this
		long deadline = System.currentTimeMillis() + 5000;
		do {
			Thread.sleep(250);
		}
		// Keep polling until BOTH conditions asserted below hold (or the deadline passes):
		// leaving the loop as soon as only one of them holds would make the assertions race
		// against the coordinator's cleanup.
		while ((!checkpoint.isDiscarded() || coord.getNumberOfPendingCheckpoints() > 0) &&
				System.currentTimeMillis() < deadline);

		assertTrue("Checkpoint was not canceled by the timeout", checkpoint.isDiscarded());
		assertEquals(0, coord.getNumberOfPendingCheckpoints());
		assertEquals(0, coord.getNumberOfRetainedSuccessfulCheckpoints());

		// validate that the received states have been discarded
		verify(subtaskState1, times(1)).discardState();

		// no confirm message must have been sent
		verify(commitVertex.getCurrentExecutionAttempt(), times(0)).notifyCheckpointComplete(anyLong(), anyLong());

		coord.shutdown(JobStatus.FINISHED);
	}
	catch (Exception e) {
		e.printStackTrace();
		fail(e.getMessage());
	}
}

Source File: CheckpointCoordinator.java From flink with Apache License 2.0

4 votes

/**
 * Receives an AcknowledgeCheckpoint message and returns whether the
 * message was associated with a pending checkpoint.
 *
 * <p>Messages are ignored (returning {@code false}) when the coordinator is shut down, the
 * message is {@code null}, or the message belongs to a different job. If the checkpoint is
 * still pending, the acknowledgement is applied to it and {@code true} is returned regardless
 * of whether the task was newly acknowledged, a duplicate, unknown, or the checkpoint had been
 * discarded; in the latter two cases the reported state is discarded. If no pending checkpoint
 * exists for the ID, the reported state is discarded and the result tells whether the ID
 * belongs to a recently pending checkpoint.
 *
 * @param message Checkpoint ack from the task manager
 * @param taskManagerLocationInfo The location of the acknowledge checkpoint message's sender
 * @return Flag indicating whether the ack'd checkpoint was associated
 * with a pending checkpoint.
 *
 * @throws CheckpointException If the checkpoint cannot be added to the completed checkpoint store.
 * @throws IllegalStateException If a discarded checkpoint is still registered as pending.
 */
public boolean receiveAcknowledgeMessage(AcknowledgeCheckpoint message, String taskManagerLocationInfo) throws CheckpointException {
	if (shutdown || message == null) {
		return false;
	}

	if (!job.equals(message.getJob())) {
		LOG.error("Received wrong AcknowledgeCheckpoint message for job {} from {} : {}", job, taskManagerLocationInfo, message);
		return false;
	}

	final long checkpointId = message.getCheckpointId();

	synchronized (lock) {
		// we need to check inside the lock for being shutdown as well, otherwise we
		// get races and invalid error log messages
		if (shutdown) {
			return false;
		}

		final PendingCheckpoint checkpoint = pendingCheckpoints.get(checkpointId);

		if (checkpoint != null && !checkpoint.isDiscarded()) {

			switch (checkpoint.acknowledgeTask(message.getTaskExecutionId(), message.getSubtaskState(), message.getCheckpointMetrics())) {
				case SUCCESS:
					LOG.debug("Received acknowledge message for checkpoint {} from task {} of job {} at {}.",
						checkpointId, message.getTaskExecutionId(), message.getJob(), taskManagerLocationInfo);

					// the last outstanding acknowledgement completes the checkpoint
					if (checkpoint.areTasksFullyAcknowledged()) {
						completePendingCheckpoint(checkpoint);
					}
					break;
				case DUPLICATE:
					LOG.debug("Received a duplicate acknowledge message for checkpoint {}, task {}, job {}, location {}.",
						message.getCheckpointId(), message.getTaskExecutionId(), message.getJob(), taskManagerLocationInfo);
					break;
				case UNKNOWN:
					LOG.warn("Could not acknowledge the checkpoint {} for task {} of job {} at {}, " +
							"because the task's execution attempt id was unknown. Discarding " +
							"the state handle to avoid lingering state.", message.getCheckpointId(),
						message.getTaskExecutionId(), message.getJob(), taskManagerLocationInfo);

					discardSubtaskState(message.getJob(), message.getTaskExecutionId(), message.getCheckpointId(), message.getSubtaskState());

					break;
				case DISCARDED:
					LOG.warn("Could not acknowledge the checkpoint {} for task {} of job {} at {}, " +
						"because the pending checkpoint had been discarded. Discarding the " +
							"state handle to avoid lingering state.",
						message.getCheckpointId(), message.getTaskExecutionId(), message.getJob(), taskManagerLocationInfo);

					discardSubtaskState(message.getJob(), message.getTaskExecutionId(), message.getCheckpointId(), message.getSubtaskState());

					// explicit break so that a case added below cannot be reached by fall-through
					break;
			}

			return true;
		}
		else if (checkpoint != null) {
			// this should not happen
			throw new IllegalStateException(
					"Received message for discarded but non-removed checkpoint " + checkpointId);
		}
		else {
			boolean wasPendingCheckpoint;

			// message is for an unknown checkpoint, or comes too late (checkpoint disposed)
			if (recentPendingCheckpoints.contains(checkpointId)) {
				wasPendingCheckpoint = true;
				LOG.warn("Received late message for now expired checkpoint attempt {} from task " +
					"{} of job {} at {}.", checkpointId, message.getTaskExecutionId(), message.getJob(), taskManagerLocationInfo);
			}
			else {
				LOG.debug("Received message for an unknown checkpoint {} from task {} of job {} at {}.",
					checkpointId, message.getTaskExecutionId(), message.getJob(), taskManagerLocationInfo);
				wasPendingCheckpoint = false;
			}

			// try to discard the state so that we don't have lingering state lying around
			discardSubtaskState(message.getJob(), message.getTaskExecutionId(), message.getCheckpointId(), message.getSubtaskState());

			return wasPendingCheckpoint;
		}
	}
}

Source File: CheckpointCoordinatorTest.java From Flink-CEPplus with Apache License 2.0

4 votes

/**
 * Sends acknowledge messages that do not match the pending checkpoint (wrong job, unknown
 * checkpoint ID, unknown execution attempt) and checks that none of them makes the
 * coordinator throw.
 */
@Test
public void testHandleMessagesForNonExistingCheckpoints() {
	try {
		final JobID jid = new JobID();
		final long timestamp = System.currentTimeMillis();

		// create some mock execution vertices and trigger some checkpoint

		final ExecutionAttemptID triggerAttemptID = new ExecutionAttemptID();
		final ExecutionAttemptID ackAttemptID1 = new ExecutionAttemptID();
		final ExecutionAttemptID ackAttemptID2 = new ExecutionAttemptID();
		final ExecutionAttemptID commitAttemptID = new ExecutionAttemptID();

		ExecutionVertex triggerVertex = mockExecutionVertex(triggerAttemptID);
		ExecutionVertex ackVertex1 = mockExecutionVertex(ackAttemptID1);
		ExecutionVertex ackVertex2 = mockExecutionVertex(ackAttemptID2);
		ExecutionVertex commitVertex = mockExecutionVertex(commitAttemptID);

		CheckpointCoordinator coord = new CheckpointCoordinator(
			jid,
			200000,
			200000,
			0,
			Integer.MAX_VALUE,
			CheckpointRetentionPolicy.NEVER_RETAIN_AFTER_TERMINATION,
			new ExecutionVertex[] { triggerVertex },
			new ExecutionVertex[] { ackVertex1, ackVertex2 },
			new ExecutionVertex[] { commitVertex },
			new StandaloneCheckpointIDCounter(),
			new StandaloneCompletedCheckpointStore(2),
			new MemoryStateBackend(),
			Executors.directExecutor(),
			SharedStateRegistry.DEFAULT_FACTORY);

		assertTrue(coord.triggerCheckpoint(timestamp, false));

		long checkpointId = coord.getPendingCheckpoints().keySet().iterator().next();

		// Only one checkpoint has been triggered, so the next ID cannot belong to a pending
		// checkpoint. A hard-coded literal could coincide with the ID that was handed out.
		final long unknownCheckpointId = checkpointId + 1;

		// send some messages that do not belong to either the job or any
		// of the vertices that need to be acknowledged.
		// none of the messages should throw an exception

		// wrong job id
		coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(new JobID(), ackAttemptID1, checkpointId));

		// unknown checkpoint
		coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, ackAttemptID1, unknownCheckpointId));

		// unknown ack vertex
		coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, new ExecutionAttemptID(), checkpointId));

		coord.shutdown(JobStatus.FINISHED);
	}
	catch (Exception e) {
		e.printStackTrace();
		fail(e.getMessage());
	}
}

Source File: CheckpointCoordinatorFailureTest.java From Flink-CEPplus with Apache License 2.0

4 votes

/**
 * Tests that a failure while adding a completed checkpoint to the completed checkpoint store
 * fails the originating pending checkpoint and cleans up the state that was reported for it.
 */
@Test
public void testFailingCompletedCheckpointStoreAdd() throws Exception {
	final JobID jobId = new JobID();

	final ExecutionAttemptID attemptId = new ExecutionAttemptID();
	final ExecutionVertex onlyVertex = CheckpointCoordinatorTest.mockExecutionVertex(attemptId);
	final ExecutionVertex[] allTasks = new ExecutionVertex[]{onlyVertex};

	final long triggerTimestamp = 1L;

	// the single vertex triggers, acknowledges and commits; the store used here is the failing one
	final CheckpointCoordinator coord = new CheckpointCoordinator(
		jobId,
		600000,
		600000,
		0,
		Integer.MAX_VALUE,
		CheckpointRetentionPolicy.NEVER_RETAIN_AFTER_TERMINATION,
		allTasks,
		new ExecutionVertex[]{onlyVertex},
		new ExecutionVertex[]{onlyVertex},
		new StandaloneCheckpointIDCounter(),
		new FailingCompletedCheckpointStore(),
		new MemoryStateBackend(),
		Executors.directExecutor(),
		SharedStateRegistry.DEFAULT_FACTORY);

	coord.triggerCheckpoint(triggerTimestamp, false);

	assertEquals(1, coord.getNumberOfPendingCheckpoints());

	final PendingCheckpoint pending = coord.getPendingCheckpoints().values().iterator().next();

	assertFalse(pending.isDiscarded());

	final long pendingId = coord.getPendingCheckpoints().keySet().iterator().next();

	// mocked state handles, wrapped in a spied subtask state so discard calls can be verified
	final KeyedStateHandle managedKeyed = mock(KeyedStateHandle.class);
	final KeyedStateHandle rawKeyed = mock(KeyedStateHandle.class);
	final OperatorStateHandle managedOperator = mock(OperatorStreamStateHandle.class);
	final OperatorStateHandle rawOperator = mock(OperatorStreamStateHandle.class);

	final OperatorSubtaskState reportedState = spy(
		new OperatorSubtaskState(managedOperator, rawOperator, managedKeyed, rawKeyed));

	final TaskStateSnapshot snapshot = spy(new TaskStateSnapshot());
	snapshot.putSubtaskStateByOperatorID(new OperatorID(), reportedState);

	// make the snapshot hand out the reported state for the vertex's operator ID
	when(snapshot.getSubtaskStateByOperatorID(OperatorID.fromJobVertexID(onlyVertex.getJobvertexId())))
		.thenReturn(reportedState);

	final AcknowledgeCheckpoint ack =
		new AcknowledgeCheckpoint(jobId, attemptId, pendingId, new CheckpointMetrics(), snapshot);

	try {
		coord.receiveAcknowledgeMessage(ack);
		fail("Expected a checkpoint exception because the completed checkpoint store could not " +
			"store the completed checkpoint.");
	} catch (CheckpointException expected) {
		// the failing store is supposed to surface as a CheckpointException
	}

	// the pending checkpoint must be discarded once it could not be completed
	assertTrue(pending.isDiscarded());

	// the reported state and each of its four handles must have been discarded
	verify(reportedState).discardState();
	verify(reportedState.getManagedOperatorState().iterator().next()).discardState();
	verify(reportedState.getRawOperatorState().iterator().next()).discardState();
	verify(reportedState.getManagedKeyedState().iterator().next()).discardState();
	verify(reportedState.getRawKeyedState().iterator().next()).discardState();
}

Source File: CheckpointCoordinatorTest.java From Flink-CEPplus with Apache License 2.0

4 votes

/**
 * Checks that the periodic scheduler stops triggering once {@code maxConcurrentAttempts}
 * checkpoints have been triggered, and that acknowledging one of them allows exactly one
 * further checkpoint to be triggered.
 *
 * <p>{@code numCalls} counts every {@code triggerCheckpoint} and {@code notifyCheckpointComplete}
 * invocation on the trigger vertex's current execution attempt. Both polling loops are
 * bounded by a 60 second timeout.
 *
 * @param maxConcurrentAttempts the maximum number of concurrent checkpoints to configure
 */
private void testMaxConcurrentAttempts(int maxConcurrentAttempts) {
	try {
		final JobID jid = new JobID();

		// create some mock execution vertices and trigger some checkpoint
		final ExecutionAttemptID triggerAttemptID = new ExecutionAttemptID();
		final ExecutionAttemptID ackAttemptID = new ExecutionAttemptID();
		final ExecutionAttemptID commitAttemptID = new ExecutionAttemptID();

		ExecutionVertex triggerVertex = mockExecutionVertex(triggerAttemptID);
		ExecutionVertex ackVertex = mockExecutionVertex(ackAttemptID);
		ExecutionVertex commitVertex = mockExecutionVertex(commitAttemptID);

		final AtomicInteger numCalls = new AtomicInteger();

		final Execution execution = triggerVertex.getCurrentExecutionAttempt();

		// count every trigger / completion notification that reaches the trigger vertex
		doAnswer(invocation -> {
			numCalls.incrementAndGet();
			return null;
		}).when(execution).triggerCheckpoint(anyLong(), anyLong(), any(CheckpointOptions.class));

		doAnswer(invocation -> {
			numCalls.incrementAndGet();
			return null;
		}).when(execution).notifyCheckpointComplete(anyLong(), anyLong());

		CheckpointCoordinator coord = new CheckpointCoordinator(
			jid,
			10,        // periodic interval is 10 ms
			200000,    // timeout is very long (200 s)
			0L,        // no extra delay
			maxConcurrentAttempts,
			CheckpointRetentionPolicy.NEVER_RETAIN_AFTER_TERMINATION,
			new ExecutionVertex[] { triggerVertex },
			new ExecutionVertex[] { ackVertex },
			new ExecutionVertex[] { commitVertex },
			new StandaloneCheckpointIDCounter(),
			new StandaloneCompletedCheckpointStore(2),
			new MemoryStateBackend(),
			Executors.directExecutor(),
			SharedStateRegistry.DEFAULT_FACTORY);

		coord.startCheckpointScheduler();

		// after a while, there should be exactly as many checkpoints
		// as concurrently permitted
		long now = System.currentTimeMillis();
		long timeout = now + 60000;
		long minDuration = now + 100;
		do {
			Thread.sleep(20);
		}
		while ((now = System.currentTimeMillis()) < minDuration ||
				(numCalls.get() < maxConcurrentAttempts && now < timeout));

		assertEquals(maxConcurrentAttempts, numCalls.get());

		verify(triggerVertex.getCurrentExecutionAttempt(), times(maxConcurrentAttempts))
				.triggerCheckpoint(anyLong(), anyLong(), any(CheckpointOptions.class));

		// now, once we acknowledge one checkpoint, it should trigger the next one
		coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, ackAttemptID, 1L));

		// this should have immediately triggered a new checkpoint
		timeout = System.currentTimeMillis() + 60000;
		do {
			Thread.sleep(20);
		}
		// read the clock on every iteration; comparing a stale timestamp against the
		// timeout would never end the loop if the additional checkpoint is not triggered
		while (numCalls.get() < maxConcurrentAttempts + 1 && System.currentTimeMillis() < timeout);

		assertEquals(maxConcurrentAttempts + 1, numCalls.get());

		// no further checkpoints should happen
		Thread.sleep(200);
		assertEquals(maxConcurrentAttempts + 1, numCalls.get());

		coord.shutdown(JobStatus.FINISHED);
	}
	catch (Exception e) {
		e.printStackTrace();
		fail(e.getMessage());
	}
}

Source File: CheckpointCoordinatorMasterHooksTest.java From flink with Apache License 2.0

4 votes

/**
 * Checks that every master hook registered on the coordinator is invoked once when a
 * checkpoint is triggered, and that the completed checkpoint then holds exactly two master
 * states: the serialized results of the two hooks that were stubbed to return state.
 */
@Test
public void testHooksAreCalledOnTrigger() throws Exception {
	// identifiers and raw state values of the two stateful hooks
	final String firstHookId = "id1";
	final String secondHookId = "id2";
	final String stringState = "the-test-string-state";
	final long longState = 987654321L;

	// byte representations the test later expects to find in the completed checkpoint
	final byte[] expectedStringBytes = new StringSerializer().serialize(stringState);
	final byte[] expectedLongBytes = new LongSerializer().serialize(longState);

	// first stateful hook: completes immediately with the string state
	final MasterTriggerRestoreHook<String> stringHook = mockGeneric(MasterTriggerRestoreHook.class);
	when(stringHook.getIdentifier()).thenReturn(firstHookId);
	when(stringHook.createCheckpointDataSerializer()).thenReturn(new StringSerializer());
	when(stringHook.triggerCheckpoint(anyLong(), anyLong(), any(Executor.class)))
			.thenReturn(CompletableFuture.completedFuture(stringState));

	// second stateful hook: completes immediately with the long state
	final MasterTriggerRestoreHook<Long> longHook = mockGeneric(MasterTriggerRestoreHook.class);
	when(longHook.getIdentifier()).thenReturn(secondHookId);
	when(longHook.createCheckpointDataSerializer()).thenReturn(new LongSerializer());
	when(longHook.triggerCheckpoint(anyLong(), anyLong(), any(Executor.class)))
			.thenReturn(CompletableFuture.completedFuture(longState));

	// third hook only has an identifier; its trigger call is left unstubbed
	final MasterTriggerRestoreHook<Void> hookWithoutState = mockGeneric(MasterTriggerRestoreHook.class);
	when(hookWithoutState.getIdentifier()).thenReturn("some-id");

	// build a coordinator around a single mocked vertex
	final JobID jobId = new JobID();
	final ExecutionAttemptID attemptId = new ExecutionAttemptID();
	final ExecutionVertex vertex = mockExecutionVertex(attemptId);
	final CheckpointCoordinator coordinator = instantiateCheckpointCoordinator(jobId, vertex);

	coordinator.addMasterHook(stringHook);
	coordinator.addMasterHook(hookWithoutState);
	coordinator.addMasterHook(longHook);

	// triggering must succeed and leave exactly one pending checkpoint
	assertTrue(coordinator.triggerCheckpoint(System.currentTimeMillis(), false));
	assertEquals(1, coordinator.getNumberOfPendingCheckpoints());

	// each hook, stateful or not, has been asked to trigger exactly once
	verify(stringHook, times(1)).triggerCheckpoint(anyLong(), anyLong(), any(Executor.class));
	verify(longHook, times(1)).triggerCheckpoint(anyLong(), anyLong(), any(Executor.class));
	verify(hookWithoutState, times(1)).triggerCheckpoint(anyLong(), anyLong(), any(Executor.class));

	// acknowledge the only pending checkpoint so that it completes
	final long pendingId = coordinator.getPendingCheckpoints().values().iterator().next().getCheckpointId();
	coordinator.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jobId, attemptId, pendingId), "Unknown location");
	assertEquals(0, coordinator.getNumberOfPendingCheckpoints());

	assertEquals(1, coordinator.getNumberOfRetainedSuccessfulCheckpoints());
	final CompletedCheckpoint completed = coordinator.getCheckpointStore().getLatestCheckpoint(false);

	// only the two hooks that returned state are expected to show up
	final Collection<MasterState> hookStates = completed.getMasterHookStates();
	assertEquals(2, hookStates.size());

	for (MasterState hookState : hookStates) {
		final String hookName = hookState.name();

		if (hookName.equals(firstHookId)) {
			assertArrayEquals(expectedStringBytes, hookState.bytes());
			assertEquals(StringSerializer.VERSION, hookState.version());
			continue;
		}

		if (hookName.equals(secondHookId)) {
			assertArrayEquals(expectedLongBytes, hookState.bytes());
			assertEquals(LongSerializer.VERSION, hookState.version());
			continue;
		}

		fail("unrecognized state name: " + hookName);
	}
}

Source File: CheckpointCoordinatorFailureTest.java From flink with Apache License 2.0

4 votes

/**
 * Tests that a failure while storing a completed checkpoint in the completed checkpoint store
 * will properly fail the originating pending checkpoint and clean up the completed checkpoint.
 *
 * <p>The coordinator is wired to a {@code FailingCompletedCheckpointStore}; the test expects the
 * acknowledgement that would complete the checkpoint to surface a {@link CheckpointException}
 * and all reported state handles to be discarded afterwards.
 */
@Test
public void testFailingCompletedCheckpointStoreAdd() throws Exception {
	final JobID jobId = new JobID();
	final ExecutionAttemptID attemptId = new ExecutionAttemptID();
	final ExecutionVertex onlyVertex = CheckpointCoordinatorTest.mockExecutionVertex(attemptId);

	final long triggerTime = 1L;

	final CheckpointFailureManager checkpointFailureManager =
		new CheckpointFailureManager(0, NoOpFailJobCall.INSTANCE);

	// coordinator configuration; the single vertex acts as trigger, ack and commit task
	final CheckpointCoordinatorConfiguration configuration = new CheckpointCoordinatorConfiguration(
		600000,
		600000,
		0,
		Integer.MAX_VALUE,
		CheckpointRetentionPolicy.NEVER_RETAIN_AFTER_TERMINATION,
		true,
		false,
		0);
	final CheckpointCoordinator coordinator = new CheckpointCoordinator(
		jobId,
		configuration,
		new ExecutionVertex[]{onlyVertex},
		new ExecutionVertex[]{onlyVertex},
		new ExecutionVertex[]{onlyVertex},
		new StandaloneCheckpointIDCounter(),
		new FailingCompletedCheckpointStore(),
		new MemoryStateBackend(),
		Executors.directExecutor(),
		SharedStateRegistry.DEFAULT_FACTORY,
		checkpointFailureManager);

	// trigger one checkpoint and validate the initial state
	coordinator.triggerCheckpoint(triggerTime, false);
	assertEquals(1, coordinator.getNumberOfPendingCheckpoints());

	final PendingCheckpoint pending = coordinator.getPendingCheckpoints().values().iterator().next();
	assertFalse(pending.isDiscarded());

	final long pendingId = coordinator.getPendingCheckpoints().keySet().iterator().next();

	// mocked state handles so that their disposal can be verified below
	final KeyedStateHandle managedKeyed = mock(KeyedStateHandle.class);
	final KeyedStateHandle rawKeyed = mock(KeyedStateHandle.class);
	final OperatorStateHandle managedOperator = mock(OperatorStreamStateHandle.class);
	final OperatorStateHandle rawOperator = mock(OperatorStreamStateHandle.class);

	final OperatorSubtaskState reportedState = spy(
		new OperatorSubtaskState(managedOperator, rawOperator, managedKeyed, rawKeyed));

	final TaskStateSnapshot snapshot = spy(new TaskStateSnapshot());
	snapshot.putSubtaskStateByOperatorID(new OperatorID(), reportedState);

	// make the snapshot hand out the spied state for the operator id derived from the vertex
	when(snapshot.getSubtaskStateByOperatorID(OperatorID.fromJobVertexID(onlyVertex.getJobvertexId())))
		.thenReturn(reportedState);

	final AcknowledgeCheckpoint ack =
		new AcknowledgeCheckpoint(jobId, attemptId, pendingId, new CheckpointMetrics(), snapshot);

	try {
		coordinator.receiveAcknowledgeMessage(ack, "Unknown location");
		fail("Expected a checkpoint exception because the completed checkpoint store could not " +
			"store the completed checkpoint.");
	} catch (CheckpointException expected) {
		// this is the outcome the test is looking for
	}

	// the pending checkpoint must be discarded once completing it has failed
	assertTrue(pending.isDiscarded());

	// the reported subtask state and each of its four handles must have been discarded, too
	verify(reportedState).discardState();
	verify(reportedState.getManagedOperatorState().iterator().next()).discardState();
	verify(reportedState.getRawOperatorState().iterator().next()).discardState();
	verify(reportedState.getManagedKeyedState().iterator().next()).discardState();
	verify(reportedState.getRawKeyedState().iterator().next()).discardState();
}

Source File: CheckpointCoordinatorTest.java From flink with Apache License 2.0

4 votes

/**
 * Triggers one checkpoint on the given coordinator and acknowledges it for every subtask of
 * {@code jobVertex1}, reporting one incremental keyed state handle per subtask.
 *
 * <p>Each handle carries a private entry and a shared entry named after
 * {@code cpSequenceNumber}. For every sequence number above zero it additionally references
 * the previous sequence number's shared entry through a placeholder handle.
 *
 * @param jid job id put into the acknowledge messages
 * @param coord coordinator that triggers the checkpoint and receives the acknowledgements
 * @param jobVertex1 job vertex whose subtasks acknowledge the checkpoint
 * @param keyGroupPartitions1 key-group range per subtask, looked up by subtask index
 * @param timestamp timestamp passed to the checkpoint trigger
 * @param cpSequenceNumber position of this checkpoint in the sequence (the first one is 0);
 *                         used to name the shared state entries
 * @throws Exception if triggering or acknowledging the checkpoint fails
 */
private void performIncrementalCheckpoint(
	JobID jid,
	CheckpointCoordinator coord,
	ExecutionJobVertex jobVertex1,
	List<KeyGroupRange> keyGroupPartitions1,
	long timestamp,
	int cpSequenceNumber) throws Exception {

	// trigger the checkpoint; exactly one must be pending afterwards
	coord.triggerCheckpoint(timestamp, false);

	assertTrue(coord.getPendingCheckpoints().keySet().size() == 1);
	final long pendingId = Iterables.getOnlyElement(coord.getPendingCheckpoints().keySet());

	for (int subtask = 0; subtask < jobVertex1.getParallelism(); subtask++) {

		final KeyGroupRange subtaskRange = keyGroupPartitions1.get(subtask);

		// state that belongs to this checkpoint alone
		final Map<StateHandleID, StreamStateHandle> privateHandles = new HashMap<>();
		final StreamStateHandle privateHandle = spy(new ByteStreamStateHandle("private-1", new byte[]{'p'}));
		privateHandles.put(new StateHandleID("private-1"), privateHandle);

		final Map<StateHandleID, StreamStateHandle> sharedHandles = new HashMap<>();

		// every checkpoint but the first re-uses the shared state of its predecessor
		if (cpSequenceNumber > 0) {
			final String previousSharedName = "shared-" + (cpSequenceNumber - 1);
			sharedHandles.put(
				new StateHandleID(previousSharedName),
				spy(new PlaceholderStreamStateHandle()));
		}

		// shared state newly created by this checkpoint
		final String currentSharedName = "shared-" + cpSequenceNumber;
		sharedHandles.put(
			new StateHandleID(currentSharedName),
			spy(new ByteStreamStateHandle(currentSharedName + "-" + subtaskRange, new byte[]{'s'})));

		final StreamStateHandle metaHandle = spy(new ByteStreamStateHandle("meta", new byte[]{'m'}));

		final IncrementalRemoteKeyedStateHandle incrementalHandle = spy(
			new IncrementalRemoteKeyedStateHandle(
				new UUID(42L, 42L),
				subtaskRange,
				pendingId,
				sharedHandles,
				privateHandles,
				metaHandle));

		// only the managed keyed state slot is populated
		final OperatorSubtaskState subtaskState = spy(
			new OperatorSubtaskState(
				StateObjectCollection.empty(),
				StateObjectCollection.empty(),
				StateObjectCollection.singleton(incrementalHandle),
				StateObjectCollection.empty()));

		final Map<OperatorID, OperatorSubtaskState> statesByOperator = new HashMap<>();
		statesByOperator.put(jobVertex1.getOperatorIDs().get(0), subtaskState);

		final TaskStateSnapshot snapshot = new TaskStateSnapshot(statesByOperator);

		// acknowledge on behalf of the subtask's current execution attempt
		final AcknowledgeCheckpoint ack = new AcknowledgeCheckpoint(
			jid,
			jobVertex1.getTaskVertices()[subtask].getCurrentExecutionAttempt().getAttemptId(),
			pendingId,
			new CheckpointMetrics(),
			snapshot);

		coord.receiveAcknowledgeMessage(ack, TASK_MANAGER_LOCATION_INFO);
	}
}

Source File: CheckpointCoordinatorTest.java From flink with Apache License 2.0

4 votes

/**
 * Verifies that the limit on concurrently pending checkpoints is respected when a later
 * checkpoint is acknowledged before an earlier one.
 *
 * <p>With a limit of two, the test waits until checkpoints 1 and 2 are pending, acknowledges
 * checkpoint 2, and then expects checkpoints 3 and 4 to become the pending ones.
 *
 * <p>The test polls in real time (20 ms steps, bounded by a 60 s deadline) because the
 * checkpoints are triggered by the coordinator's periodic scheduler.
 */
@Test
public void testMaxConcurrentAttempsWithSubsumption() {
	try {
		final int maxConcurrentAttempts = 2;
		final JobID jid = new JobID();

		// create some mock execution vertices and trigger some checkpoint
		final ExecutionAttemptID triggerAttemptID = new ExecutionAttemptID();
		final ExecutionAttemptID ackAttemptID = new ExecutionAttemptID();
		final ExecutionAttemptID commitAttemptID = new ExecutionAttemptID();

		ExecutionVertex triggerVertex = mockExecutionVertex(triggerAttemptID);
		ExecutionVertex ackVertex = mockExecutionVertex(ackAttemptID);
		ExecutionVertex commitVertex = mockExecutionVertex(commitAttemptID);

		CheckpointCoordinatorConfiguration chkConfig = new CheckpointCoordinatorConfiguration(
			10,        // periodic interval is 10 ms
			200000,    // timeout is very long (200 s)
			0L,        // no extra delay
			maxConcurrentAttempts, // max two concurrent checkpoints
			CheckpointRetentionPolicy.NEVER_RETAIN_AFTER_TERMINATION,
			true,
			false,
			0);
		CheckpointCoordinator coord = new CheckpointCoordinator(
			jid,
			chkConfig,
			new ExecutionVertex[] { triggerVertex },
			new ExecutionVertex[] { ackVertex },
			new ExecutionVertex[] { commitVertex },
			new StandaloneCheckpointIDCounter(),
			new StandaloneCompletedCheckpointStore(2),
			new MemoryStateBackend(),
			Executors.directExecutor(),
			SharedStateRegistry.DEFAULT_FACTORY,
			failureManager);

		coord.startCheckpointScheduler();

		try {
			// after a while, there should be exactly as many checkpoints
			// as concurrently permitted; wait at least 100 ms so that the
			// scheduler gets a chance to (wrongly) trigger more than allowed
			long now = System.currentTimeMillis();
			long timeout = now + 60000;
			long minDuration = now + 100;
			do {
				Thread.sleep(20);
			}
			while ((now = System.currentTimeMillis()) < minDuration ||
					(coord.getNumberOfPendingCheckpoints() < maxConcurrentAttempts && now < timeout));

			// validate that the pending checkpoints are there
			assertEquals(maxConcurrentAttempts, coord.getNumberOfPendingCheckpoints());
			assertNotNull(coord.getPendingCheckpoints().get(1L));
			assertNotNull(coord.getPendingCheckpoints().get(2L));

			// now we acknowledge the second checkpoint, which should subsume the first checkpoint
			// and allow two more checkpoints to be triggered
			coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, ackAttemptID, 2L), TASK_MANAGER_LOCATION_INFO);

			// after a while, there should be the new checkpoints
			final long newTimeout = System.currentTimeMillis() + 60000;
			do {
				Thread.sleep(20);
			}
			while (coord.getPendingCheckpoints().get(4L) == null &&
					System.currentTimeMillis() < newTimeout);

			// do the final check
			assertEquals(maxConcurrentAttempts, coord.getNumberOfPendingCheckpoints());
			assertNotNull(coord.getPendingCheckpoints().get(3L));
			assertNotNull(coord.getPendingCheckpoints().get(4L));
		}
		finally {
			// shut the coordinator down even when an assertion above failed, so that
			// a failing run does not leave the started coordinator behind
			coord.shutdown(JobStatus.FINISHED);
		}
	}
	catch (InterruptedException e) {
		// restore the interrupt flag for the test runner before reporting the failure
		Thread.currentThread().interrupt();
		throw new AssertionError("Interrupted while waiting for checkpoints to be triggered", e);
	}
	catch (Exception e) {
		// keep the original exception as the cause so that the failure report shows its
		// type and stack trace, also when the exception carries no message
		throw new AssertionError("Unexpected exception: " + e, e);
	}
}

org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint Java Examples