Java Code Examples for org.apache.flink.runtime.jobgraph.JobStatus#SUSPENDED
The following examples show how to use
org.apache.flink.runtime.jobgraph.JobStatus#SUSPENDED.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: CompletedCheckpoint.java From Flink-CEPplus with Apache License 2.0 | 5 votes |
/**
 * Decides, based on the job's status at shutdown and the checkpoint properties, whether
 * this checkpoint is discarded, and discards it if so.
 *
 * <p>The checkpoint is discarded when the given status is one of FINISHED, CANCELED, FAILED
 * or SUSPENDED and the matching {@code discardOnJob...()} property is set. Otherwise an
 * info message is logged and the checkpoint is left untouched.
 *
 * @param jobStatus the job status at shutdown
 * @return {@code true} if the checkpoint was discarded, {@code false} if it was kept
 * @throws Exception propagated from the discard operation
 */
public boolean discardOnShutdown(JobStatus jobStatus) throws Exception {
    // Each status is paired with its own property; the property getter is only consulted
    // when the status matches (short-circuit evaluation, same order as before).
    final boolean shouldDiscard =
        (jobStatus == JobStatus.FINISHED && props.discardOnJobFinished())
            || (jobStatus == JobStatus.CANCELED && props.discardOnJobCancelled())
            || (jobStatus == JobStatus.FAILED && props.discardOnJobFailed())
            || (jobStatus == JobStatus.SUSPENDED && props.discardOnJobSuspended());

    if (!shouldDiscard) {
        LOG.info("Checkpoint with ID {} at '{}' not discarded.", checkpointID, externalPointer);
        return false;
    }

    doDiscard();
    return true;
}
Example 2
Source File: CompletedCheckpoint.java From flink with Apache License 2.0 | 5 votes |
/**
 * Discards this checkpoint on shutdown if the checkpoint properties ask for it.
 *
 * <p>For each of the statuses FINISHED, CANCELED, FAILED and SUSPENDED there is a dedicated
 * {@code discardOnJob...()} property; the checkpoint is discarded only if the property that
 * belongs to the given status is set. In every other case the checkpoint is kept and an
 * info message is logged.
 *
 * @param jobStatus the job status at shutdown
 * @return {@code true} if the checkpoint was discarded, {@code false} if it was kept
 * @throws Exception propagated from the discard operation
 */
public boolean discardOnShutdown(JobStatus jobStatus) throws Exception {
    // Evaluated left to right with short-circuiting, so at most the property getter of the
    // matching status is invoked.
    final boolean discardRequested =
        (jobStatus == JobStatus.FINISHED && props.discardOnJobFinished())
            || (jobStatus == JobStatus.CANCELED && props.discardOnJobCancelled())
            || (jobStatus == JobStatus.FAILED && props.discardOnJobFailed())
            || (jobStatus == JobStatus.SUSPENDED && props.discardOnJobSuspended());

    if (discardRequested) {
        doDiscard();
    } else {
        LOG.info("Checkpoint with ID {} at '{}' not discarded.", checkpointID, externalPointer);
    }

    return discardRequested;
}
Example 3
Source File: ExecutionGraph.java From Flink-CEPplus with Apache License 2.0 | 4 votes |
/** * Fails the execution graph globally. This failure will not be recovered by a specific * failover strategy, but results in a full restart of all tasks. * * <p>This global failure is meant to be triggered in cases where the consistency of the * execution graph' state cannot be guaranteed any more (for example when catching unexpected * exceptions that indicate a bug or an unexpected call race), and where a full restart is the * safe way to get consistency back. * * @param t The exception that caused the failure. */ public void failGlobal(Throwable t) { assertRunningInJobMasterMainThread(); while (true) { JobStatus current = state; // stay in these states if (current == JobStatus.FAILING || current == JobStatus.SUSPENDED || current.isGloballyTerminalState()) { return; } else if (transitionState(current, JobStatus.FAILING, t)) { initFailureCause(t); // make sure no concurrent local or global actions interfere with the failover final long globalVersionForRestart = incrementGlobalModVersion(); final CompletableFuture<Void> ongoingSchedulingFuture = schedulingFuture; // cancel ongoing scheduling action if (ongoingSchedulingFuture != null) { ongoingSchedulingFuture.cancel(false); } // we build a future that is complete once all vertices have reached a terminal state final ArrayList<CompletableFuture<?>> futures = new ArrayList<>(verticesInCreationOrder.size()); // cancel all tasks (that still need cancelling) for (ExecutionJobVertex ejv : verticesInCreationOrder) { futures.add(ejv.cancelWithFuture()); } final ConjunctFuture<Void> allTerminal = FutureUtils.waitForAll(futures); allTerminal.whenComplete( (Void ignored, Throwable throwable) -> { if (throwable != null) { transitionState( JobStatus.FAILING, JobStatus.FAILED, new FlinkException("Could not cancel all execution job vertices properly.", throwable)); } else { allVerticesInTerminalState(globalVersionForRestart); } }); return; } // else: concurrent change to execution state, retry } }
Example 4
Source File: ExecutionGraph.java From Flink-CEPplus with Apache License 2.0 | 4 votes |
public void restart(long expectedGlobalVersion) { assertRunningInJobMasterMainThread(); try { synchronized (progressLock) { // check the global version to see whether this recovery attempt is still valid if (globalModVersion != expectedGlobalVersion) { LOG.info("Concurrent full restart subsumed this restart."); return; } final JobStatus current = state; if (current == JobStatus.CANCELED) { LOG.info("Canceled job during restart. Aborting restart."); return; } else if (current == JobStatus.FAILED) { LOG.info("Failed job during restart. Aborting restart."); return; } else if (current == JobStatus.SUSPENDED) { LOG.info("Suspended job during restart. Aborting restart."); return; } else if (current != JobStatus.RESTARTING) { throw new IllegalStateException("Can only restart job from state restarting."); } this.currentExecutions.clear(); final Collection<CoLocationGroup> colGroups = new HashSet<>(); final long resetTimestamp = System.currentTimeMillis(); for (ExecutionJobVertex jv : this.verticesInCreationOrder) { CoLocationGroup cgroup = jv.getCoLocationGroup(); if (cgroup != null && !colGroups.contains(cgroup)){ cgroup.resetConstraints(); colGroups.add(cgroup); } jv.resetForNewExecution(resetTimestamp, expectedGlobalVersion); } for (int i = 0; i < stateTimestamps.length; i++) { if (i != JobStatus.RESTARTING.ordinal()) { // Only clear the non restarting state in order to preserve when the job was // restarted. This is needed for the restarting time gauge stateTimestamps[i] = 0; } } transitionState(JobStatus.RESTARTING, JobStatus.CREATED); // if we have checkpointed state, reload it into the executions if (checkpointCoordinator != null) { checkpointCoordinator.restoreLatestCheckpointedState(getAllVertices(), false, false); } } scheduleForExecution(); } catch (Throwable t) { LOG.warn("Failed to restart the job.", t); failGlobal(t); } }
Example 5
Source File: CompletedCheckpointTest.java From Flink-CEPplus with Apache License 2.0 | 4 votes |
/** * Tests that the garbage collection properties are respected when shutting down. */ @Test public void testCleanUpOnShutdown() throws Exception { JobStatus[] terminalStates = new JobStatus[] { JobStatus.FINISHED, JobStatus.CANCELED, JobStatus.FAILED, JobStatus.SUSPENDED }; for (JobStatus status : terminalStates) { OperatorState state = mock(OperatorState.class); Map<OperatorID, OperatorState> operatorStates = new HashMap<>(); operatorStates.put(new OperatorID(), state); EmptyStreamStateHandle retainedHandle = new EmptyStreamStateHandle(); TestCompletedCheckpointStorageLocation retainedLocation = new TestCompletedCheckpointStorageLocation(retainedHandle, "ptr"); // Keep CheckpointProperties retainProps = new CheckpointProperties(false, CheckpointType.CHECKPOINT, false, false, false, false, false); CompletedCheckpoint checkpoint = new CompletedCheckpoint( new JobID(), 0, 0, 1, new HashMap<>(operatorStates), Collections.emptyList(), retainProps, retainedLocation); checkpoint.discardOnShutdown(status); verify(state, times(0)).discardState(); assertFalse(retainedLocation.isDisposed()); assertFalse(retainedHandle.isDisposed()); // Discard EmptyStreamStateHandle discardHandle = new EmptyStreamStateHandle(); TestCompletedCheckpointStorageLocation discardLocation = new TestCompletedCheckpointStorageLocation(discardHandle, "ptr"); // Keep CheckpointProperties discardProps = new CheckpointProperties(false, CheckpointType.CHECKPOINT, true, true, true, true, true); checkpoint = new CompletedCheckpoint( new JobID(), 0, 0, 1, new HashMap<>(operatorStates), Collections.emptyList(), discardProps, discardLocation); checkpoint.discardOnShutdown(status); verify(state, times(1)).discardState(); assertTrue(discardLocation.isDisposed()); assertTrue(discardHandle.isDisposed()); } }
Example 6
Source File: ExecutionGraph.java From flink with Apache License 2.0 | 4 votes |
/** * Fails the execution graph globally. This failure will not be recovered by a specific * failover strategy, but results in a full restart of all tasks. * * <p>This global failure is meant to be triggered in cases where the consistency of the * execution graph' state cannot be guaranteed any more (for example when catching unexpected * exceptions that indicate a bug or an unexpected call race), and where a full restart is the * safe way to get consistency back. * * @param t The exception that caused the failure. */ public void failGlobal(Throwable t) { assertRunningInJobMasterMainThread(); while (true) { JobStatus current = state; // stay in these states if (current == JobStatus.FAILING || current == JobStatus.SUSPENDED || current.isGloballyTerminalState()) { return; } else if (transitionState(current, JobStatus.FAILING, t)) { initFailureCause(t); // make sure no concurrent local or global actions interfere with the failover final long globalVersionForRestart = incrementGlobalModVersion(); final CompletableFuture<Void> ongoingSchedulingFuture = schedulingFuture; // cancel ongoing scheduling action if (ongoingSchedulingFuture != null) { ongoingSchedulingFuture.cancel(false); } // we build a future that is complete once all vertices have reached a terminal state final ConjunctFuture<Void> allTerminal = cancelVerticesAsync(); FutureUtils.assertNoException(allTerminal.handle( (Void ignored, Throwable throwable) -> { if (throwable != null) { transitionState( JobStatus.FAILING, JobStatus.FAILED, new FlinkException("Could not cancel all execution job vertices properly.", throwable)); } else { allVerticesInTerminalState(globalVersionForRestart); } return null; })); return; } // else: concurrent change to execution state, retry } }
Example 7
Source File: ExecutionGraph.java From flink with Apache License 2.0 | 4 votes |
public void restart(long expectedGlobalVersion) { assertRunningInJobMasterMainThread(); try { synchronized (progressLock) { // check the global version to see whether this recovery attempt is still valid if (globalModVersion != expectedGlobalVersion) { LOG.info("Concurrent full restart subsumed this restart."); return; } final JobStatus current = state; if (current == JobStatus.CANCELED) { LOG.info("Canceled job during restart. Aborting restart."); return; } else if (current == JobStatus.FAILED) { LOG.info("Failed job during restart. Aborting restart."); return; } else if (current == JobStatus.SUSPENDED) { LOG.info("Suspended job during restart. Aborting restart."); return; } else if (current != JobStatus.RESTARTING) { throw new IllegalStateException("Can only restart job from state restarting."); } this.currentExecutions.clear(); final Collection<CoLocationGroup> colGroups = new HashSet<>(); final long resetTimestamp = System.currentTimeMillis(); for (ExecutionJobVertex jv : this.verticesInCreationOrder) { CoLocationGroup cgroup = jv.getCoLocationGroup(); if (cgroup != null && !colGroups.contains(cgroup)){ cgroup.resetConstraints(); colGroups.add(cgroup); } jv.resetForNewExecution(resetTimestamp, expectedGlobalVersion); } for (int i = 0; i < stateTimestamps.length; i++) { if (i != JobStatus.RESTARTING.ordinal()) { // Only clear the non restarting state in order to preserve when the job was // restarted. This is needed for the restarting time gauge stateTimestamps[i] = 0; } } transitionState(JobStatus.RESTARTING, JobStatus.CREATED); // if we have checkpointed state, reload it into the executions if (checkpointCoordinator != null) { checkpointCoordinator.restoreLatestCheckpointedState(getAllVertices(), false, false); } } scheduleForExecution(); } // TODO remove the catch block if we align the schematics to not fail global within the restarter. catch (Throwable t) { LOG.warn("Failed to restart the job.", t); failGlobal(t); } }
Example 8
Source File: CompletedCheckpointTest.java From flink with Apache License 2.0 | 4 votes |
/** * Tests that the garbage collection properties are respected when shutting down. */ @Test public void testCleanUpOnShutdown() throws Exception { JobStatus[] terminalStates = new JobStatus[] { JobStatus.FINISHED, JobStatus.CANCELED, JobStatus.FAILED, JobStatus.SUSPENDED }; for (JobStatus status : terminalStates) { OperatorState state = mock(OperatorState.class); Map<OperatorID, OperatorState> operatorStates = new HashMap<>(); operatorStates.put(new OperatorID(), state); EmptyStreamStateHandle retainedHandle = new EmptyStreamStateHandle(); TestCompletedCheckpointStorageLocation retainedLocation = new TestCompletedCheckpointStorageLocation(retainedHandle, "ptr"); // Keep CheckpointProperties retainProps = new CheckpointProperties(false, CheckpointType.CHECKPOINT, false, false, false, false, false); CompletedCheckpoint checkpoint = new CompletedCheckpoint( new JobID(), 0, 0, 1, new HashMap<>(operatorStates), Collections.emptyList(), retainProps, retainedLocation); checkpoint.discardOnShutdown(status); verify(state, times(0)).discardState(); assertFalse(retainedLocation.isDisposed()); assertFalse(retainedHandle.isDisposed()); // Discard EmptyStreamStateHandle discardHandle = new EmptyStreamStateHandle(); TestCompletedCheckpointStorageLocation discardLocation = new TestCompletedCheckpointStorageLocation(discardHandle, "ptr"); // Keep CheckpointProperties discardProps = new CheckpointProperties(false, CheckpointType.CHECKPOINT, true, true, true, true, true); checkpoint = new CompletedCheckpoint( new JobID(), 0, 0, 1, new HashMap<>(operatorStates), Collections.emptyList(), discardProps, discardLocation); checkpoint.discardOnShutdown(status); verify(state, times(1)).discardState(); assertTrue(discardLocation.isDisposed()); assertTrue(discardHandle.isDisposed()); } }