Java Code Examples for org.apache.flink.runtime.jobgraph.JobStatus#RESTARTING
The following examples show how to use
org.apache.flink.runtime.jobgraph.JobStatus#RESTARTING .
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: ExecutionGraph.java From Flink-CEPplus with Apache License 2.0 | 4 votes |
public void cancel() { assertRunningInJobMasterMainThread(); while (true) { JobStatus current = state; if (current == JobStatus.RUNNING || current == JobStatus.CREATED) { if (transitionState(current, JobStatus.CANCELLING)) { // make sure no concurrent local actions interfere with the cancellation final long globalVersionForRestart = incrementGlobalModVersion(); final CompletableFuture<Void> ongoingSchedulingFuture = schedulingFuture; // cancel ongoing scheduling action if (ongoingSchedulingFuture != null) { ongoingSchedulingFuture.cancel(false); } final ArrayList<CompletableFuture<?>> futures = new ArrayList<>(verticesInCreationOrder.size()); // cancel all tasks (that still need cancelling) for (ExecutionJobVertex ejv : verticesInCreationOrder) { futures.add(ejv.cancelWithFuture()); } // we build a future that is complete once all vertices have reached a terminal state final ConjunctFuture<Void> allTerminal = FutureUtils.waitForAll(futures); allTerminal.whenComplete( (Void value, Throwable throwable) -> { if (throwable != null) { transitionState( JobStatus.CANCELLING, JobStatus.FAILED, new FlinkException( "Could not cancel job " + getJobName() + " because not all execution job vertices could be cancelled.", throwable)); } else { // cancellations may currently be overridden by failures which trigger // restarts, so we need to pass a proper restart global version here allVerticesInTerminalState(globalVersionForRestart); } }); return; } } // Executions are being canceled. Go into cancelling and wait for // all vertices to be in their final state. else if (current == JobStatus.FAILING) { if (transitionState(current, JobStatus.CANCELLING)) { return; } } // All vertices have been cancelled and it's safe to directly go // into the canceled state. else if (current == JobStatus.RESTARTING) { synchronized (progressLock) { if (transitionState(current, JobStatus.CANCELED)) { onTerminalState(JobStatus.CANCELED); LOG.info("Canceled during restart."); return; } } } else { // no need to treat other states return; } } }
Example 2
Source File: ExecutionGraph.java From Flink-CEPplus with Apache License 2.0 | 4 votes |
public void restart(long expectedGlobalVersion) { assertRunningInJobMasterMainThread(); try { synchronized (progressLock) { // check the global version to see whether this recovery attempt is still valid if (globalModVersion != expectedGlobalVersion) { LOG.info("Concurrent full restart subsumed this restart."); return; } final JobStatus current = state; if (current == JobStatus.CANCELED) { LOG.info("Canceled job during restart. Aborting restart."); return; } else if (current == JobStatus.FAILED) { LOG.info("Failed job during restart. Aborting restart."); return; } else if (current == JobStatus.SUSPENDED) { LOG.info("Suspended job during restart. Aborting restart."); return; } else if (current != JobStatus.RESTARTING) { throw new IllegalStateException("Can only restart job from state restarting."); } this.currentExecutions.clear(); final Collection<CoLocationGroup> colGroups = new HashSet<>(); final long resetTimestamp = System.currentTimeMillis(); for (ExecutionJobVertex jv : this.verticesInCreationOrder) { CoLocationGroup cgroup = jv.getCoLocationGroup(); if (cgroup != null && !colGroups.contains(cgroup)){ cgroup.resetConstraints(); colGroups.add(cgroup); } jv.resetForNewExecution(resetTimestamp, expectedGlobalVersion); } for (int i = 0; i < stateTimestamps.length; i++) { if (i != JobStatus.RESTARTING.ordinal()) { // Only clear the non restarting state in order to preserve when the job was // restarted. This is needed for the restarting time gauge stateTimestamps[i] = 0; } } transitionState(JobStatus.RESTARTING, JobStatus.CREATED); // if we have checkpointed state, reload it into the executions if (checkpointCoordinator != null) { checkpointCoordinator.restoreLatestCheckpointedState(getAllVertices(), false, false); } } scheduleForExecution(); } catch (Throwable t) { LOG.warn("Failed to restart the job.", t); failGlobal(t); } }
Example 3
Source File: ExecutionGraph.java From Flink-CEPplus with Apache License 2.0 | 4 votes |
/** * Try to restart the job. If we cannot restart the job (e.g. no more restarts allowed), then * try to fail the job. This operation is only permitted if the current state is FAILING or * RESTARTING. * * @return true if the operation could be executed; false if a concurrent job status change occurred */ private boolean tryRestartOrFail(long globalModVersionForRestart) { JobStatus currentState = state; if (currentState == JobStatus.FAILING || currentState == JobStatus.RESTARTING) { final Throwable failureCause = this.failureCause; synchronized (progressLock) { if (LOG.isDebugEnabled()) { LOG.debug("Try to restart or fail the job {} ({}) if no longer possible.", getJobName(), getJobID(), failureCause); } else { LOG.info("Try to restart or fail the job {} ({}) if no longer possible.", getJobName(), getJobID()); } final boolean isFailureCauseAllowingRestart = !(failureCause instanceof SuppressRestartsException); final boolean isRestartStrategyAllowingRestart = restartStrategy.canRestart(); boolean isRestartable = isFailureCauseAllowingRestart && isRestartStrategyAllowingRestart; if (isRestartable && transitionState(currentState, JobStatus.RESTARTING)) { LOG.info("Restarting the job {} ({}).", getJobName(), getJobID()); RestartCallback restarter = new ExecutionGraphRestartCallback(this, globalModVersionForRestart); restartStrategy.restart(restarter, getJobMasterMainThreadExecutor()); return true; } else if (!isRestartable && transitionState(currentState, JobStatus.FAILED, failureCause)) { final String cause1 = isFailureCauseAllowingRestart ? null : "a type of SuppressRestartsException was thrown"; final String cause2 = isRestartStrategyAllowingRestart ? null : "the restart strategy prevented it"; LOG.info("Could not restart the job {} ({}) because {}.", getJobName(), getJobID(), StringUtils.concatenateWithAnd(cause1, cause2), failureCause); onTerminalState(JobStatus.FAILED); return true; } else { // we must have changed the state concurrently, thus we cannot complete this operation return false; } } } else { // this operation is only allowed in the state FAILING or RESTARTING return false; } }
Example 4
Source File: ExecutionGraph.java From flink with Apache License 2.0 | 4 votes |
public void cancel() { assertRunningInJobMasterMainThread(); while (true) { JobStatus current = state; if (current == JobStatus.RUNNING || current == JobStatus.CREATED) { if (transitionState(current, JobStatus.CANCELLING)) { // make sure no concurrent local actions interfere with the cancellation final long globalVersionForRestart = incrementGlobalModVersion(); final CompletableFuture<Void> ongoingSchedulingFuture = schedulingFuture; // cancel ongoing scheduling action if (ongoingSchedulingFuture != null) { ongoingSchedulingFuture.cancel(false); } final ConjunctFuture<Void> allTerminal = cancelVerticesAsync(); allTerminal.whenComplete( (Void value, Throwable throwable) -> { if (throwable != null) { transitionState( JobStatus.CANCELLING, JobStatus.FAILED, new FlinkException( "Could not cancel job " + getJobName() + " because not all execution job vertices could be cancelled.", throwable)); } else { // cancellations may currently be overridden by failures which trigger // restarts, so we need to pass a proper restart global version here allVerticesInTerminalState(globalVersionForRestart); } }); return; } } // Executions are being canceled. Go into cancelling and wait for // all vertices to be in their final state. else if (current == JobStatus.FAILING) { if (transitionState(current, JobStatus.CANCELLING)) { return; } } // All vertices have been cancelled and it's safe to directly go // into the canceled state. else if (current == JobStatus.RESTARTING) { synchronized (progressLock) { if (transitionState(current, JobStatus.CANCELED)) { onTerminalState(JobStatus.CANCELED); LOG.info("Canceled during restart."); return; } } } else { // no need to treat other states return; } } }
Example 5
Source File: ExecutionGraph.java From flink with Apache License 2.0 | 4 votes |
public void restart(long expectedGlobalVersion) { assertRunningInJobMasterMainThread(); try { synchronized (progressLock) { // check the global version to see whether this recovery attempt is still valid if (globalModVersion != expectedGlobalVersion) { LOG.info("Concurrent full restart subsumed this restart."); return; } final JobStatus current = state; if (current == JobStatus.CANCELED) { LOG.info("Canceled job during restart. Aborting restart."); return; } else if (current == JobStatus.FAILED) { LOG.info("Failed job during restart. Aborting restart."); return; } else if (current == JobStatus.SUSPENDED) { LOG.info("Suspended job during restart. Aborting restart."); return; } else if (current != JobStatus.RESTARTING) { throw new IllegalStateException("Can only restart job from state restarting."); } this.currentExecutions.clear(); final Collection<CoLocationGroup> colGroups = new HashSet<>(); final long resetTimestamp = System.currentTimeMillis(); for (ExecutionJobVertex jv : this.verticesInCreationOrder) { CoLocationGroup cgroup = jv.getCoLocationGroup(); if (cgroup != null && !colGroups.contains(cgroup)){ cgroup.resetConstraints(); colGroups.add(cgroup); } jv.resetForNewExecution(resetTimestamp, expectedGlobalVersion); } for (int i = 0; i < stateTimestamps.length; i++) { if (i != JobStatus.RESTARTING.ordinal()) { // Only clear the non restarting state in order to preserve when the job was // restarted. This is needed for the restarting time gauge stateTimestamps[i] = 0; } } transitionState(JobStatus.RESTARTING, JobStatus.CREATED); // if we have checkpointed state, reload it into the executions if (checkpointCoordinator != null) { checkpointCoordinator.restoreLatestCheckpointedState(getAllVertices(), false, false); } } scheduleForExecution(); } // TODO remove the catch block if we align the schematics to not fail global within the restarter. catch (Throwable t) { LOG.warn("Failed to restart the job.", t); failGlobal(t); } }
Example 6
Source File: ExecutionGraph.java From flink with Apache License 2.0 | 4 votes |
/** * Try to restart the job. If we cannot restart the job (e.g. no more restarts allowed), then * try to fail the job. This operation is only permitted if the current state is FAILING or * RESTARTING. * * @return true if the operation could be executed; false if a concurrent job status change occurred */ private boolean tryRestartOrFail(long globalModVersionForRestart) { JobStatus currentState = state; if (currentState == JobStatus.FAILING || currentState == JobStatus.RESTARTING) { final Throwable failureCause = this.failureCause; synchronized (progressLock) { if (LOG.isDebugEnabled()) { LOG.debug("Try to restart or fail the job {} ({}) if no longer possible.", getJobName(), getJobID(), failureCause); } else { LOG.info("Try to restart or fail the job {} ({}) if no longer possible.", getJobName(), getJobID()); } final boolean isFailureCauseAllowingRestart = !(failureCause instanceof SuppressRestartsException); final boolean isRestartStrategyAllowingRestart = restartStrategy.canRestart(); boolean isRestartable = isFailureCauseAllowingRestart && isRestartStrategyAllowingRestart; if (isRestartable && transitionState(currentState, JobStatus.RESTARTING)) { LOG.info("Restarting the job {} ({}).", getJobName(), getJobID()); RestartCallback restarter = new ExecutionGraphRestartCallback(this, globalModVersionForRestart); FutureUtils.assertNoException( restartStrategy .restart(restarter, getJobMasterMainThreadExecutor()) .exceptionally((throwable) -> { failGlobal(throwable); return null; })); return true; } else if (!isRestartable && transitionState(currentState, JobStatus.FAILED, failureCause)) { final String cause1 = isFailureCauseAllowingRestart ? null : "a type of SuppressRestartsException was thrown"; final String cause2 = isRestartStrategyAllowingRestart ? null : "the restart strategy prevented it"; LOG.info("Could not restart the job {} ({}) because {}.", getJobName(), getJobID(), StringUtils.concatenateWithAnd(cause1, cause2), failureCause); onTerminalState(JobStatus.FAILED); return true; } else { // we must have changed the state concurrently, thus we cannot complete this operation return false; } } } else { // this operation is only allowed in the state FAILING or RESTARTING return false; } }