Java Code Examples for org.apache.hadoop.yarn.api.records.ContainerStatus#getExitStatus()
The following examples show how to use
org.apache.hadoop.yarn.api.records.ContainerStatus#getExitStatus().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: TaskSchedulerManager.java From tez with Apache License 2.0 | 6 votes |
public synchronized void containerCompleted(int schedulerId, Object task, ContainerStatus containerStatus) { // SchedulerId isn't used here since no node updates are sent out // Inform the Containers about completion. AMContainer amContainer = appContext.getAllContainers().get(containerStatus.getContainerId()); if (amContainer != null) { String message = "Container completed. "; TaskAttemptTerminationCause errCause = TaskAttemptTerminationCause.CONTAINER_EXITED; int exitStatus = containerStatus.getExitStatus(); if (exitStatus == ContainerExitStatus.PREEMPTED) { message = "Container preempted externally. "; errCause = TaskAttemptTerminationCause.EXTERNAL_PREEMPTION; } else if (exitStatus == ContainerExitStatus.DISKS_FAILED) { message = "Container disk failed. "; errCause = TaskAttemptTerminationCause.NODE_DISK_ERROR; } else if (exitStatus != ContainerExitStatus.SUCCESS){ message = "Container failed, exitCode=" + exitStatus + ". "; } if (containerStatus.getDiagnostics() != null) { message += containerStatus.getDiagnostics(); } sendEvent(new AMContainerEventCompleted(amContainer.getContainerId(), exitStatus, message, errCause)); } }
Example 2
Source File: TaskSchedulerEventHandler.java From incubator-tez with Apache License 2.0 | 6 votes |
@Override public synchronized void containerCompleted(Object task, ContainerStatus containerStatus) { // Inform the Containers about completion. AMContainer amContainer = appContext.getAllContainers().get(containerStatus.getContainerId()); if (amContainer != null) { String message = null; int exitStatus = containerStatus.getExitStatus(); if (exitStatus == ContainerExitStatus.PREEMPTED) { message = "Container preempted externally. "; } else if (exitStatus == ContainerExitStatus.DISKS_FAILED) { message = "Container disk failed. "; } else { message = "Container failed. "; } if (containerStatus.getDiagnostics() != null) { message += containerStatus.getDiagnostics(); } sendEvent(new AMContainerEventCompleted(amContainer.getContainerId(), exitStatus, message)); } }
Example 3
Source File: YarnService.java From incubator-gobblin with Apache License 2.0 | 6 votes |
/**
 * Collects event metadata describing a container status.
 *
 * <p>The container id and state are always recorded. The exit status is recorded
 * only when it differs from {@code ContainerExitStatus.INVALID}, and the
 * diagnostics only when they are neither null nor empty.
 *
 * @param containerStatus the status to describe
 * @return the (still unbuilt) metadata builder
 */
private ImmutableMap.Builder<String, String> buildContainerStatusEventMetadata(ContainerStatus containerStatus) {
  ImmutableMap.Builder<String, String> metadata = new ImmutableMap.Builder<>();
  metadata
      .put(GobblinYarnMetricTagNames.CONTAINER_ID, containerStatus.getContainerId().toString())
      .put(GobblinYarnEventConstants.EventMetadata.CONTAINER_STATUS_CONTAINER_STATE,
          containerStatus.getState().toString());

  final int exitStatus = containerStatus.getExitStatus();
  if (exitStatus != ContainerExitStatus.INVALID) {
    metadata.put(GobblinYarnEventConstants.EventMetadata.CONTAINER_STATUS_EXIT_STATUS,
        String.valueOf(exitStatus));
  }

  final String diagnostics = containerStatus.getDiagnostics();
  if (!Strings.isNullOrEmpty(diagnostics)) {
    metadata.put(GobblinYarnEventConstants.EventMetadata.CONTAINER_STATUS_EXIT_DIAGNOSTICS, diagnostics);
  }
  return metadata;
}
Example 4
Source File: ContainerInfo.java From hadoop with Apache License 2.0 | 5 votes |
/**
 * Builds the container view from a container and its node manager context.
 *
 * @param nmContext  context supplying the node id
 * @param container  container whose id, status, state, user and resource are copied
 * @param requestUri base URI for the logs link; null is treated as ""
 * @param pathPrefix path prefix for the logs link; null is treated as ""
 */
public ContainerInfo(final Context nmContext, final Container container, String requestUri, String pathPrefix) {
  this.id = container.getContainerId().toString();
  this.nodeId = nmContext.getNodeId().toString();

  ContainerStatus snapshot = container.cloneAndGetContainerStatus();
  this.exitCode = snapshot.getExitStatus();
  if (this.exitCode == ContainerExitStatus.INVALID) {
    // No meaningful exit code to display.
    this.exitStatus = "N/A";
  } else {
    this.exitStatus = String.valueOf(exitCode);
  }
  this.state = container.getContainerState().toString();

  // Missing diagnostics are normalised to the empty string.
  String reported = snapshot.getDiagnostics();
  this.diagnostics = (reported == null || reported.isEmpty()) ? "" : reported;

  this.user = container.getUser();
  Resource needed = container.getResource();
  if (needed != null) {
    this.totalMemoryNeededMB = needed.getMemory();
    this.totalVCoresNeeded = needed.getVirtualCores();
  }

  this.containerLogsShortLink = ujoin("containerlogs", this.id, container.getUser());
  String base = (requestUri == null) ? "" : requestUri;
  String prefix = (pathPrefix == null) ? "" : pathPrefix;
  this.containerLogsLink = join(base, prefix, this.containerLogsShortLink);
}
Example 5
Source File: YarnManager.java From Scribengin with GNU Affero General Public License v3.0 | 5 votes |
public void onContainersCompleted(List<ContainerStatus> statuses) { logger.info("Start onContainersCompleted(List<ContainerStatus> statuses)"); for (ContainerStatus status: statuses) { assert (status.getState() == ContainerState.COMPLETE); int exitStatus = status.getExitStatus(); //TODO: update vm descriptor status if (exitStatus != ContainerExitStatus.SUCCESS) { } else { } } logger.info("Finish onContainersCompleted(List<ContainerStatus> statuses)"); }
Example 6
Source File: AbstractApplicationMaster.java From Scribengin with GNU Affero General Public License v3.0 | 5 votes |
public void onContainersCompleted(List<ContainerStatus> statuses) { LOG.info("onContainersCompleted"); for (ContainerStatus status : statuses) { assert (status.getState() == ContainerState.COMPLETE); int exitStatus = status.getExitStatus(); if (exitStatus != ContainerExitStatus.SUCCESS) { if (exitStatus != ContainerExitStatus.ABORTED) { failedContainerCount.incrementAndGet(); } allocatedContainerCount.decrementAndGet(); requestedContainerCount.decrementAndGet(); recordFailedCommand(status.getContainerId()); } else { completedContainerCount.incrementAndGet(); } } int askAgainCount = totalContainerCount - requestedContainerCount.get(); requestedContainerCount.addAndGet(askAgainCount); if (askAgainCount > 0) { // need to reallocate failed containers for (int i = 0; i < askAgainCount; i++) { ContainerRequest req = setupContainerReqForRM(); resourceManager.addContainerRequest(req); } } if (completedContainerCount.get() == totalContainerCount) { done = true; } }
Example 7
Source File: RMContainerAllocator.java From big-c with Apache License 2.0 | 5 votes |
@VisibleForTesting public TaskAttemptEvent createContainerFinishedEvent(ContainerStatus cont, TaskAttemptId attemptID) { if (cont.getExitStatus() == ContainerExitStatus.ABORTED || cont.getExitStatus() == ContainerExitStatus.PREEMPTED) { // killed by framework return new TaskAttemptEvent(attemptID, TaskAttemptEventType.TA_KILL); } else { return new TaskAttemptEvent(attemptID, TaskAttemptEventType.TA_CONTAINER_COMPLETED); } }
Example 8
Source File: RMAppAttemptImpl.java From big-c with Apache License 2.0 | 5 votes |
/**
 * Appends the AM-container-crashed diagnostics for the given event to this
 * attempt's diagnostics and records the container's exit status.
 *
 * @param finishEvent event carrying the finished container's status
 */
private void setAMContainerCrashedDiagnosticsAndExitStatus(
    RMAppAttemptContainerFinishedEvent finishEvent) {
  final ContainerStatus amStatus = finishEvent.getContainerStatus();
  this.diagnostics.append(getAMContainerCrashedDiagnostics(finishEvent));
  this.amContainerExitStatus = amStatus.getExitStatus();
}
Example 9
Source File: ContainerInfo.java From big-c with Apache License 2.0 | 5 votes |
/**
 * Populates this container view from a container and its node manager context.
 *
 * @param nmContext  context supplying the node id
 * @param container  container whose id, status, state, user and resource are copied
 * @param requestUri base URI for the logs link; null is treated as ""
 * @param pathPrefix path prefix for the logs link; null is treated as ""
 */
public ContainerInfo(final Context nmContext, final Container container, String requestUri, String pathPrefix) {
  this.id = container.getContainerId().toString();
  this.nodeId = nmContext.getNodeId().toString();

  ContainerStatus snapshot = container.cloneAndGetContainerStatus();
  this.exitCode = snapshot.getExitStatus();
  if (this.exitCode == ContainerExitStatus.INVALID) {
    // No meaningful exit code to display.
    this.exitStatus = "N/A";
  } else {
    this.exitStatus = String.valueOf(exitCode);
  }
  this.state = container.getContainerState().toString();

  // Missing diagnostics are normalised to the empty string.
  String reported = snapshot.getDiagnostics();
  this.diagnostics = (reported == null || reported.isEmpty()) ? "" : reported;

  this.user = container.getUser();
  Resource needed = container.getResource();
  if (needed != null) {
    this.totalMemoryNeededMB = needed.getMemory();
    this.totalVCoresNeeded = needed.getVirtualCores();
  }

  this.containerLogsShortLink = ujoin("containerlogs", this.id, container.getUser());
  String base = (requestUri == null) ? "" : requestUri;
  String prefix = (pathPrefix == null) ? "" : pathPrefix;
  this.containerLogsLink = join(base, prefix, this.containerLogsShortLink);
}
Example 10
Source File: RMContainerAllocator.java From hadoop with Apache License 2.0 | 5 votes |
@VisibleForTesting public TaskAttemptEvent createContainerFinishedEvent(ContainerStatus cont, TaskAttemptId attemptID) { if (cont.getExitStatus() == ContainerExitStatus.ABORTED || cont.getExitStatus() == ContainerExitStatus.PREEMPTED) { // killed by framework return new TaskAttemptEvent(attemptID, TaskAttemptEventType.TA_KILL); } else { return new TaskAttemptEvent(attemptID, TaskAttemptEventType.TA_CONTAINER_COMPLETED); } }
Example 11
Source File: RMAppAttemptImpl.java From hadoop with Apache License 2.0 | 5 votes |
/**
 * Records why the AM container finished: appends the crashed-container
 * diagnostics for the event and stores the container's exit status.
 *
 * @param finishEvent event carrying the finished container's status
 */
private void setAMContainerCrashedDiagnosticsAndExitStatus(
    RMAppAttemptContainerFinishedEvent finishEvent) {
  final ContainerStatus amStatus = finishEvent.getContainerStatus();
  this.diagnostics.append(getAMContainerCrashedDiagnostics(finishEvent));
  this.amContainerExitStatus = amStatus.getExitStatus();
}
Example 12
Source File: ApplicationMaster.java From big-c with Apache License 2.0 | 4 votes |
@SuppressWarnings("unchecked") @Override public void onContainersCompleted(List<ContainerStatus> completedContainers) { LOG.info("Got response from RM for container ask, completedCnt=" + completedContainers.size()); for (ContainerStatus containerStatus : completedContainers) { LOG.info(appAttemptID + " got container status for containerID=" + containerStatus.getContainerId() + ", state=" + containerStatus.getState() + ", exitStatus=" + containerStatus.getExitStatus() + ", diagnostics=" + containerStatus.getDiagnostics()); // non complete containers should not be here assert (containerStatus.getState() == ContainerState.COMPLETE); // increment counters for completed/failed containers int exitStatus = containerStatus.getExitStatus(); if (0 != exitStatus) { // container failed if (ContainerExitStatus.ABORTED != exitStatus) { // shell script failed // counts as completed numCompletedContainers.incrementAndGet(); numFailedContainers.incrementAndGet(); } else { // container was killed by framework, possibly preempted // we should re-try as the container was lost for some reason numAllocatedContainers.decrementAndGet(); numRequestedContainers.decrementAndGet(); // we do not need to release the container as it would be done // by the RM } } else { // nothing to do // container completed successfully numCompletedContainers.incrementAndGet(); LOG.info("Container completed successfully." + ", containerId=" + containerStatus.getContainerId()); } if(timelineClient != null) { publishContainerEndEvent( timelineClient, containerStatus, domainId, appSubmitterUgi); } } // ask for more containers if any failed int askCount = numTotalContainers - numRequestedContainers.get(); numRequestedContainers.addAndGet(askCount); if (askCount > 0) { for (int i = 0; i < askCount; ++i) { ContainerRequest containerAsk = setupContainerAskForRM(); amRMClient.addContainerRequest(containerAsk); } } if (numCompletedContainers.get() == numTotalContainers) { done = true; } }
Example 13
Source File: ResourceSchedulerWrapper.java From big-c with Apache License 2.0 | 4 votes |
private void updateQueueWithNodeUpdate( NodeUpdateSchedulerEventWrapper eventWrapper) { RMNodeWrapper node = (RMNodeWrapper) eventWrapper.getRMNode(); List<UpdatedContainerInfo> containerList = node.getContainerUpdates(); for (UpdatedContainerInfo info : containerList) { for (ContainerStatus status : info.getCompletedContainers()) { ContainerId containerId = status.getContainerId(); SchedulerAppReport app = scheduler.getSchedulerAppInfo( containerId.getApplicationAttemptId()); if (app == null) { // this happens for the AM container // The app have already removed when the NM sends the release // information. continue; } String queue = appQueueMap.get(containerId.getApplicationAttemptId() .getApplicationId()); int releasedMemory = 0, releasedVCores = 0; if (status.getExitStatus() == ContainerExitStatus.SUCCESS) { for (RMContainer rmc : app.getLiveContainers()) { if (rmc.getContainerId() == containerId) { releasedMemory += rmc.getContainer().getResource().getMemory(); releasedVCores += rmc.getContainer() .getResource().getVirtualCores(); break; } } } else if (status.getExitStatus() == ContainerExitStatus.ABORTED) { if (preemptionContainerMap.containsKey(containerId)) { Resource preResource = preemptionContainerMap.get(containerId); releasedMemory += preResource.getMemory(); releasedVCores += preResource.getVirtualCores(); preemptionContainerMap.remove(containerId); } } // update queue counters updateQueueMetrics(queue, releasedMemory, releasedVCores); } } }
Example 14
Source File: YarnService.java From incubator-gobblin with Apache License 2.0 | 4 votes |
/** * Handle the completion of a container. A new container will be requested to replace the one * that just exited. Depending on the exit status and if container host affinity is enabled, * the new container may or may not try to be started on the same node. * * A container completes in either of the following conditions: 1) some error happens in the * container and caused the container to exit, 2) the container gets killed due to some reason, * for example, if it runs over the allowed amount of virtual or physical memory, 3) the gets * preempted by the ResourceManager, or 4) the container gets stopped by the ApplicationMaster. * A replacement container is needed in all but the last case. */ protected void handleContainerCompletion(ContainerStatus containerStatus) { Map.Entry<Container, String> completedContainerEntry = this.containerMap.remove(containerStatus.getContainerId()); //Get the Helix instance name for the completed container. Because callbacks are processed asynchronously, we might //encounter situations where handleContainerCompletion() is called before onContainersAllocated(), resulting in the //containerId missing from the containersMap. String completedInstanceName = completedContainerEntry == null? 
UNKNOWN_HELIX_INSTANCE : completedContainerEntry.getValue(); LOGGER.info(String.format("Container %s running Helix instance %s has completed with exit status %d", containerStatus.getContainerId(), completedInstanceName, containerStatus.getExitStatus())); if (!Strings.isNullOrEmpty(containerStatus.getDiagnostics())) { LOGGER.info(String.format("Received the following diagnostics information for container %s: %s", containerStatus.getContainerId(), containerStatus.getDiagnostics())); } if (containerStatus.getExitStatus() == ContainerExitStatus.ABORTED) { if (this.releasedContainerCache.getIfPresent(containerStatus.getContainerId()) != null) { LOGGER.info("Container release requested, so not spawning a replacement for containerId {}", containerStatus.getContainerId()); if (completedContainerEntry != null) { LOGGER.info("Adding instance {} to the pool of unused instances", completedInstanceName); this.unusedHelixInstanceNames.add(completedInstanceName); } return; } else { LOGGER.info("Container {} aborted due to lost NM", containerStatus.getContainerId()); // Container release was not requested. Likely, the container was running on a node on which the NM died. // In this case, RM assumes that the containers are "lost", even though the container process may still be // running on the node. We need to ensure that the Helix instances running on the orphaned containers // are fenced off from the Helix cluster to avoid double publishing and state being committed by the // instances. if (!UNKNOWN_HELIX_INSTANCE.equals(completedInstanceName)) { String clusterName = this.helixManager.getClusterName(); //Disable the orphaned instance. 
if (HelixUtils.isInstanceLive(helixManager, completedInstanceName)) { LOGGER.info("Disabling the Helix instance {}", completedInstanceName); this.helixManager.getClusterManagmentTool().enableInstance(clusterName, completedInstanceName, false); } } } } if (this.shutdownInProgress) { return; } if(completedContainerEntry != null) { this.helixInstanceRetryCount.putIfAbsent(completedInstanceName, new AtomicInteger(0)); int retryCount = this.helixInstanceRetryCount.get(completedInstanceName).incrementAndGet(); // Populate event metadata Optional<ImmutableMap.Builder<String, String>> eventMetadataBuilder = Optional.absent(); if (this.eventSubmitter.isPresent()) { eventMetadataBuilder = Optional.of(buildContainerStatusEventMetadata(containerStatus)); eventMetadataBuilder.get().put(GobblinYarnEventConstants.EventMetadata.HELIX_INSTANCE_ID, completedInstanceName); eventMetadataBuilder.get().put(GobblinYarnEventConstants.EventMetadata.CONTAINER_STATUS_RETRY_ATTEMPT, retryCount + ""); } if (this.helixInstanceMaxRetries > 0 && retryCount > this.helixInstanceMaxRetries) { if (this.eventSubmitter.isPresent()) { this.eventSubmitter.get() .submit(GobblinYarnEventConstants.EventNames.HELIX_INSTANCE_COMPLETION, eventMetadataBuilder.get().build()); } LOGGER.warn("Maximum number of retries has been achieved for Helix instance " + completedInstanceName); return; } // Add the Helix instance name of the completed container to the set of unused // instance names so they can be reused by a replacement container. 
LOGGER.info("Adding instance {} to the pool of unused instances", completedInstanceName); this.unusedHelixInstanceNames.add(completedInstanceName); if (this.eventSubmitter.isPresent()) { this.eventSubmitter.get() .submit(GobblinYarnEventConstants.EventNames.HELIX_INSTANCE_COMPLETION, eventMetadataBuilder.get().build()); } } LOGGER.info(String.format("Requesting a new container to replace %s to run Helix instance %s", containerStatus.getContainerId(), completedInstanceName)); this.eventBus.post(new NewContainerRequest( shouldStickToTheSameNode(containerStatus.getExitStatus()) && completedContainerEntry != null ? Optional.of(completedContainerEntry.getKey()) : Optional.<Container>absent())); }
Example 15
Source File: ResourceSchedulerWrapper.java From hadoop with Apache License 2.0 | 4 votes |
private void updateQueueWithNodeUpdate( NodeUpdateSchedulerEventWrapper eventWrapper) { RMNodeWrapper node = (RMNodeWrapper) eventWrapper.getRMNode(); List<UpdatedContainerInfo> containerList = node.getContainerUpdates(); for (UpdatedContainerInfo info : containerList) { for (ContainerStatus status : info.getCompletedContainers()) { ContainerId containerId = status.getContainerId(); SchedulerAppReport app = scheduler.getSchedulerAppInfo( containerId.getApplicationAttemptId()); if (app == null) { // this happens for the AM container // The app have already removed when the NM sends the release // information. continue; } String queue = appQueueMap.get(containerId.getApplicationAttemptId() .getApplicationId()); int releasedMemory = 0, releasedVCores = 0; if (status.getExitStatus() == ContainerExitStatus.SUCCESS) { for (RMContainer rmc : app.getLiveContainers()) { if (rmc.getContainerId() == containerId) { releasedMemory += rmc.getContainer().getResource().getMemory(); releasedVCores += rmc.getContainer() .getResource().getVirtualCores(); break; } } } else if (status.getExitStatus() == ContainerExitStatus.ABORTED) { if (preemptionContainerMap.containsKey(containerId)) { Resource preResource = preemptionContainerMap.get(containerId); releasedMemory += preResource.getMemory(); releasedVCores += preResource.getVirtualCores(); preemptionContainerMap.remove(containerId); } } // update queue counters updateQueueMetrics(queue, releasedMemory, releasedVCores); } } }
Example 16
Source File: ApplicationMaster.java From hadoop with Apache License 2.0 | 4 votes |
@SuppressWarnings("unchecked") @Override public void onContainersCompleted(List<ContainerStatus> completedContainers) { LOG.info("Got response from RM for container ask, completedCnt=" + completedContainers.size()); for (ContainerStatus containerStatus : completedContainers) { LOG.info(appAttemptID + " got container status for containerID=" + containerStatus.getContainerId() + ", state=" + containerStatus.getState() + ", exitStatus=" + containerStatus.getExitStatus() + ", diagnostics=" + containerStatus.getDiagnostics()); // non complete containers should not be here assert (containerStatus.getState() == ContainerState.COMPLETE); // increment counters for completed/failed containers int exitStatus = containerStatus.getExitStatus(); if (0 != exitStatus) { // container failed if (ContainerExitStatus.ABORTED != exitStatus) { // shell script failed // counts as completed numCompletedContainers.incrementAndGet(); numFailedContainers.incrementAndGet(); } else { // container was killed by framework, possibly preempted // we should re-try as the container was lost for some reason numAllocatedContainers.decrementAndGet(); numRequestedContainers.decrementAndGet(); // we do not need to release the container as it would be done // by the RM } } else { // nothing to do // container completed successfully numCompletedContainers.incrementAndGet(); LOG.info("Container completed successfully." + ", containerId=" + containerStatus.getContainerId()); } if(timelineClient != null) { publishContainerEndEvent( timelineClient, containerStatus, domainId, appSubmitterUgi); } } // ask for more containers if any failed int askCount = numTotalContainers - numRequestedContainers.get(); numRequestedContainers.addAndGet(askCount); if (askCount > 0) { for (int i = 0; i < askCount; ++i) { ContainerRequest containerAsk = setupContainerAskForRM(); amRMClient.addContainerRequest(containerAsk); } } if (numCompletedContainers.get() == numTotalContainers) { done = true; } }
Example 17
Source File: ApplicationMaster.java From TensorFlowOnYARN with Apache License 2.0 | 4 votes |
@SuppressWarnings("unchecked") @Override public void onContainersCompleted(List<ContainerStatus> completedContainers) { LOG.info("Got response from RM for container ask, completedCnt=" + completedContainers.size()); for (ContainerStatus containerStatus : completedContainers) { LOG.info(appAttemptId + " got container status for containerID=" + containerStatus.getContainerId() + ", state=" + containerStatus.getState() + ", exitStatus=" + containerStatus.getExitStatus() + ", diagnostics=" + containerStatus.getDiagnostics()); // non complete containers should not be here assert (containerStatus.getState() == ContainerState.COMPLETE); // ignore containers we know nothing about - probably from a previous // attempt if (!launchedContainers.contains(containerStatus.getContainerId())) { LOG.info("Ignoring completed status of " + containerStatus.getContainerId() + "; unknown container(probably launched by previous attempt)"); continue; } // increment counters for completed/failed containers int exitStatus = containerStatus.getExitStatus(); if (0 != exitStatus) { // container failed if (ContainerExitStatus.ABORTED != exitStatus) { // shell script failed // counts as completed completedContainerNum.incrementAndGet(); failedContainerNum.incrementAndGet(); } else { // container was killed by framework, possibly preempted // we should re-try as the container was lost for some reason allocatedContainerNum.decrementAndGet(); requestedContainerNum.decrementAndGet(); // we do not need to release the container as it would be done // by the RM } } else { // nothing to do // container completed successfully completedContainerNum.incrementAndGet(); LOG.info("Container completed successfully." 
+ ", containerId=" + containerStatus.getContainerId()); } } // ask for more containers if any failed int askCount = args.totalContainerNum - requestedContainerNum.get(); requestedContainerNum.addAndGet(askCount); if (askCount > 0) { for (int i = 0; i < askCount; ++i) { ContainerRequest containerAsk = setupContainerAskForRM(); amRMClient.addContainerRequest(containerAsk); } } if (completedContainerNum.get() == args.totalContainerNum) { done = true; } }