Search in sources :

Example 61 with ContainerStatus

use of org.apache.hadoop.yarn.api.records.ContainerStatus in project hadoop by apache.

the class RMContainerAllocator method getResources.

/**
 * Performs one allocate call against the ResourceManager and processes the response.
 *
 * <p>After applying the concurrent task limits, this issues the remote request and then,
 * in order: stores the returned NM tokens in {@code NMTokenCache}, updates the AMRM token
 * if one was returned, forwards any preemption message to the preemption policy, flags
 * {@code recalculateReduceSchedule} when containers were allocated or finished or when the
 * headroom changed, recomputes blacklisting, handles updated nodes and job priority
 * changes, hands the timeline collector address to the timeline v2 client, and finally
 * processes every finished container status.
 *
 * @return the containers newly allocated in this response, or {@code null} when the AM had
 *         to re-register with the RM because it was reported as not registered
 * @throws Exception an {@code RMContainerAllocationException} when the RM does not
 *         recognize the application attempt or could not be contacted for
 *         {@code retryInterval} milliseconds; otherwise the exception raised by the remote
 *         request is rethrown unchanged so the caller can retry
 */
@SuppressWarnings("unchecked")
private List<Container> getResources() throws Exception {
    applyConcurrentTaskLimits();
    // will be null the first time
    Resource headRoom = Resources.clone(getAvailableResources());
    AllocateResponse response;
    /*
     * If contact with RM is lost, the AM will wait MR_AM_TO_RM_WAIT_INTERVAL_MS
     * milliseconds before aborting. During this interval, AM will still try
     * to contact the RM.
     */
    try {
        response = makeRemoteRequest();
        // Reset retry count if no exception occurred.
        retrystartTime = System.currentTimeMillis();
    } catch (ApplicationAttemptNotFoundException e) {
        // This can happen if the RM has been restarted. If it is in that state,
        // this application must clean itself up.
        eventHandler.handle(new JobEvent(this.getJob().getID(), JobEventType.JOB_AM_REBOOT));
        throw new RMContainerAllocationException("Resource Manager doesn't recognize AttemptId: " + this.getContext().getApplicationAttemptId(), e);
    } catch (ApplicationMasterNotRegisteredException e) {
        LOG.info("ApplicationMaster is out of sync with ResourceManager," + " hence resync and send outstanding requests.");
        // RM may have restarted, re-register with RM.
        lastResponseID = 0;
        register();
        addOutstandingRequestOnResync();
        return null;
    } catch (InvalidLabelResourceRequestException e) {
        // If Invalid label exception is received means the requested label doesnt
        // have access so killing job in this case.
        String diagMsg = "Requested node-label-expression is invalid: " + StringUtils.stringifyException(e);
        LOG.info(diagMsg);
        JobId jobId = this.getJob().getID();
        eventHandler.handle(new JobDiagnosticsUpdateEvent(jobId, diagMsg));
        eventHandler.handle(new JobEvent(jobId, JobEventType.JOB_KILL));
        throw e;
    } catch (Exception e) {
        // re-trying until the retryInterval has expired.
        if (System.currentTimeMillis() - retrystartTime >= retryInterval) {
            String timeoutMsg = "Could not contact RM after " + retryInterval + " milliseconds.";
            LOG.error(timeoutMsg);
            eventHandler.handle(new JobEvent(this.getJob().getID(), JobEventType.JOB_AM_REBOOT));
            // Chain the triggering exception so the root cause of the outage is not lost.
            throw new RMContainerAllocationException(timeoutMsg, e);
        }
        // continue to attempt to contact the RM.
        throw e;
    }
    Resource newHeadRoom = getAvailableResources();
    List<Container> newContainers = response.getAllocatedContainers();
    // Setting NMTokens
    if (response.getNMTokens() != null) {
        for (NMToken nmToken : response.getNMTokens()) {
            NMTokenCache.setNMToken(nmToken.getNodeId().toString(), nmToken.getToken());
        }
    }
    // Setting AMRMToken
    if (response.getAMRMToken() != null) {
        updateAMRMToken(response.getAMRMToken());
    }
    List<ContainerStatus> finishedContainers = response.getCompletedContainersStatuses();
    // propagate preemption requests
    final PreemptionMessage preemptReq = response.getPreemptionMessage();
    if (preemptReq != null) {
        preemptionPolicy.preempt(new PreemptionContext(assignedRequests), preemptReq);
    }
    if (newContainers.size() + finishedContainers.size() > 0 || !headRoom.equals(newHeadRoom)) {
        //something changed
        recalculateReduceSchedule = true;
        if (LOG.isDebugEnabled() && !headRoom.equals(newHeadRoom)) {
            LOG.debug("headroom=" + newHeadRoom);
        }
    }
    if (LOG.isDebugEnabled()) {
        for (Container cont : newContainers) {
            LOG.debug("Received new Container :" + cont);
        }
    }
    //Called on each allocation. Will know about newly blacklisted/added hosts.
    computeIgnoreBlacklisting();
    handleUpdatedNodes(response);
    handleJobPriorityChange(response);
    // handle receiving the timeline collector address for this app
    String collectorAddr = response.getCollectorAddr();
    MRAppMaster.RunningAppContext appContext = (MRAppMaster.RunningAppContext) this.getContext();
    if (collectorAddr != null && !collectorAddr.isEmpty() && appContext.getTimelineV2Client() != null) {
        // Reuse the value that was just validated instead of reading it from the response again.
        appContext.getTimelineV2Client().setTimelineServiceAddress(collectorAddr);
    }
    // Finished containers are processed last, after all scheduling state above is refreshed.
    for (ContainerStatus cont : finishedContainers) {
        processFinishedContainer(cont);
    }
    return newContainers;
}
Also used : PreemptionMessage(org.apache.hadoop.yarn.api.records.PreemptionMessage) MRAppMaster(org.apache.hadoop.mapreduce.v2.app.MRAppMaster) NMToken(org.apache.hadoop.yarn.api.records.NMToken) Resource(org.apache.hadoop.yarn.api.records.Resource) JobDiagnosticsUpdateEvent(org.apache.hadoop.mapreduce.v2.app.job.event.JobDiagnosticsUpdateEvent) ApplicationMasterNotRegisteredException(org.apache.hadoop.yarn.exceptions.ApplicationMasterNotRegisteredException) InvalidLabelResourceRequestException(org.apache.hadoop.yarn.exceptions.InvalidLabelResourceRequestException) IOException(java.io.IOException) ApplicationAttemptNotFoundException(org.apache.hadoop.yarn.exceptions.ApplicationAttemptNotFoundException) YarnRuntimeException(org.apache.hadoop.yarn.exceptions.YarnRuntimeException) ApplicationAttemptNotFoundException(org.apache.hadoop.yarn.exceptions.ApplicationAttemptNotFoundException) AllocateResponse(org.apache.hadoop.yarn.api.protocolrecords.AllocateResponse) ApplicationMasterNotRegisteredException(org.apache.hadoop.yarn.exceptions.ApplicationMasterNotRegisteredException) Container(org.apache.hadoop.yarn.api.records.Container) ContainerStatus(org.apache.hadoop.yarn.api.records.ContainerStatus) JobEvent(org.apache.hadoop.mapreduce.v2.app.job.event.JobEvent) InvalidLabelResourceRequestException(org.apache.hadoop.yarn.exceptions.InvalidLabelResourceRequestException) JobId(org.apache.hadoop.mapreduce.v2.api.records.JobId)

Example 62 with ContainerStatus

use of org.apache.hadoop.yarn.api.records.ContainerStatus in project flink by apache.

the class YarnFlinkResourceManager method containersComplete.

/**
 * Invoked when the ResourceManager informs of completed containers.
 * Called via an actor message by the callback from the ResourceManager client.
 *
 * <p>A container that is found in {@code containersBeingReturned} is treated as a regular
 * completion and only logged. Any other container is counted as a failure: it is removed
 * from {@code containersInLaunch} if present, otherwise reported through
 * {@code notifyWorkerFailed}. Once the number of failures exceeds
 * {@code maxFailedContainers} (unless that limit is negative), a {@code StopCluster}
 * message is sent to this actor and the method returns immediately.
 *
 * @param containers The containers that have completed.
 */
private void containersComplete(List<ContainerStatus> containers) {
    for (ContainerStatus status : containers) {
        final ResourceID id = new ResourceID(status.getContainerId().toString());
        // check if this is a failed container or a completed container
        if (containersBeingReturned.remove(status.getContainerId()) != null) {
            // regular completed container that we released
            LOG.info("Container {} completed successfully with diagnostics: {}", id, status.getDiagnostics());
        } else {
            // failed container, either at startup, or running
            final String exitStatus;
            switch(status.getExitStatus()) {
                case -103:
                    exitStatus = "Vmem limit exceeded (-103)";
                    break;
                case -104:
                    exitStatus = "Pmem limit exceeded (-104)";
                    break;
                default:
                    exitStatus = String.valueOf(status.getExitStatus());
            }
            final YarnContainerInLaunch launched = containersInLaunch.remove(id);
            if (launched != null) {
                // we will trigger re-acquiring new containers at the end
                LOG.info("Container {} failed, with a TaskManager in launch or registration. " + "Exit status: {}", id, exitStatus);
            } else {
                // failed registered worker
                LOG.info("Container {} failed. Exit status: {}", id, exitStatus);
                // notify the generic logic, which notifies the JobManager, etc.
                // The message is built by plain concatenation, so it must not contain
                // a logger-style "{}" placeholder.
                notifyWorkerFailed(id, "Container " + id + " failed. Exit status: " + exitStatus);
            }
            // general failure logging
            failedContainersSoFar++;
            String diagMessage = String.format("Diagnostics for container %s in state %s : " + "exitStatus=%s diagnostics=%s", id, status.getState(), exitStatus, status.getDiagnostics());
            sendInfoMessage(diagMessage);
            LOG.info(diagMessage);
            LOG.info("Total number of failed containers so far: " + failedContainersSoFar);
            // maxFailedContainers == -1 is infinite number of retries.
            if (maxFailedContainers >= 0 && failedContainersSoFar > maxFailedContainers) {
                String msg = "Stopping YARN session because the number of failed containers (" + failedContainersSoFar + ") exceeded the maximum failed containers (" + maxFailedContainers + "). This number is controlled by the '" + ConfigConstants.YARN_MAX_FAILED_CONTAINERS + "' configuration setting. " + "By default its the number of requested containers.";
                LOG.error(msg);
                self().tell(decorateMessage(new StopCluster(ApplicationStatus.FAILED, msg)), ActorRef.noSender());
                // no need to do anything else
                return;
            }
        }
    }
    updateProgress();
    // in case failed containers were among the finished containers, make
    // sure we re-examine and request new ones
    triggerCheckWorkers();
}
Also used : ContainerStatus(org.apache.hadoop.yarn.api.records.ContainerStatus) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) StopCluster(org.apache.flink.runtime.clusterframework.messages.StopCluster)

Example 63 with ContainerStatus

use of org.apache.hadoop.yarn.api.records.ContainerStatus in project drill by axbaretto.

the class ClusterControllerImpl method containersCompleted.

/**
 * Dispatches each completed-container status to the state of its associated task.
 *
 * <p>Statuses whose container has no associated task are logged as a warning and
 * skipped. After all statuses are handled, {@code checkStatus()} is invoked once.
 *
 * @param statuses the completion statuses to process
 */
@Override
public synchronized void containersCompleted(List<ContainerStatus> statuses) {
    EventContext context = new EventContext(this);
    for (ContainerStatus status : statuses) {
        Task task = getTask(status.getContainerId());
        if (task == null) {
            // Will occur if a container was allocated but rejected.
            // Any other occurrence is unexpected and an error.
            LOG.warn("Container completed but no associated task state: " + status.getContainerId());
            continue;
        }
        // The same context object is reused; only its current task changes per status.
        context.setTask(task);
        context.getState().containerCompleted(context, status);
    }
    checkStatus();
}
Also used : ContainerStatus(org.apache.hadoop.yarn.api.records.ContainerStatus)

Example 64 with ContainerStatus

use of org.apache.hadoop.yarn.api.records.ContainerStatus in project incubator-myriad by apache.

the class TestObjectFactory method getRMStatusEvent.

/**
 * Creates an {@link RMNodeStatusEvent} for the given node.
 *
 * <p>The event carries the node's id, a health status created with the flag {@code true},
 * the report {@code "HEALTHY"} and the current time, one container status obtained from
 * {@code getContainerStatus(node)}, one keep-alive application id derived from the node's
 * HTTP port, and a fresh heartbeat response.
 *
 * @param node the node the event is built for
 * @return the assembled status event
 */
public static RMNodeStatusEvent getRMStatusEvent(RMNode node) {
    final NodeId nodeId = node.getNodeID();
    final long reportTime = System.currentTimeMillis();
    final NodeHealthStatus health = NodeHealthStatusPBImpl.newInstance(true, "HEALTHY", reportTime);
    final List<ContainerStatus> containerStatuses = Lists.newArrayList(getContainerStatus(node));
    final List<ApplicationId> keepAliveApps = Lists.newArrayList(getApplicationId(node.getHttpPort()));
    final NodeHeartbeatResponse heartbeatResponse = new NodeHeartbeatResponsePBImpl();
    return new RMNodeStatusEvent(nodeId, health, containerStatuses, keepAliveApps, heartbeatResponse);
}
Also used : ContainerStatus(org.apache.hadoop.yarn.api.records.ContainerStatus) NodeHeartbeatResponse(org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse) RMNodeStatusEvent(org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeStatusEvent) NodeId(org.apache.hadoop.yarn.api.records.NodeId) NodeHeartbeatResponsePBImpl(org.apache.hadoop.yarn.server.api.protocolrecords.impl.pb.NodeHeartbeatResponsePBImpl) ApplicationId(org.apache.hadoop.yarn.api.records.ApplicationId) NodeHealthStatus(org.apache.hadoop.yarn.server.api.records.NodeHealthStatus)

Example 65 with ContainerStatus

use of org.apache.hadoop.yarn.api.records.ContainerStatus in project incubator-myriad by apache.

the class NMHeartBeatHandlerTest method getRMStatusEvent.

/**
 * Assembles the {@link RMNodeStatusEvent} used by this test for the given node.
 *
 * <p>The event is populated with the node's id, a health status created with the flag
 * {@code true}, the report {@code "HEALTHY"} and the current time, a single container
 * status from {@code getContainerStatus(node)}, a single keep-alive application id
 * derived from the node's HTTP port, and a new heartbeat response.
 *
 * @param node the node to report on
 * @return the status event for {@code node}
 */
private RMNodeStatusEvent getRMStatusEvent(RMNode node) {
    final NodeId sourceNodeId = node.getNodeID();
    final long timestamp = System.currentTimeMillis();
    final NodeHealthStatus healthy = NodeHealthStatusPBImpl.newInstance(true, "HEALTHY", timestamp);
    final List<ContainerStatus> statuses = Lists.newArrayList(getContainerStatus(node));
    final List<ApplicationId> keepAlive = Lists.newArrayList(getApplicationId(node.getHttpPort()));
    final NodeHeartbeatResponse lastResponse = new NodeHeartbeatResponsePBImpl();
    return new RMNodeStatusEvent(sourceNodeId, healthy, statuses, keepAlive, lastResponse);
}
Also used : ContainerStatus(org.apache.hadoop.yarn.api.records.ContainerStatus) NodeHeartbeatResponse(org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse) RMNodeStatusEvent(org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeStatusEvent) NodeId(org.apache.hadoop.yarn.api.records.NodeId) NodeHeartbeatResponsePBImpl(org.apache.hadoop.yarn.server.api.protocolrecords.impl.pb.NodeHeartbeatResponsePBImpl) ApplicationId(org.apache.hadoop.yarn.api.records.ApplicationId) NodeHealthStatus(org.apache.hadoop.yarn.server.api.records.NodeHealthStatus)

Aggregations

ContainerStatus (org.apache.hadoop.yarn.api.records.ContainerStatus)144 ContainerId (org.apache.hadoop.yarn.api.records.ContainerId)76 Test (org.junit.Test)75 ArrayList (java.util.ArrayList)58 Container (org.apache.hadoop.yarn.api.records.Container)40 NMContainerStatus (org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus)28 NodeId (org.apache.hadoop.yarn.api.records.NodeId)26 HashMap (java.util.HashMap)25 ApplicationId (org.apache.hadoop.yarn.api.records.ApplicationId)25 ApplicationAttemptId (org.apache.hadoop.yarn.api.records.ApplicationAttemptId)23 Configuration (org.apache.hadoop.conf.Configuration)21 ContainerLaunchContext (org.apache.hadoop.yarn.api.records.ContainerLaunchContext)21 Resource (org.apache.hadoop.yarn.api.records.Resource)21 GetContainerStatusesRequest (org.apache.hadoop.yarn.api.protocolrecords.GetContainerStatusesRequest)20 RMApp (org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp)20 StartContainerRequest (org.apache.hadoop.yarn.api.protocolrecords.StartContainerRequest)19 StartContainersRequest (org.apache.hadoop.yarn.api.protocolrecords.StartContainersRequest)18 YarnConfiguration (org.apache.hadoop.yarn.conf.YarnConfiguration)18 AllocateResponse (org.apache.hadoop.yarn.api.protocolrecords.AllocateResponse)17 RMContainer (org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer)14