use of org.apache.hadoop.yarn.api.records.ContainerStatus in project hadoop by apache.
the class RMContainerAllocator method getResources.
/**
 * Performs one allocate round-trip with the ResourceManager (RM) and processes the response.
 *
 * <p>After applying the concurrent task limits and sending the remote request, the method
 * stores NM tokens, updates the AMRM token, forwards any preemption message to the
 * preemption policy, flags the reduce schedule for recalculation when something changed,
 * handles updated nodes and job priority changes, passes the timeline collector address to
 * the timeline v2 client (when both are present) and finally processes every finished
 * container reported in the response.
 *
 * @return the containers newly allocated in this response, or {@code null} when an
 *     {@code ApplicationMasterNotRegisteredException} caused a re-register and resync instead
 * @throws Exception an {@code RMContainerAllocationException} when the RM does not recognize
 *     the attempt or could not be contacted within {@code retryInterval}; otherwise the
 *     exception raised by the remote request is rethrown unchanged
 */
@SuppressWarnings("unchecked")
private List<Container> getResources() throws Exception {
applyConcurrentTaskLimits();
// will be null the first time
// NOTE(review): the value is passed straight to Resources.clone(); confirm that
// getAvailableResources() can really be null here, or that clone() tolerates null.
Resource headRoom = Resources.clone(getAvailableResources());
AllocateResponse response;
/*
* If contact with RM is lost, the AM will wait MR_AM_TO_RM_WAIT_INTERVAL_MS
* milliseconds before aborting. During this interval, AM will still try
* to contact the RM.
*/
try {
response = makeRemoteRequest();
// Successful call: restart the retry window from now (it is compared against
// retryInterval in the generic catch block below).
retrystartTime = System.currentTimeMillis();
} catch (ApplicationAttemptNotFoundException e) {
// This can happen if the RM has been restarted. If it is in that state,
// this application must clean itself up.
eventHandler.handle(new JobEvent(this.getJob().getID(), JobEventType.JOB_AM_REBOOT));
throw new RMContainerAllocationException("Resource Manager doesn't recognize AttemptId: " + this.getContext().getApplicationAttemptId(), e);
} catch (ApplicationMasterNotRegisteredException e) {
LOG.info("ApplicationMaster is out of sync with ResourceManager," + " hence resync and send outstanding requests.");
// RM may have restarted, re-register with RM.
lastResponseID = 0;
register();
addOutstandingRequestOnResync();
// No response was obtained in this round, so there are no containers to report.
return null;
} catch (InvalidLabelResourceRequestException e) {
// If an invalid label exception is received, the requested label doesn't
// have access, so the job is killed in this case.
String diagMsg = "Requested node-label-expression is invalid: " + StringUtils.stringifyException(e);
LOG.info(diagMsg);
JobId jobId = this.getJob().getID();
// Record the diagnostics on the job first, then request the kill.
eventHandler.handle(new JobDiagnosticsUpdateEvent(jobId, diagMsg));
eventHandler.handle(new JobEvent(jobId, JobEventType.JOB_KILL));
throw e;
} catch (Exception e) {
// re-trying until the retryInterval has expired.
if (System.currentTimeMillis() - retrystartTime >= retryInterval) {
LOG.error("Could not contact RM after " + retryInterval + " milliseconds.");
eventHandler.handle(new JobEvent(this.getJob().getID(), JobEventType.JOB_AM_REBOOT));
throw new RMContainerAllocationException("Could not contact RM after " + retryInterval + " milliseconds.");
}
// continue to attempt to contact the RM.
throw e;
}
// Headroom as seen after the request; compared with the pre-request value below.
Resource newHeadRoom = getAvailableResources();
List<Container> newContainers = response.getAllocatedContainers();
// Setting NMTokens
if (response.getNMTokens() != null) {
for (NMToken nmToken : response.getNMTokens()) {
NMTokenCache.setNMToken(nmToken.getNodeId().toString(), nmToken.getToken());
}
}
// Setting AMRMToken
if (response.getAMRMToken() != null) {
updateAMRMToken(response.getAMRMToken());
}
List<ContainerStatus> finishedContainers = response.getCompletedContainersStatuses();
// propagate preemption requests
final PreemptionMessage preemptReq = response.getPreemptionMessage();
if (preemptReq != null) {
preemptionPolicy.preempt(new PreemptionContext(assignedRequests), preemptReq);
}
// Any newly allocated or finished container, or a change in headroom, means the
// reduce schedule has to be recalculated.
if (newContainers.size() + finishedContainers.size() > 0 || !headRoom.equals(newHeadRoom)) {
//something changed
recalculateReduceSchedule = true;
if (LOG.isDebugEnabled() && !headRoom.equals(newHeadRoom)) {
LOG.debug("headroom=" + newHeadRoom);
}
}
if (LOG.isDebugEnabled()) {
for (Container cont : newContainers) {
LOG.debug("Received new Container :" + cont);
}
}
//Called on each allocation. Will know about newly blacklisted/added hosts.
computeIgnoreBlacklisting();
handleUpdatedNodes(response);
handleJobPriorityChange(response);
// handle receiving the timeline collector address for this app
String collectorAddr = response.getCollectorAddr();
MRAppMaster.RunningAppContext appContext = (MRAppMaster.RunningAppContext) this.getContext();
// Only forward a non-empty address, and only when a timeline v2 client exists.
if (collectorAddr != null && !collectorAddr.isEmpty() && appContext.getTimelineV2Client() != null) {
appContext.getTimelineV2Client().setTimelineServiceAddress(response.getCollectorAddr());
}
// Finished containers are processed last, after all other response handling above.
for (ContainerStatus cont : finishedContainers) {
processFinishedContainer(cont);
}
return newContainers;
}
use of org.apache.hadoop.yarn.api.records.ContainerStatus in project flink by apache.
the class YarnFlinkResourceManager method containersComplete.
/**
 * Invoked when the ResourceManager informs of completed containers.
 * Called via an actor message by the callback from the ResourceManager client.
 *
 * <p>A container that is found in {@code containersBeingReturned} is treated as a regular
 * completion. Every other container counts as a failure: it is logged, reported through
 * {@code sendInfoMessage}, and added to {@code failedContainersSoFar}. Once that counter
 * exceeds a non-negative {@code maxFailedContainers}, a {@code StopCluster} message is sent
 * to this actor and the method returns immediately, without updating progress or
 * re-checking workers.
 *
 * @param containers The containers that have completed.
 */
private void containersComplete(List<ContainerStatus> containers) {
    for (ContainerStatus status : containers) {
        final ResourceID id = new ResourceID(status.getContainerId().toString());

        // check if this is a failed container or a completed container
        if (containersBeingReturned.remove(status.getContainerId()) != null) {
            // regular completed container that we released
            LOG.info("Container {} completed successfully with diagnostics: {}", id, status.getDiagnostics());
        } else {
            // failed container, either at startup, or running
            final String exitStatus;
            switch (status.getExitStatus()) {
                case -103:
                    exitStatus = "Vmem limit exceeded (-103)";
                    break;
                case -104:
                    exitStatus = "Pmem limit exceeded (-104)";
                    break;
                default:
                    exitStatus = String.valueOf(status.getExitStatus());
            }

            final YarnContainerInLaunch launched = containersInLaunch.remove(id);
            if (launched != null) {
                LOG.info("Container {} failed, with a TaskManager in launch or registration. " + "Exit status: {}", id, exitStatus);
                // we will trigger re-acquiring new containers at the end
            } else {
                // failed registered worker
                LOG.info("Container {} failed. Exit status: {}", id, exitStatus);

                // notify the generic logic, which notifies the JobManager, etc.
                // The reason is a plain concatenated string, not a logger format string,
                // so it must not contain a "{}" placeholder.
                notifyWorkerFailed(id, "Container " + id + " failed. Exit status: " + exitStatus);
            }

            // general failure logging
            failedContainersSoFar++;

            String diagMessage = String.format("Diagnostics for container %s in state %s : " + "exitStatus=%s diagnostics=%s", id, status.getState(), exitStatus, status.getDiagnostics());
            sendInfoMessage(diagMessage);

            LOG.info(diagMessage);
            LOG.info("Total number of failed containers so far: {}", failedContainersSoFar);

            // maxFailedContainers == -1 is infinite number of retries.
            if (maxFailedContainers >= 0 && failedContainersSoFar > maxFailedContainers) {
                String msg = "Stopping YARN session because the number of failed containers (" + failedContainersSoFar + ") exceeded the maximum failed containers (" + maxFailedContainers + "). This number is controlled by the '" + ConfigConstants.YARN_MAX_FAILED_CONTAINERS + "' configuration setting. " + "By default its the number of requested containers.";

                LOG.error(msg);
                self().tell(decorateMessage(new StopCluster(ApplicationStatus.FAILED, msg)), ActorRef.noSender());

                // no need to do anything else
                return;
            }
        }
    }

    updateProgress();

    // in case failed containers were among the finished containers, make
    // sure we re-examine and request new ones
    triggerCheckWorkers();
}
use of org.apache.hadoop.yarn.api.records.ContainerStatus in project drill by axbaretto.
the class ClusterControllerImpl method containersCompleted.
/**
 * Dispatches each completed-container status to the state of its associated task, then
 * re-checks the overall status.
 *
 * <p>Statuses whose container has no associated task are logged with a warning and skipped.
 *
 * @param statuses the completed container statuses to process
 */
@Override
public synchronized void containersCompleted(List<ContainerStatus> statuses) {
  EventContext context = new EventContext(this);
  for (ContainerStatus status : statuses) {
    Task task = getTask(status.getContainerId());
    if (task == null) {
      // Will occur if a container was allocated but rejected.
      // Any other occurrence is unexpected and an error.
      LOG.warn("Container completed but no associated task state: " + status.getContainerId());
      continue;
    }
    // The shared context is re-pointed at the current task before the state handles it.
    context.setTask(task);
    context.getState().containerCompleted(context, status);
  }
  checkStatus();
}
use of org.apache.hadoop.yarn.api.records.ContainerStatus in project incubator-myriad by apache.
the class TestObjectFactory method getRMStatusEvent.
/**
 * Builds an {@code RMNodeStatusEvent} for the given node, using a health status created
 * with the label "HEALTHY" and the current time, one container status and one keep-alive
 * application id derived from the node, and a fresh heartbeat response.
 *
 * @param node the node the event is created for
 * @return the assembled status event
 */
public static RMNodeStatusEvent getRMStatusEvent(RMNode node) {
  final NodeId nodeId = node.getNodeID();
  final NodeHealthStatus health =
      NodeHealthStatusPBImpl.newInstance(true, "HEALTHY", System.currentTimeMillis());
  final List<ContainerStatus> containerStatuses = Lists.newArrayList(getContainerStatus(node));
  // The node's HTTP port is what getApplicationId receives as its argument.
  final List<ApplicationId> keepAliveApps =
      Lists.newArrayList(getApplicationId(node.getHttpPort()));
  return new RMNodeStatusEvent(
      nodeId, health, containerStatuses, keepAliveApps, new NodeHeartbeatResponsePBImpl());
}
use of org.apache.hadoop.yarn.api.records.ContainerStatus in project incubator-myriad by apache.
the class NMHeartBeatHandlerTest method getRMStatusEvent.
/**
 * Creates the {@code RMNodeStatusEvent} used by this test for the given node: a health
 * status labelled "HEALTHY" stamped with the current time, a single container status, a
 * single keep-alive application id and a new heartbeat response.
 *
 * @param node the node the event is created for
 * @return the assembled status event
 */
private RMNodeStatusEvent getRMStatusEvent(RMNode node) {
  NodeId sourceNode = node.getNodeID();

  long reportTime = System.currentTimeMillis();
  NodeHealthStatus healthReport = NodeHealthStatusPBImpl.newInstance(true, "HEALTHY", reportTime);

  List<ContainerStatus> statuses = Lists.newArrayList(getContainerStatus(node));

  int httpPort = node.getHttpPort();
  List<ApplicationId> keepAlive = Lists.newArrayList(getApplicationId(httpPort));

  NodeHeartbeatResponse heartbeat = new NodeHeartbeatResponsePBImpl();
  return new RMNodeStatusEvent(sourceNode, healthReport, statuses, keepAlive, heartbeat);
}
Aggregations