Use of org.apache.hadoop.yarn.api.records.ContainerStatus in the Apache Tez project.
Class DagAwareYarnTaskScheduler, method onContainersCompleted.
/**
 * Handles a batch of container-completion reports.
 *
 * <p>Nothing is done once a stop has been requested. Otherwise each status is
 * resolved to the task it belonged to: first by removing the container from
 * {@code releasedContainers}, and, if it was not there, by looking it up in
 * {@code heldContainers} and passing the held container to
 * {@code containerCompleted}. Statuses that cannot be tied to a task are
 * dropped. The app callbacks are issued only after the monitor is released.
 *
 * @param statuses completion statuses to process
 */
@Override
public void onContainersCompleted(List<ContainerStatus> statuses) {
  if (!stopRequested) {
    final List<TaskStatus> pendingCallbacks = new ArrayList<>(statuses.size());
    synchronized (this) {
      for (ContainerStatus completed : statuses) {
        final ContainerId containerId = completed.getContainerId();
        LOG.info("Container {} completed with status {}", containerId, completed);
        // A container released earlier takes precedence over a currently held one.
        Object owner = releasedContainers.remove(containerId);
        if (owner == null) {
          final HeldContainer held = heldContainers.get(containerId);
          owner = (held == null) ? null : containerCompleted(held);
        }
        if (owner == null) {
          // No task is associated with this container: nothing to report.
          continue;
        }
        pendingCallbacks.add(new TaskStatus(owner, completed));
      }
    }
    // The upcalls into the app happen without holding the scheduler lock.
    for (int i = 0; i < pendingCallbacks.size(); i++) {
      final TaskStatus callback = pendingCallbacks.get(i);
      getContext().containerCompleted(callback.task, callback.status);
    }
  }
}
Use of org.apache.hadoop.yarn.api.records.ContainerStatus in the Apache Tez project.
Class TestTaskScheduler, method testTaskSchedulerNoReuse.
/**
 * Exercises the task scheduler with container reuse disabled
 * ({@code TEZ_AM_CONTAINER_REUSE_ENABLED} is set to false). The test walks
 * through registration, task allocation and deallocation, container
 * completion callbacks, node blacklisting and unblacklisting, duplicate
 * allocation requests, node-update/error/shutdown callbacks and the final
 * unregistration.
 *
 * <p>All checks are Mockito verifications against the spied RM client and the
 * scheduler context. The mocks are never reset, so every {@code times(n)}
 * count below is cumulative from the start of the test.
 */
@SuppressWarnings({ "unchecked" })
@Test(timeout = 10000)
public void testTaskSchedulerNoReuse() throws Exception {
// ---- setup: spied RM client, reuse disabled, explicit heartbeat interval ----
AMRMClientAsyncForTest mockRMClient = spy(new AMRMClientAsyncForTest(new AMRMClientForTest(), 100));
Configuration conf = new Configuration();
conf.setBoolean(TezConfiguration.TEZ_AM_CONTAINER_REUSE_ENABLED, false);
int interval = 100;
conf.setInt(TezConfiguration.TEZ_AM_RM_HEARTBEAT_INTERVAL_MS_MAX, interval);
TaskSchedulerContext mockApp = setupMockTaskSchedulerContext(DEFAULT_APP_HOST, DEFAULT_APP_PORT, DEFAULT_APP_URL, conf);
TaskSchedulerContextDrainable drainableAppCallback = createDrainableContext(mockApp);
TaskSchedulerWithDrainableContext scheduler = new TaskSchedulerWithDrainableContext(drainableAppCallback, mockRMClient);
scheduler.initialize();
drainableAppCallback.drain();
// Verifying the validity of the configuration via the interval only instead of making sure
// it's the same instance.
verify(mockRMClient).setHeartbeatInterval(interval);
// ---- start: the RM client is started and the AM registers itself ----
scheduler.start();
drainableAppCallback.drain();
verify(mockRMClient).start();
verify(mockRMClient).registerApplicationMaster(DEFAULT_APP_HOST, DEFAULT_APP_PORT, DEFAULT_APP_URL);
RegisterApplicationMasterResponse regResponse = mockRMClient.getRegistrationResponse();
verify(mockApp).setApplicationRegistrationData(regResponse.getMaximumResourceCapability(), regResponse.getApplicationACLs(), regResponse.getClientToAMTokenMasterKey(), regResponse.getQueue());
Assert.assertEquals(scheduler.getClusterNodeCount(), mockRMClient.getClusterNodeCount());
// ---- shared request fixtures ----
Object mockTask1 = mock(Object.class);
Object mockCookie1 = mock(Object.class);
Resource mockCapability = mock(Resource.class);
String[] hosts = { "host1", "host5" };
String[] racks = { "/default-rack", "/default-rack" };
Priority mockPriority = mock(Priority.class);
ArgumentCaptor<CookieContainerRequest> requestCaptor = ArgumentCaptor.forClass(CookieContainerRequest.class);
// allocate task
scheduler.allocateTask(mockTask1, mockCapability, hosts, racks, mockPriority, null, mockCookie1);
drainableAppCallback.drain();
verify(mockRMClient, times(1)).addContainerRequest((CookieContainerRequest) any());
// returned from task requests before allocation happens
assertFalse(scheduler.deallocateTask(mockTask1, true, null, null));
verify(mockApp, times(0)).containerBeingReleased(any(ContainerId.class));
verify(mockRMClient, times(1)).removeContainerRequest((CookieContainerRequest) any());
verify(mockRMClient, times(0)).releaseAssignedContainer((ContainerId) any());
// deallocating unknown task
// (the counts are unchanged from the previous step: no further request removal or release)
assertFalse(scheduler.deallocateTask(mockTask1, true, null, null));
verify(mockApp, times(0)).containerBeingReleased(any(ContainerId.class));
verify(mockRMClient, times(1)).removeContainerRequest((CookieContainerRequest) any());
verify(mockRMClient, times(0)).releaseAssignedContainer((ContainerId) any());
// allocate tasks
// Each request is captured right after it is added so it can be matched against
// removeContainerRequest once containers arrive.
Object mockTask2 = mock(Object.class);
Object mockCookie2 = mock(Object.class);
Object mockTask3 = mock(Object.class);
Object mockCookie3 = mock(Object.class);
scheduler.allocateTask(mockTask1, mockCapability, hosts, racks, mockPriority, null, mockCookie1);
drainableAppCallback.drain();
verify(mockRMClient, times(2)).addContainerRequest(requestCaptor.capture());
CookieContainerRequest request1 = requestCaptor.getValue();
scheduler.allocateTask(mockTask2, mockCapability, hosts, racks, mockPriority, null, mockCookie2);
drainableAppCallback.drain();
verify(mockRMClient, times(3)).addContainerRequest(requestCaptor.capture());
CookieContainerRequest request2 = requestCaptor.getValue();
scheduler.allocateTask(mockTask3, mockCapability, hosts, racks, mockPriority, null, mockCookie3);
drainableAppCallback.drain();
verify(mockRMClient, times(4)).addContainerRequest(requestCaptor.capture());
CookieContainerRequest request3 = requestCaptor.getValue();
// ---- offer four containers (host1..host4) while only three requests are outstanding ----
List<Container> containers = new ArrayList<Container>();
Container mockContainer1 = mock(Container.class, RETURNS_DEEP_STUBS);
when(mockContainer1.getNodeId().getHost()).thenReturn("host1");
ContainerId mockCId1 = mock(ContainerId.class);
when(mockContainer1.getId()).thenReturn(mockCId1);
containers.add(mockContainer1);
Container mockContainer2 = mock(Container.class, RETURNS_DEEP_STUBS);
when(mockContainer2.getNodeId().getHost()).thenReturn("host2");
ContainerId mockCId2 = mock(ContainerId.class);
when(mockContainer2.getId()).thenReturn(mockCId2);
containers.add(mockContainer2);
Container mockContainer3 = mock(Container.class, RETURNS_DEEP_STUBS);
when(mockContainer3.getNodeId().getHost()).thenReturn("host3");
ContainerId mockCId3 = mock(ContainerId.class);
when(mockContainer3.getId()).thenReturn(mockCId3);
containers.add(mockContainer3);
Container mockContainer4 = mock(Container.class, RETURNS_DEEP_STUBS);
when(mockContainer4.getNodeId().getHost()).thenReturn("host4");
ContainerId mockCId4 = mock(ContainerId.class);
when(mockContainer4.getId()).thenReturn(mockCId4);
containers.add(mockContainer4);
scheduler.onContainersAllocated(containers);
drainableAppCallback.drain();
// first container allocated
verify(mockApp).taskAllocated(mockTask1, mockCookie1, mockContainer1);
verify(mockApp).taskAllocated(mockTask2, mockCookie2, mockContainer2);
verify(mockApp).taskAllocated(mockTask3, mockCookie3, mockContainer3);
// no other allocations returned
verify(mockApp, times(3)).taskAllocated(any(), any(), (Container) any());
verify(mockRMClient).removeContainerRequest(request1);
verify(mockRMClient).removeContainerRequest(request2);
verify(mockRMClient).removeContainerRequest(request3);
// verify unwanted container released
verify(mockRMClient).releaseAssignedContainer(mockCId4);
// deallocate allocated task
assertTrue(scheduler.deallocateTask(mockTask1, true, null, null));
drainableAppCallback.drain();
verify(mockApp).containerBeingReleased(mockCId1);
verify(mockRMClient).releaseAssignedContainer(mockCId1);
// deallocate allocated container
assertEqualsPlaceholderGuard:
Assert.assertEquals(mockTask2, scheduler.deallocateContainer(mockCId2));
drainableAppCallback.drain();
verify(mockRMClient).releaseAssignedContainer(mockCId2);
verify(mockRMClient, times(3)).releaseAssignedContainer((ContainerId) any());
// ---- report completion for all four containers ----
List<ContainerStatus> statuses = new ArrayList<ContainerStatus>();
ContainerStatus mockStatus1 = mock(ContainerStatus.class);
when(mockStatus1.getContainerId()).thenReturn(mockCId1);
statuses.add(mockStatus1);
ContainerStatus mockStatus2 = mock(ContainerStatus.class);
when(mockStatus2.getContainerId()).thenReturn(mockCId2);
statuses.add(mockStatus2);
ContainerStatus mockStatus3 = mock(ContainerStatus.class);
when(mockStatus3.getContainerId()).thenReturn(mockCId3);
statuses.add(mockStatus3);
ContainerStatus mockStatus4 = mock(ContainerStatus.class);
when(mockStatus4.getContainerId()).thenReturn(mockCId4);
statuses.add(mockStatus4);
scheduler.onContainersCompleted(statuses);
drainableAppCallback.drain();
// released container status returned
verify(mockApp).containerCompleted(mockTask1, mockStatus1);
verify(mockApp).containerCompleted(mockTask2, mockStatus2);
// currently allocated container status returned and not released
verify(mockApp).containerCompleted(mockTask3, mockStatus3);
// no other statuses returned
verify(mockApp, times(3)).containerCompleted(any(), (ContainerStatus) any());
verify(mockRMClient, times(3)).releaseAssignedContainer((ContainerId) any());
// verify blacklisting
verify(mockRMClient, times(0)).addNodeToBlacklist((NodeId) any());
String badHost = "host6";
NodeId badNodeId = mock(NodeId.class);
when(badNodeId.getHost()).thenReturn(badHost);
scheduler.blacklistNode(badNodeId);
verify(mockRMClient, times(1)).addNodeToBlacklist(badNodeId);
// new request without host or rack constraints (both arguments are null)
Object mockTask4 = mock(Object.class);
Object mockCookie4 = mock(Object.class);
scheduler.allocateTask(mockTask4, mockCapability, null, null, mockPriority, null, mockCookie4);
drainableAppCallback.drain();
verify(mockRMClient, times(5)).addContainerRequest(requestCaptor.capture());
// offer a container that lives on the blacklisted node
Container mockContainer5 = mock(Container.class, RETURNS_DEEP_STUBS);
when(mockContainer5.getNodeId().getHost()).thenReturn(badHost);
when(mockContainer5.getNodeId()).thenReturn(badNodeId);
ContainerId mockCId5 = mock(ContainerId.class);
when(mockContainer5.getId()).thenReturn(mockCId5);
containers.clear();
containers.add(mockContainer5);
scheduler.onContainersAllocated(containers);
drainableAppCallback.drain();
// no new allocation
verify(mockApp, times(3)).taskAllocated(any(), any(), (Container) any());
// verify blacklisted container released
verify(mockRMClient).releaseAssignedContainer(mockCId5);
verify(mockRMClient, times(4)).releaseAssignedContainer((ContainerId) any());
// verify request added back
verify(mockRMClient, times(6)).addContainerRequest(requestCaptor.capture());
// offer a container on a different host ("host7")
Container mockContainer6 = mock(Container.class, RETURNS_DEEP_STUBS);
when(mockContainer6.getNodeId().getHost()).thenReturn("host7");
ContainerId mockCId6 = mock(ContainerId.class);
when(mockContainer6.getId()).thenReturn(mockCId6);
containers.clear();
containers.add(mockContainer6);
scheduler.onContainersAllocated(containers);
drainableAppCallback.drain();
// new allocation
verify(mockApp, times(4)).taskAllocated(any(), any(), (Container) any());
verify(mockApp).taskAllocated(mockTask4, mockCookie4, mockContainer6);
// deallocate allocated task
assertTrue(scheduler.deallocateTask(mockTask4, true, null, null));
drainableAppCallback.drain();
verify(mockApp).containerBeingReleased(mockCId6);
verify(mockRMClient).releaseAssignedContainer(mockCId6);
verify(mockRMClient, times(5)).releaseAssignedContainer((ContainerId) any());
// test unblacklist
scheduler.unblacklistNode(badNodeId);
verify(mockRMClient, times(1)).removeNodeFromBlacklist(badNodeId);
assertEquals(0, scheduler.blacklistedNodes.size());
// progress reported by the scheduler is the value supplied by the app context
float progress = 0.5f;
when(mockApp.getProgress()).thenReturn(progress);
Assert.assertEquals(progress, scheduler.getProgress(), 0);
// check duplicate allocation request
scheduler.allocateTask(mockTask1, mockCapability, hosts, racks, mockPriority, null, mockCookie1);
drainableAppCallback.drain();
verify(mockRMClient, times(7)).addContainerRequest((CookieContainerRequest) any());
verify(mockRMClient, times(6)).removeContainerRequest((CookieContainerRequest) any());
scheduler.allocateTask(mockTask1, mockCapability, hosts, racks, mockPriority, null, mockCookie1);
drainableAppCallback.drain();
// old request removed and new one added
verify(mockRMClient, times(7)).removeContainerRequest((CookieContainerRequest) any());
verify(mockRMClient, times(8)).addContainerRequest((CookieContainerRequest) any());
assertFalse(scheduler.deallocateTask(mockTask1, true, null, null));
// ---- node updates, errors and shutdown requests are forwarded to the app context ----
List<NodeReport> mockUpdatedNodes = mock(List.class);
scheduler.onNodesUpdated(mockUpdatedNodes);
drainableAppCallback.drain();
verify(mockApp).nodesUpdated(mockUpdatedNodes);
ArgumentCaptor<String> argumentCaptor = ArgumentCaptor.forClass(String.class);
Exception mockException = new IOException("mockexception");
scheduler.onError(mockException);
drainableAppCallback.drain();
verify(mockApp).reportError(eq(YarnTaskSchedulerServiceError.RESOURCEMANAGER_ERROR), argumentCaptor.capture(), any(DagInfo.class));
// the reported message must carry the original exception text
assertTrue(argumentCaptor.getValue().contains("mockexception"));
scheduler.onShutdownRequest();
drainableAppCallback.drain();
verify(mockApp).appShutdownRequested();
// ---- shutdown: the AM unregisters with the final status supplied by the app context ----
AppFinalStatus finalStatus = new AppFinalStatus(FinalApplicationStatus.SUCCEEDED, SUCCEED_APP_MESSAGE, DEFAULT_APP_URL);
when(mockApp.getFinalAppStatus()).thenReturn(finalStatus);
scheduler.shutdown();
drainableAppCallback.drain();
verify(mockRMClient).unregisterApplicationMaster(FinalApplicationStatus.SUCCEEDED, SUCCEED_APP_MESSAGE, DEFAULT_APP_URL);
verify(mockRMClient).stop();
}
Use of org.apache.hadoop.yarn.api.records.ContainerStatus in the Apache Tez project.
Class TestTaskSchedulerManager, method testContainerDiskFailed.
/**
 * A container that completes with {@code ContainerExitStatus.DISKS_FAILED}
 * must produce exactly one {@code C_COMPLETED} event that is flagged as a
 * disk failure (not a preemption), carries the prefixed diagnostics text and
 * has {@code NODE_DISK_ERROR} as its termination cause.
 */
@Test(timeout = 5000)
public void testContainerDiskFailed() throws IOException {
  schedulerHandler.init(new Configuration(false));
  schedulerHandler.start();

  // Mocks: the task attempt, the completed container and its status report.
  final String nmDiagnostics = "NM disk failed.";
  final TaskAttemptImpl attempt = mock(TaskAttemptImpl.class);
  final ContainerStatus status = mock(ContainerStatus.class);
  final ContainerId containerId = mock(ContainerId.class);
  final AMContainer amContainer = mock(AMContainer.class);

  when(mockAMContainerMap.get(containerId)).thenReturn(amContainer);
  when(amContainer.getContainerId()).thenReturn(containerId);
  when(status.getContainerId()).thenReturn(containerId);
  when(status.getDiagnostics()).thenReturn(nmDiagnostics);
  when(status.getExitStatus()).thenReturn(ContainerExitStatus.DISKS_FAILED);

  schedulerHandler.containerCompleted(0, attempt, status);

  // Exactly one event is expected, and it is the completion event.
  assertEquals(1, mockEventHandler.events.size());
  final Event firstEvent = mockEventHandler.events.get(0);
  assertEquals(AMContainerEventType.C_COMPLETED, firstEvent.getType());

  final AMContainerEventCompleted completion = (AMContainerEventCompleted) firstEvent;
  assertEquals(containerId, completion.getContainerId());
  assertEquals("Container disk failed. NM disk failed.", completion.getDiagnostics());
  Assert.assertFalse(completion.isPreempted());
  assertTrue(completion.isDiskFailed());
  assertEquals(TaskAttemptTerminationCause.NODE_DISK_ERROR, completion.getTerminationCause());

  schedulerHandler.stop();
  schedulerHandler.close();
}
Use of org.apache.hadoop.yarn.api.records.ContainerStatus in the Apache Tez project.
Class YarnTaskSchedulerService, method onContainersCompleted.
// AMRMClientAsync interface methods
/**
 * Handles a batch of container-completion reports.
 *
 * <p>Once a stop has started the statuses are only logged (at debug level) and
 * otherwise ignored. For every other status the container is classified as:
 * <ul>
 *   <li>previously released by this scheduler - the task it was last
 *       allocated to is notified;</li>
 *   <li>currently assigned to a task - the container is unassigned (without a
 *       release, since it has already completed) and the task is
 *       notified;</li>
 *   <li>held but not assigned - a reschedule at the container's priority is
 *       attempted;</li>
 *   <li>unknown - ignored.</li>
 * </ul>
 * A held container that completes is also dropped from {@code heldContainers}
 * and its resource is deducted from {@code allocatedResources}.
 *
 * <p>App callbacks are issued after the monitor has been released.
 *
 * @param statuses completion statuses to process
 */
@Override
public void onContainersCompleted(List<ContainerStatus> statuses) {
  if (isStopStarted.get()) {
    if (LOG.isDebugEnabled()) {
      for (ContainerStatus status : statuses) {
        LOG.debug("Container " + status.getContainerId() + " is completed with ContainerStatus=" + status);
      }
    }
    return;
  }
  Map<Object, ContainerStatus> appContainerStatus = new HashMap<Object, ContainerStatus>(statuses.size());
  synchronized (this) {
    for (ContainerStatus containerStatus : statuses) {
      ContainerId completedId = containerStatus.getContainerId();
      HeldContainer delayedContainer = heldContainers.get(completedId);
      Object task = releasedContainers.remove(completedId);
      if (task != null) {
        if (delayedContainer != null) {
          LOG.warn("Held container should be null since releasedContainer is not");
        }
        // an allocated container completed. notify app
        if (LOG.isDebugEnabled()) {
          LOG.debug("Released container completed:" + completedId + " last allocated to task: " + task);
        }
        appContainerStatus.put(task, containerStatus);
        continue;
      }
      // not found in released containers. check currently allocated containers
      // no need to release this container as the RM has already completed it
      task = unAssignContainer(completedId, false);
      if (delayedContainer != null) {
        heldContainers.remove(completedId);
        // Resources.subtract() works on a copy and returns it, so calling it and discarding the
        // result left allocatedResources unchanged. subtractFrom() updates the left operand.
        Resources.subtractFrom(allocatedResources, delayedContainer.getContainer().getResource());
      } else {
        LOG.warn("Held container expected to be not null for a non-AM-released container");
      }
      if (task != null) {
        // completion of a container we have allocated currently
        // an allocated container completed. notify app. This will cause attempt to get killed
        LOG.info("Allocated container completed:" + completedId + " last allocated to task: " + task);
        appContainerStatus.put(task, containerStatus);
        continue;
      }
      // container neither allocated nor released
      if (delayedContainer != null) {
        LOG.info("Delayed container {} completed", containerStatus.getContainerId());
        maybeRescheduleContainerAtPriority(delayedContainer.getContainer().getPriority());
      } else {
        LOG.info("Ignoring unknown container: " + containerStatus.getContainerId());
      }
    }
  }
  // upcall to app must be outside locks
  for (Entry<Object, ContainerStatus> entry : appContainerStatus.entrySet()) {
    getContext().containerCompleted(entry.getKey(), entry.getValue());
  }
}
Use of org.apache.hadoop.yarn.api.records.ContainerStatus in the Apache Apex (apex-core) project.
Class StreamingAppMasterService, method execute.
/**
 * Main run function for the application master.
 *
 * <p>Registers with the ResourceManager and then loops, roughly once per
 * second, until {@code appDone} is set. Each iteration:
 * <ol>
 *   <li>renews tokens, refreshes node reports when due and runs queued
 *       {@code pendingTasks};</li>
 *   <li>turns pending container start requests into container requests, with
 *       memory and vcores clamped to the cluster minimum/maximum;</li>
 *   <li>removes nodes from the blacklist once the removal time has
 *       elapsed;</li>
 *   <li>sends the ask to the RM, then launches or releases the containers
 *       that were allocated;</li>
 *   <li>processes completed containers: cancels delegation tokens, tracks
 *       per-node failures for blacklisting, schedules restarts for failed
 *       containers and records stop events;</li>
 *   <li>decides whether the application is finished.</li>
 * </ol>
 * The loop is skipped and the application is finished as FAILED when another
 * started application with the same name and user is found.
 *
 * @throws YarnException declared for the ResourceManager interactions
 * @throws IOException declared for the ResourceManager interactions
 */
@SuppressWarnings("SleepWhileInLoop")
private void execute() throws YarnException, IOException {
  LOG.info("Starting ApplicationMaster");
  final Configuration conf = getConfig();
  if (UserGroupInformation.isSecurityEnabled()) {
    tokenRenewer = new TokenRenewer(dag, true, conf, appAttemptID.getApplicationId().toString());
  }
  // Register self with ResourceManager
  RegisterApplicationMasterResponse response = amRmClient.registerApplicationMaster(appMasterHostname, 0, appMasterTrackingUrl);
  // Dump out information about cluster capability as seen by the resource manager
  int maxMem = response.getMaximumResourceCapability().getMemory();
  int maxVcores = response.getMaximumResourceCapability().getVirtualCores();
  int minMem = conf.getInt("yarn.scheduler.minimum-allocation-mb", 0);
  int minVcores = conf.getInt("yarn.scheduler.minimum-allocation-vcores", 0);
  LOG.info("Max mem {}m, Min mem {}m, Max vcores {} and Min vcores {} capabililty of resources in this cluster ", maxMem, minMem, maxVcores, minVcores);
  long blacklistRemovalTime = dag.getValue(DAGContext.BLACKLISTED_NODE_REMOVAL_TIME_MILLIS);
  int maxConsecutiveContainerFailures = dag.getValue(DAGContext.MAX_CONSECUTIVE_CONTAINER_FAILURES_FOR_BLACKLIST);
  LOG.info("Blacklist removal time in millis = {}, max consecutive node failure count = {}", blacklistRemovalTime, maxConsecutiveContainerFailures);
  // for locality relaxation fall back
  Map<StreamingContainerAgent.ContainerStartRequest, MutablePair<Integer, ContainerRequest>> requestedResources = Maps.newHashMap();
  // Setup heartbeat emitter
  // TODO poll RM every now and then with an empty request to let RM know that we are alive
  // The heartbeat interval after which an AM is timed out by the RM is defined by a config setting:
  // RM_AM_EXPIRY_INTERVAL_MS with default defined by DEFAULT_RM_AM_EXPIRY_INTERVAL_MS
  // The allocate calls to the RM count as heartbeat so, for now, this additional heartbeat emitter
  // is not required.
  int loopCounter = -1;
  long nodeReportUpdateTime = 0;
  // keep track of already requested containers to not request them again while waiting for allocation
  int numRequestedContainers = 0;
  int numReleasedContainers = 0;
  int nextRequestPriority = 0;
  // Use override for resource requestor in case of cloudera distribution, to handle host specific requests
  ResourceRequestHandler resourceRequestor = System.getenv().containsKey("CDH_HADOOP_BIN") ? new BlacklistBasedResourceRequestHandler() : new ResourceRequestHandler();
  List<ContainerStartRequest> pendingContainerStartRequests = new LinkedList<>();
  try (YarnClient clientRMService = StramClientUtils.createYarnClient(conf)) {
    try {
      // YARN-435
      // we need getClusterNodes to populate the initial node list,
      // subsequent updates come through the heartbeat response
      ApplicationReport ar = StramClientUtils.getStartedAppInstanceByName(clientRMService, dag.getAttributes().get(DAG.APPLICATION_NAME), UserGroupInformation.getLoginUser().getUserName(), dag.getAttributes().get(DAG.APPLICATION_ID));
      if (ar != null) {
        appDone = true;
        dnmgr.shutdownDiagnosticsMessage = String.format("Application master failed due to application %s with duplicate application name \"%s\" by the same user \"%s\" is already started.", ar.getApplicationId().toString(), ar.getName(), ar.getUser());
        LOG.info("Forced shutdown due to {}", dnmgr.shutdownDiagnosticsMessage);
        finishApplication(FinalApplicationStatus.FAILED);
        return;
      }
      resourceRequestor.updateNodeReports(clientRMService.getNodeReports());
      nodeReportUpdateTime = System.currentTimeMillis() + UPDATE_NODE_REPORTS_INTERVAL;
    } catch (Exception e) {
      throw new RuntimeException("Failed to retrieve cluster nodes report.", e);
    }
    List<Container> containers = response.getContainersFromPreviousAttempts();
    // Running containers might take a while to register with the new app master and send the heartbeat signal.
    int waitForRecovery = containers.size() > 0 ? dag.getValue(LogicalPlan.HEARTBEAT_TIMEOUT_MILLIS) / 1000 : 0;
    List<ContainerId> releasedContainers = previouslyAllocatedContainers(containers);
    FinalApplicationStatus finalStatus = FinalApplicationStatus.SUCCEEDED;
    final InetSocketAddress rmAddress = conf.getSocketAddr(YarnConfiguration.RM_ADDRESS, YarnConfiguration.DEFAULT_RM_ADDRESS, YarnConfiguration.DEFAULT_RM_PORT);
    while (!appDone) {
      loopCounter++;
      final long currentTimeMillis = System.currentTimeMillis();
      if (tokenRenewer != null) {
        tokenRenewer.checkAndRenew();
      }
      if (currentTimeMillis > nodeReportUpdateTime) {
        resourceRequestor.updateNodeReports(clientRMService.getNodeReports());
        nodeReportUpdateTime = currentTimeMillis + UPDATE_NODE_REPORTS_INTERVAL;
      }
      Runnable r;
      while ((r = this.pendingTasks.poll()) != null) {
        r.run();
      }
      // need not have any available containers
      try {
        sleep(1000);
      } catch (InterruptedException e) {
        LOG.info("Sleep interrupted", e);
      }
      // Setup request to be sent to RM to allocate containers
      List<ContainerRequest> containerRequests = new ArrayList<>();
      List<ContainerRequest> removedContainerRequests = new ArrayList<>();
      // request containers for pending deploy requests
      if (!dnmgr.containerStartRequests.isEmpty()) {
        StreamingContainerAgent.ContainerStartRequest csr;
        while ((csr = dnmgr.containerStartRequests.poll()) != null) {
          if (csr.container.getRequiredMemoryMB() > maxMem) {
            LOG.warn("Container memory {}m above max threshold of cluster. Using max value {}m.", csr.container.getRequiredMemoryMB(), maxMem);
            csr.container.setRequiredMemoryMB(maxMem);
          }
          if (csr.container.getRequiredMemoryMB() < minMem) {
            csr.container.setRequiredMemoryMB(minMem);
          }
          if (csr.container.getRequiredVCores() > maxVcores) {
            LOG.warn("Container vcores {} above max threshold of cluster. Using max value {}.", csr.container.getRequiredVCores(), maxVcores);
            csr.container.setRequiredVCores(maxVcores);
          }
          if (csr.container.getRequiredVCores() < minVcores) {
            csr.container.setRequiredVCores(minVcores);
          }
          // The priority doubles as the key used further down to match an allocated container
          // back to its start request.
          csr.container.setResourceRequestPriority(nextRequestPriority++);
          ContainerRequest cr = resourceRequestor.createContainerRequest(csr, true);
          if (cr == null) {
            pendingContainerStartRequests.add(csr);
          } else {
            resourceRequestor.addContainerRequest(requestedResources, loopCounter, containerRequests, csr, cr);
          }
        }
      }
      // If all other requests are allocated, retry pending requests which need host availability
      if (containerRequests.isEmpty() && !pendingContainerStartRequests.isEmpty()) {
        List<ContainerStartRequest> removalList = new LinkedList<>();
        for (ContainerStartRequest csr : pendingContainerStartRequests) {
          ContainerRequest cr = resourceRequestor.createContainerRequest(csr, true);
          if (cr != null) {
            resourceRequestor.addContainerRequest(requestedResources, loopCounter, containerRequests, csr, cr);
            removalList.add(csr);
          }
        }
        pendingContainerStartRequests.removeAll(removalList);
      }
      resourceRequestor.reissueContainerRequests(amRmClient, requestedResources, loopCounter, resourceRequestor, containerRequests, removedContainerRequests);
      /* Remove nodes from blacklist after timeout */
      List<String> blacklistRemovals = new ArrayList<>();
      for (String hostname : failedBlackListedNodes) {
        Long timeDiff = currentTimeMillis - failedContainerNodesMap.get(hostname).blackListAdditionTime;
        if (timeDiff >= blacklistRemovalTime) {
          blacklistRemovals.add(hostname);
          failedContainerNodesMap.remove(hostname);
        }
      }
      if (!blacklistRemovals.isEmpty()) {
        amRmClient.updateBlacklist(null, blacklistRemovals);
        LOG.info("Removing nodes {} from blacklist: time elapsed since last blacklisting due to failure is greater than specified timeout", blacklistRemovals.toString());
        failedBlackListedNodes.removeAll(blacklistRemovals);
      }
      numRequestedContainers += containerRequests.size() - removedContainerRequests.size();
      AllocateResponse amResp = sendContainerAskToRM(containerRequests, removedContainerRequests, releasedContainers);
      if (amResp.getAMCommand() != null) {
        LOG.info(" statement executed:{}", amResp.getAMCommand());
        // Every AM command currently aborts the run with the same exception.
        switch (amResp.getAMCommand()) {
          case AM_RESYNC:
          case AM_SHUTDOWN:
            throw new YarnRuntimeException("Received the " + amResp.getAMCommand() + " command from RM");
          default:
            throw new YarnRuntimeException("Received the " + amResp.getAMCommand() + " command from RM");
        }
      }
      releasedContainers.clear();
      // Retrieve list of allocated containers from the response
      List<Container> newAllocatedContainers = amResp.getAllocatedContainers();
      // LOG.info("Got response from RM for container ask, allocatedCnt=" + newAllocatedContainers.size());
      numRequestedContainers -= newAllocatedContainers.size();
      long timestamp = System.currentTimeMillis();
      for (Container allocatedContainer : newAllocatedContainers) {
        LOG.info("Got new container." + ", containerId=" + allocatedContainer.getId() + ", containerNode=" + allocatedContainer.getNodeId() + ", containerNodeURI=" + allocatedContainer.getNodeHttpAddress() + ", containerResourceMemory" + allocatedContainer.getResource().getMemory() + ", priority" + allocatedContainer.getPriority());
        // + ", containerToken" + allocatedContainer.getContainerToken().getIdentifier().toString());
        // Find the outstanding start request whose priority matches this container.
        boolean alreadyAllocated = true;
        StreamingContainerAgent.ContainerStartRequest csr = null;
        for (Map.Entry<StreamingContainerAgent.ContainerStartRequest, MutablePair<Integer, ContainerRequest>> entry : requestedResources.entrySet()) {
          if (entry.getKey().container.getResourceRequestPriority() == allocatedContainer.getPriority().getPriority()) {
            alreadyAllocated = false;
            csr = entry.getKey();
            break;
          }
        }
        if (alreadyAllocated) {
          LOG.info("Releasing {} as resource with priority {} was already assigned", allocatedContainer.getId(), allocatedContainer.getPriority());
          releasedContainers.add(allocatedContainer.getId());
          numReleasedContainers++;
          // undo the decrement above for this allocated container
          numRequestedContainers++;
          continue;
        }
        if (csr != null) {
          requestedResources.remove(csr);
        }
        // allocate resource to container
        ContainerResource resource = new ContainerResource(allocatedContainer.getPriority().getPriority(), allocatedContainer.getId().toString(), allocatedContainer.getNodeId().toString(), allocatedContainer.getResource().getMemory(), allocatedContainer.getResource().getVirtualCores(), allocatedContainer.getNodeHttpAddress());
        StreamingContainerAgent sca = dnmgr.assignContainer(resource, null);
        if (sca == null) {
          // allocated container no longer needed, add release request
          LOG.warn("Container {} allocated but nothing to deploy, going to release this container.", allocatedContainer.getId());
          releasedContainers.add(allocatedContainer.getId());
        } else {
          AllocatedContainer allocatedContainerHolder = new AllocatedContainer(allocatedContainer);
          this.allocatedContainers.put(allocatedContainer.getId().toString(), allocatedContainerHolder);
          ByteBuffer tokens = null;
          if (UserGroupInformation.isSecurityEnabled()) {
            UserGroupInformation ugi = UserGroupInformation.getLoginUser();
            Token<StramDelegationTokenIdentifier> delegationToken = allocateDelegationToken(ugi.getUserName(), heartbeatListener.getAddress());
            allocatedContainerHolder.delegationToken = delegationToken;
            // ByteBuffer tokens = LaunchContainerRunnable.getTokens(delegationTokenManager, heartbeatListener.getAddress());
            tokens = LaunchContainerRunnable.getTokens(ugi, delegationToken);
          }
          LaunchContainerRunnable launchContainer = new LaunchContainerRunnable(allocatedContainer, nmClient, sca, tokens);
          // Thread launchThread = new Thread(runnableLaunchContainer);
          // launchThreads.add(launchThread);
          // launchThread.start();
          // communication with NMs is now async
          launchContainer.run();
          // record container start event
          StramEvent ev = new StramEvent.StartContainerEvent(allocatedContainer.getId().toString(), allocatedContainer.getNodeId().toString(), groupingManager.getEventGroupIdForAffectedContainer(allocatedContainer.getId().toString()));
          ev.setTimestamp(timestamp);
          dnmgr.recordEventAsync(ev);
        }
      }
      // track node updates for future locality constraint allocations
      // TODO: it seems 2.0.4-alpha doesn't give us any updates
      resourceRequestor.updateNodeReports(amResp.getUpdatedNodes());
      // Check the completed containers
      List<ContainerStatus> completedContainers = amResp.getCompletedContainersStatuses();
      // LOG.debug("Got response from RM for container ask, completedCnt=" + completedContainers.size());
      List<String> blacklistAdditions = new ArrayList<>();
      for (ContainerStatus containerStatus : completedContainers) {
        LOG.info("Completed containerId=" + containerStatus.getContainerId() + ", state=" + containerStatus.getState() + ", exitStatus=" + containerStatus.getExitStatus() + ", diagnostics=" + containerStatus.getDiagnostics());
        // non complete containers should not be here
        assert (containerStatus.getState() == ContainerState.COMPLETE);
        // May be null: the completed container is not necessarily one tracked in allocatedContainers.
        AllocatedContainer allocatedContainer = allocatedContainers.remove(containerStatus.getContainerId().toString());
        if (allocatedContainer != null && allocatedContainer.delegationToken != null) {
          UserGroupInformation ugi = UserGroupInformation.getLoginUser();
          delegationTokenManager.cancelToken(allocatedContainer.delegationToken, ugi.getUserName());
        }
        EventGroupId groupId = null;
        int exitStatus = containerStatus.getExitStatus();
        if (0 != exitStatus) {
          if (allocatedContainer != null) {
            numFailedContainers.incrementAndGet();
            if (exitStatus != 1 && maxConsecutiveContainerFailures != Integer.MAX_VALUE) {
              // If container failure due to framework
              String hostname = allocatedContainer.container.getNodeId().getHost();
              if (!failedBlackListedNodes.contains(hostname)) {
                // Blacklist the node if not already blacklisted
                if (failedContainerNodesMap.containsKey(hostname)) {
                  NodeFailureStats stats = failedContainerNodesMap.get(hostname);
                  long timeStamp = System.currentTimeMillis();
                  if (timeStamp - stats.lastFailureTimeStamp >= blacklistRemovalTime) {
                    // Reset failure count if last failure was before Blacklist removal time
                    stats.failureCount = 1;
                    stats.lastFailureTimeStamp = timeStamp;
                  } else {
                    stats.lastFailureTimeStamp = timeStamp;
                    stats.failureCount++;
                    if (stats.failureCount >= maxConsecutiveContainerFailures) {
                      LOG.info("Node {} failed {} times consecutively within {} minutes, marking the node blacklisted", hostname, stats.failureCount, blacklistRemovalTime / (60 * 1000));
                      blacklistAdditions.add(hostname);
                      failedBlackListedNodes.add(hostname);
                    }
                  }
                } else {
                  failedContainerNodesMap.put(hostname, new NodeFailureStats(System.currentTimeMillis(), 1));
                }
              }
            }
          }
          // if (exitStatus == 1) {
          // // non-recoverable StreamingContainer failure
          // appDone = true;
          // finalStatus = FinalApplicationStatus.FAILED;
          // dnmgr.shutdownDiagnosticsMessage = "Unrecoverable failure " + containerStatus.getContainerId();
          // LOG.info("Exiting due to: {}", dnmgr.shutdownDiagnosticsMessage);
          // }
          // else {
          // Recoverable failure or process killed (externally or via stop request by AM)
          // also occurs when a container was released by the application but never assigned/launched
          LOG.debug("Container {} failed or killed.", containerStatus.getContainerId());
          String containerIdStr = containerStatus.getContainerId().toString();
          dnmgr.scheduleContainerRestart(containerIdStr);
          groupId = groupingManager.getEventGroupIdForAffectedContainer(containerIdStr);
          // }
        } else {
          // container completed successfully
          numCompletedContainers.incrementAndGet();
          LOG.info("Container completed successfully." + ", containerId=" + containerStatus.getContainerId());
          // Reset counter for node failure, if exists. The container may be unknown to
          // allocatedContainers (remove() above returned null), so guard the dereference the same
          // way the failure branch does instead of risking a NullPointerException.
          if (allocatedContainer != null) {
            String hostname = allocatedContainer.container.getNodeId().getHost();
            NodeFailureStats stats = failedContainerNodesMap.get(hostname);
            if (stats != null) {
              stats.failureCount = 0;
            }
          }
        }
        String containerIdStr = containerStatus.getContainerId().toString();
        dnmgr.removeContainerAgent(containerIdStr);
        // record container stop event
        StramEvent ev = new StramEvent.StopContainerEvent(containerIdStr, containerStatus.getExitStatus(), groupId);
        ev.setReason(containerStatus.getDiagnostics());
        dnmgr.recordEventAsync(ev);
      }
      if (!blacklistAdditions.isEmpty()) {
        amRmClient.updateBlacklist(blacklistAdditions, null);
        long timeStamp = System.currentTimeMillis();
        for (String hostname : blacklistAdditions) {
          NodeFailureStats stats = failedContainerNodesMap.get(hostname);
          stats.blackListAdditionTime = timeStamp;
        }
      }
      if (dnmgr.forcedShutdown) {
        LOG.info("Forced shutdown due to {}", dnmgr.shutdownDiagnosticsMessage);
        finalStatus = FinalApplicationStatus.FAILED;
        appDone = true;
      } else if (allocatedContainers.isEmpty() && numRequestedContainers == 0 && dnmgr.containerStartRequests.isEmpty()) {
        LOG.debug("Exiting as no more containers are allocated or requested");
        finalStatus = FinalApplicationStatus.SUCCEEDED;
        appDone = true;
      }
      LOG.debug("Current application state: loop={}, appDone={}, requested={}, released={}, completed={}, failed={}, currentAllocated={}, dnmgr.containerStartRequests={}", loopCounter, appDone, numRequestedContainers, numReleasedContainers, numCompletedContainers, numFailedContainers, allocatedContainers.size(), dnmgr.containerStartRequests);
      // monitor child containers
      dnmgr.monitorHeartbeat(waitForRecovery > 0);
      waitForRecovery = Math.max(waitForRecovery - 1, 0);
    }
    finishApplication(finalStatus);
  }
}
Aggregations