Search in sources :

Example 66 with ContainerStatus

use of org.apache.hadoop.yarn.api.records.ContainerStatus in project tez by apache.

the class DagAwareYarnTaskScheduler method onContainersCompleted.

/**
 * Callback invoked with the statuses of containers that have completed.
 *
 * <p>Does nothing once a stop has been requested. Otherwise, for every status the task
 * associated with the container is resolved: first from the released-container map (the
 * entry is removed), and if absent there, from the held-container map via
 * {@code containerCompleted(HeldContainer)}. Containers for which no task is found are
 * skipped. The resolved (task, status) pairs are collected while holding this object's
 * monitor and delivered to the context only after the monitor has been released.
 *
 * @param statuses completion statuses to process
 */
@Override
public void onContainersCompleted(List<ContainerStatus> statuses) {
    if (stopRequested) {
        return;
    }
    final List<TaskStatus> completedTasks = new ArrayList<>(statuses.size());
    synchronized (this) {
        for (ContainerStatus containerStatus : statuses) {
            final ContainerId containerId = containerStatus.getContainerId();
            LOG.info("Container {} completed with status {}", containerId, containerStatus);
            // a container we already released takes precedence over one we still hold
            Object assignedTask = releasedContainers.remove(containerId);
            if (assignedTask == null) {
                final HeldContainer held = heldContainers.get(containerId);
                if (held != null) {
                    assignedTask = containerCompleted(held);
                }
            }
            if (assignedTask == null) {
                // nothing to report to the app for this container
                continue;
            }
            completedTasks.add(new TaskStatus(assignedTask, containerStatus));
        }
    }
    // the app callbacks are deliberately issued after leaving the synchronized section
    for (TaskStatus completed : completedTasks) {
        getContext().containerCompleted(completed.task, completed.status);
    }
}
Also used : ContainerStatus(org.apache.hadoop.yarn.api.records.ContainerStatus) ContainerId(org.apache.hadoop.yarn.api.records.ContainerId) ArrayList(java.util.ArrayList)

Example 67 with ContainerStatus

use of org.apache.hadoop.yarn.api.records.ContainerStatus in project tez by apache.

the class TestTaskScheduler method testTaskSchedulerNoReuse.

/**
 * End-to-end walk through the scheduler with container reuse disabled
 * ({@code TEZ_AM_CONTAINER_REUSE_ENABLED = false}).
 *
 * <p>The test drives the scheduler through, in order: initialization and registration,
 * allocate/deallocate of a task before any container arrives, allocation of three tasks
 * against four offered containers, deallocation by task and by container, container
 * completion callbacks, node blacklisting and un-blacklisting, duplicate allocation
 * requests, node updates, error reporting, and shutdown.
 *
 * <p>Most verifications use cumulative {@code times(n)} counts on the same mocks, so the
 * statement order in this method is significant.
 */
@SuppressWarnings({ "unchecked" })
@Test(timeout = 10000)
public void testTaskSchedulerNoReuse() throws Exception {
    // spy on the test RM client so that the calls the scheduler makes on it can be verified
    AMRMClientAsyncForTest mockRMClient = spy(new AMRMClientAsyncForTest(new AMRMClientForTest(), 100));
    Configuration conf = new Configuration();
    conf.setBoolean(TezConfiguration.TEZ_AM_CONTAINER_REUSE_ENABLED, false);
    int interval = 100;
    conf.setInt(TezConfiguration.TEZ_AM_RM_HEARTBEAT_INTERVAL_MS_MAX, interval);
    TaskSchedulerContext mockApp = setupMockTaskSchedulerContext(DEFAULT_APP_HOST, DEFAULT_APP_PORT, DEFAULT_APP_URL, conf);
    TaskSchedulerContextDrainable drainableAppCallback = createDrainableContext(mockApp);
    TaskSchedulerWithDrainableContext scheduler = new TaskSchedulerWithDrainableContext(drainableAppCallback, mockRMClient);
    scheduler.initialize();
    drainableAppCallback.drain();
    // Verifying the validity of the configuration via the interval only instead of making sure
    // it's the same instance.
    verify(mockRMClient).setHeartbeatInterval(interval);
    scheduler.start();
    drainableAppCallback.drain();
    verify(mockRMClient).start();
    verify(mockRMClient).registerApplicationMaster(DEFAULT_APP_HOST, DEFAULT_APP_PORT, DEFAULT_APP_URL);
    // the data in the registration response must have been handed to the app context
    RegisterApplicationMasterResponse regResponse = mockRMClient.getRegistrationResponse();
    verify(mockApp).setApplicationRegistrationData(regResponse.getMaximumResourceCapability(), regResponse.getApplicationACLs(), regResponse.getClientToAMTokenMasterKey(), regResponse.getQueue());
    Assert.assertEquals(scheduler.getClusterNodeCount(), mockRMClient.getClusterNodeCount());
    Object mockTask1 = mock(Object.class);
    Object mockCookie1 = mock(Object.class);
    Resource mockCapability = mock(Resource.class);
    String[] hosts = { "host1", "host5" };
    String[] racks = { "/default-rack", "/default-rack" };
    Priority mockPriority = mock(Priority.class);
    // captures the container requests the scheduler adds to the RM client
    ArgumentCaptor<CookieContainerRequest> requestCaptor = ArgumentCaptor.forClass(CookieContainerRequest.class);
    // allocate task
    scheduler.allocateTask(mockTask1, mockCapability, hosts, racks, mockPriority, null, mockCookie1);
    drainableAppCallback.drain();
    verify(mockRMClient, times(1)).addContainerRequest((CookieContainerRequest) any());
    // returned from task requests before allocation happens
    assertFalse(scheduler.deallocateTask(mockTask1, true, null, null));
    verify(mockApp, times(0)).containerBeingReleased(any(ContainerId.class));
    verify(mockRMClient, times(1)).removeContainerRequest((CookieContainerRequest) any());
    verify(mockRMClient, times(0)).releaseAssignedContainer((ContainerId) any());
    // deallocating unknown task
    // (the remove/release counts below are expected to be unchanged from the previous step)
    assertFalse(scheduler.deallocateTask(mockTask1, true, null, null));
    verify(mockApp, times(0)).containerBeingReleased(any(ContainerId.class));
    verify(mockRMClient, times(1)).removeContainerRequest((CookieContainerRequest) any());
    verify(mockRMClient, times(0)).releaseAssignedContainer((ContainerId) any());
    // allocate tasks
    Object mockTask2 = mock(Object.class);
    Object mockCookie2 = mock(Object.class);
    Object mockTask3 = mock(Object.class);
    Object mockCookie3 = mock(Object.class);
    // after each allocateTask, grab the most recently captured request so that its removal
    // can be verified once containers have been assigned
    scheduler.allocateTask(mockTask1, mockCapability, hosts, racks, mockPriority, null, mockCookie1);
    drainableAppCallback.drain();
    verify(mockRMClient, times(2)).addContainerRequest(requestCaptor.capture());
    CookieContainerRequest request1 = requestCaptor.getValue();
    scheduler.allocateTask(mockTask2, mockCapability, hosts, racks, mockPriority, null, mockCookie2);
    drainableAppCallback.drain();
    verify(mockRMClient, times(3)).addContainerRequest(requestCaptor.capture());
    CookieContainerRequest request2 = requestCaptor.getValue();
    scheduler.allocateTask(mockTask3, mockCapability, hosts, racks, mockPriority, null, mockCookie3);
    drainableAppCallback.drain();
    verify(mockRMClient, times(4)).addContainerRequest(requestCaptor.capture());
    CookieContainerRequest request3 = requestCaptor.getValue();
    // offer four containers (host1..host4) while only three task requests are outstanding
    List<Container> containers = new ArrayList<Container>();
    Container mockContainer1 = mock(Container.class, RETURNS_DEEP_STUBS);
    when(mockContainer1.getNodeId().getHost()).thenReturn("host1");
    ContainerId mockCId1 = mock(ContainerId.class);
    when(mockContainer1.getId()).thenReturn(mockCId1);
    containers.add(mockContainer1);
    Container mockContainer2 = mock(Container.class, RETURNS_DEEP_STUBS);
    when(mockContainer2.getNodeId().getHost()).thenReturn("host2");
    ContainerId mockCId2 = mock(ContainerId.class);
    when(mockContainer2.getId()).thenReturn(mockCId2);
    containers.add(mockContainer2);
    Container mockContainer3 = mock(Container.class, RETURNS_DEEP_STUBS);
    when(mockContainer3.getNodeId().getHost()).thenReturn("host3");
    ContainerId mockCId3 = mock(ContainerId.class);
    when(mockContainer3.getId()).thenReturn(mockCId3);
    containers.add(mockContainer3);
    Container mockContainer4 = mock(Container.class, RETURNS_DEEP_STUBS);
    when(mockContainer4.getNodeId().getHost()).thenReturn("host4");
    ContainerId mockCId4 = mock(ContainerId.class);
    when(mockContainer4.getId()).thenReturn(mockCId4);
    containers.add(mockContainer4);
    scheduler.onContainersAllocated(containers);
    drainableAppCallback.drain();
    // first container allocated
    verify(mockApp).taskAllocated(mockTask1, mockCookie1, mockContainer1);
    verify(mockApp).taskAllocated(mockTask2, mockCookie2, mockContainer2);
    verify(mockApp).taskAllocated(mockTask3, mockCookie3, mockContainer3);
    // no other allocations returned
    verify(mockApp, times(3)).taskAllocated(any(), any(), (Container) any());
    verify(mockRMClient).removeContainerRequest(request1);
    verify(mockRMClient).removeContainerRequest(request2);
    verify(mockRMClient).removeContainerRequest(request3);
    // verify unwanted container released
    verify(mockRMClient).releaseAssignedContainer(mockCId4);
    // deallocate allocated task
    assertTrue(scheduler.deallocateTask(mockTask1, true, null, null));
    drainableAppCallback.drain();
    verify(mockApp).containerBeingReleased(mockCId1);
    verify(mockRMClient).releaseAssignedContainer(mockCId1);
    // deallocate allocated container
    Assert.assertEquals(mockTask2, scheduler.deallocateContainer(mockCId2));
    drainableAppCallback.drain();
    verify(mockRMClient).releaseAssignedContainer(mockCId2);
    // three releases so far: containers 4, 1 and 2
    verify(mockRMClient, times(3)).releaseAssignedContainer((ContainerId) any());
    // report completion statuses for all four containers in a single callback
    List<ContainerStatus> statuses = new ArrayList<ContainerStatus>();
    ContainerStatus mockStatus1 = mock(ContainerStatus.class);
    when(mockStatus1.getContainerId()).thenReturn(mockCId1);
    statuses.add(mockStatus1);
    ContainerStatus mockStatus2 = mock(ContainerStatus.class);
    when(mockStatus2.getContainerId()).thenReturn(mockCId2);
    statuses.add(mockStatus2);
    ContainerStatus mockStatus3 = mock(ContainerStatus.class);
    when(mockStatus3.getContainerId()).thenReturn(mockCId3);
    statuses.add(mockStatus3);
    ContainerStatus mockStatus4 = mock(ContainerStatus.class);
    when(mockStatus4.getContainerId()).thenReturn(mockCId4);
    statuses.add(mockStatus4);
    scheduler.onContainersCompleted(statuses);
    drainableAppCallback.drain();
    // released container status returned
    verify(mockApp).containerCompleted(mockTask1, mockStatus1);
    verify(mockApp).containerCompleted(mockTask2, mockStatus2);
    // currently allocated container status returned and not released
    verify(mockApp).containerCompleted(mockTask3, mockStatus3);
    // no other statuses returned
    verify(mockApp, times(3)).containerCompleted(any(), (ContainerStatus) any());
    verify(mockRMClient, times(3)).releaseAssignedContainer((ContainerId) any());
    // verify blacklisting
    verify(mockRMClient, times(0)).addNodeToBlacklist((NodeId) any());
    String badHost = "host6";
    NodeId badNodeId = mock(NodeId.class);
    when(badNodeId.getHost()).thenReturn(badHost);
    scheduler.blacklistNode(badNodeId);
    verify(mockRMClient, times(1)).addNodeToBlacklist(badNodeId);
    // request a task without host/rack constraints (hosts and racks are null)
    Object mockTask4 = mock(Object.class);
    Object mockCookie4 = mock(Object.class);
    scheduler.allocateTask(mockTask4, mockCapability, null, null, mockPriority, null, mockCookie4);
    drainableAppCallback.drain();
    verify(mockRMClient, times(5)).addContainerRequest(requestCaptor.capture());
    // offer a container that lives on the blacklisted node
    Container mockContainer5 = mock(Container.class, RETURNS_DEEP_STUBS);
    when(mockContainer5.getNodeId().getHost()).thenReturn(badHost);
    when(mockContainer5.getNodeId()).thenReturn(badNodeId);
    ContainerId mockCId5 = mock(ContainerId.class);
    when(mockContainer5.getId()).thenReturn(mockCId5);
    containers.clear();
    containers.add(mockContainer5);
    scheduler.onContainersAllocated(containers);
    drainableAppCallback.drain();
    // no new allocation
    verify(mockApp, times(3)).taskAllocated(any(), any(), (Container) any());
    // verify blacklisted container released
    verify(mockRMClient).releaseAssignedContainer(mockCId5);
    verify(mockRMClient, times(4)).releaseAssignedContainer((ContainerId) any());
    // verify request added back
    verify(mockRMClient, times(6)).addContainerRequest(requestCaptor.capture());
    // offer a container on a different host ("host7"), which has not been blacklisted in this test
    Container mockContainer6 = mock(Container.class, RETURNS_DEEP_STUBS);
    when(mockContainer6.getNodeId().getHost()).thenReturn("host7");
    ContainerId mockCId6 = mock(ContainerId.class);
    when(mockContainer6.getId()).thenReturn(mockCId6);
    containers.clear();
    containers.add(mockContainer6);
    scheduler.onContainersAllocated(containers);
    drainableAppCallback.drain();
    // new allocation
    verify(mockApp, times(4)).taskAllocated(any(), any(), (Container) any());
    verify(mockApp).taskAllocated(mockTask4, mockCookie4, mockContainer6);
    // deallocate allocated task
    assertTrue(scheduler.deallocateTask(mockTask4, true, null, null));
    drainableAppCallback.drain();
    verify(mockApp).containerBeingReleased(mockCId6);
    verify(mockRMClient).releaseAssignedContainer(mockCId6);
    verify(mockRMClient, times(5)).releaseAssignedContainer((ContainerId) any());
    // test unblacklist
    scheduler.unblacklistNode(badNodeId);
    verify(mockRMClient, times(1)).removeNodeFromBlacklist(badNodeId);
    assertEquals(0, scheduler.blacklistedNodes.size());
    // the scheduler is expected to report the progress value supplied by the app context
    float progress = 0.5f;
    when(mockApp.getProgress()).thenReturn(progress);
    Assert.assertEquals(progress, scheduler.getProgress(), 0);
    // check duplicate allocation request
    scheduler.allocateTask(mockTask1, mockCapability, hosts, racks, mockPriority, null, mockCookie1);
    drainableAppCallback.drain();
    verify(mockRMClient, times(7)).addContainerRequest((CookieContainerRequest) any());
    verify(mockRMClient, times(6)).removeContainerRequest((CookieContainerRequest) any());
    scheduler.allocateTask(mockTask1, mockCapability, hosts, racks, mockPriority, null, mockCookie1);
    drainableAppCallback.drain();
    // old request removed and new one added
    verify(mockRMClient, times(7)).removeContainerRequest((CookieContainerRequest) any());
    verify(mockRMClient, times(8)).addContainerRequest((CookieContainerRequest) any());
    assertFalse(scheduler.deallocateTask(mockTask1, true, null, null));
    // node updates must be passed through to the app context unchanged
    List<NodeReport> mockUpdatedNodes = mock(List.class);
    scheduler.onNodesUpdated(mockUpdatedNodes);
    drainableAppCallback.drain();
    verify(mockApp).nodesUpdated(mockUpdatedNodes);
    // an error from the RM client must be reported to the app with the exception text included
    ArgumentCaptor<String> argumentCaptor = ArgumentCaptor.forClass(String.class);
    Exception mockException = new IOException("mockexception");
    scheduler.onError(mockException);
    drainableAppCallback.drain();
    verify(mockApp).reportError(eq(YarnTaskSchedulerServiceError.RESOURCEMANAGER_ERROR), argumentCaptor.capture(), any(DagInfo.class));
    assertTrue(argumentCaptor.getValue().contains("mockexception"));
    scheduler.onShutdownRequest();
    drainableAppCallback.drain();
    verify(mockApp).appShutdownRequested();
    // on shutdown the scheduler must unregister using the final status supplied by the app
    AppFinalStatus finalStatus = new AppFinalStatus(FinalApplicationStatus.SUCCEEDED, SUCCEED_APP_MESSAGE, DEFAULT_APP_URL);
    when(mockApp.getFinalAppStatus()).thenReturn(finalStatus);
    scheduler.shutdown();
    drainableAppCallback.drain();
    verify(mockRMClient).unregisterApplicationMaster(FinalApplicationStatus.SUCCEEDED, SUCCEED_APP_MESSAGE, DEFAULT_APP_URL);
    verify(mockRMClient).stop();
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) TezConfiguration(org.apache.tez.dag.api.TezConfiguration) TestTaskSchedulerHelpers.setupMockTaskSchedulerContext(org.apache.tez.dag.app.rm.TestTaskSchedulerHelpers.setupMockTaskSchedulerContext) TaskSchedulerContext(org.apache.tez.serviceplugins.api.TaskSchedulerContext) DagInfo(org.apache.tez.serviceplugins.api.DagInfo) ArrayList(java.util.ArrayList) TaskSchedulerWithDrainableContext(org.apache.tez.dag.app.rm.TestTaskSchedulerHelpers.TaskSchedulerWithDrainableContext) Matchers.anyString(org.mockito.Matchers.anyString) HeldContainer(org.apache.tez.dag.app.rm.YarnTaskSchedulerService.HeldContainer) Container(org.apache.hadoop.yarn.api.records.Container) ContainerStatus(org.apache.hadoop.yarn.api.records.ContainerStatus) ContainerId(org.apache.hadoop.yarn.api.records.ContainerId) AppFinalStatus(org.apache.tez.serviceplugins.api.TaskSchedulerContext.AppFinalStatus) AMRMClientAsyncForTest(org.apache.tez.dag.app.rm.TestTaskSchedulerHelpers.AMRMClientAsyncForTest) TaskSchedulerContextDrainable(org.apache.tez.dag.app.rm.TestTaskSchedulerHelpers.TaskSchedulerContextDrainable) Priority(org.apache.hadoop.yarn.api.records.Priority) Resource(org.apache.hadoop.yarn.api.records.Resource) IOException(java.io.IOException) IOException(java.io.IOException) CookieContainerRequest(org.apache.tez.dag.app.rm.YarnTaskSchedulerService.CookieContainerRequest) AMRMClientForTest(org.apache.tez.dag.app.rm.TestTaskSchedulerHelpers.AMRMClientForTest) RegisterApplicationMasterResponse(org.apache.hadoop.yarn.api.protocolrecords.RegisterApplicationMasterResponse) NodeId(org.apache.hadoop.yarn.api.records.NodeId) NodeReport(org.apache.hadoop.yarn.api.records.NodeReport) AMRMClientForTest(org.apache.tez.dag.app.rm.TestTaskSchedulerHelpers.AMRMClientForTest) AMRMClientAsyncForTest(org.apache.tez.dag.app.rm.TestTaskSchedulerHelpers.AMRMClientAsyncForTest) Test(org.junit.Test)

Example 68 with ContainerStatus

use of org.apache.hadoop.yarn.api.records.ContainerStatus in project tez by apache.

the class TestTaskSchedulerManager method testContainerDiskFailed.

/**
 * Checks how a container that completed with {@code ContainerExitStatus.DISKS_FAILED} is
 * reported: exactly one {@code C_COMPLETED} event is expected, carrying the container id,
 * the diagnostics prefixed with "Container disk failed. ", the disk-failed flag set, the
 * preempted flag unset, and {@code NODE_DISK_ERROR} as the termination cause.
 */
@Test(timeout = 5000)
public void testContainerDiskFailed() throws IOException {
    schedulerHandler.init(new Configuration(false));
    schedulerHandler.start();

    final String nmDiagnostics = "NM disk failed.";
    final ContainerId containerId = mock(ContainerId.class);
    final TaskAttemptImpl attempt = mock(TaskAttemptImpl.class);

    // the AM-side container registered under the mocked id
    final AMContainer amContainer = mock(AMContainer.class);
    when(mockAMContainerMap.get(containerId)).thenReturn(amContainer);
    when(amContainer.getContainerId()).thenReturn(containerId);

    // completion status describing a disk failure
    final ContainerStatus diskFailedStatus = mock(ContainerStatus.class);
    when(diskFailedStatus.getContainerId()).thenReturn(containerId);
    when(diskFailedStatus.getDiagnostics()).thenReturn(nmDiagnostics);
    when(diskFailedStatus.getExitStatus()).thenReturn(ContainerExitStatus.DISKS_FAILED);

    schedulerHandler.containerCompleted(0, attempt, diskFailedStatus);

    assertEquals(1, mockEventHandler.events.size());
    final Event firstEvent = mockEventHandler.events.get(0);
    assertEquals(AMContainerEventType.C_COMPLETED, firstEvent.getType());

    final AMContainerEventCompleted completed = (AMContainerEventCompleted) firstEvent;
    assertEquals(containerId, completed.getContainerId());
    assertEquals("Container disk failed. NM disk failed.", completed.getDiagnostics());
    Assert.assertFalse(completed.isPreempted());
    assertTrue(completed.isDiskFailed());
    assertEquals(TaskAttemptTerminationCause.NODE_DISK_ERROR, completed.getTerminationCause());

    schedulerHandler.stop();
    schedulerHandler.close();
}
Also used : AMContainerEventCompleted(org.apache.tez.dag.app.rm.container.AMContainerEventCompleted) ContainerStatus(org.apache.hadoop.yarn.api.records.ContainerStatus) Configuration(org.apache.hadoop.conf.Configuration) TezConfiguration(org.apache.tez.dag.api.TezConfiguration) ContainerId(org.apache.hadoop.yarn.api.records.ContainerId) TaskAttemptImpl(org.apache.tez.dag.app.dag.impl.TaskAttemptImpl) Event(org.apache.hadoop.yarn.event.Event) AMContainer(org.apache.tez.dag.app.rm.container.AMContainer) DagInfoImplForTest(org.apache.tez.dag.helpers.DagInfoImplForTest) Test(org.junit.Test)

Example 69 with ContainerStatus

use of org.apache.hadoop.yarn.api.records.ContainerStatus in project tez by apache.

the class YarnTaskSchedulerService method onContainersCompleted.

// AMRMClientAsync interface methods
/**
 * Callback invoked with the statuses of containers that have completed.
 *
 * <p>If stopping has already begun, the statuses are only logged at debug level and
 * nothing else happens. Otherwise each completed container is handled as one of:
 * <ul>
 *   <li>a container this scheduler had released: its entry is removed from
 *       {@code releasedContainers} and the last task is reported to the app;</li>
 *   <li>a currently assigned container: it is un-assigned (without asking the RM to
 *       release it), dropped from {@code heldContainers}, its resource is deducted from
 *       {@code allocatedResources}, and the task is reported to the app;</li>
 *   <li>a held container with no task: it is dropped from {@code heldContainers}, its
 *       resource is deducted, and rescheduling at its priority is considered;</li>
 *   <li>an unknown container: ignored apart from logging.</li>
 * </ul>
 * The bookkeeping is done while holding this object's monitor; the
 * {@code containerCompleted} upcalls to the context are made after it is released.
 *
 * @param statuses completion statuses to process
 */
@Override
public void onContainersCompleted(List<ContainerStatus> statuses) {
    if (isStopStarted.get()) {
        if (LOG.isDebugEnabled()) {
            for (ContainerStatus status : statuses) {
                LOG.debug("Container " + status.getContainerId() + " is completed with ContainerStatus=" + status);
            }
        }
        return;
    }
    Map<Object, ContainerStatus> appContainerStatus = new HashMap<Object, ContainerStatus>(statuses.size());
    synchronized (this) {
        for (ContainerStatus containerStatus : statuses) {
            ContainerId completedId = containerStatus.getContainerId();
            HeldContainer delayedContainer = heldContainers.get(completedId);
            Object task = releasedContainers.remove(completedId);
            if (task != null) {
                if (delayedContainer != null) {
                    LOG.warn("Held container should be null since releasedContainer is not");
                }
                // an allocated container completed. notify app
                if (LOG.isDebugEnabled()) {
                    LOG.debug("Released container completed:" + completedId + " last allocated to task: " + task);
                }
                appContainerStatus.put(task, containerStatus);
                continue;
            }
            // not found in released containers. check currently allocated containers
            // no need to release this container as the RM has already completed it
            task = unAssignContainer(completedId, false);
            if (delayedContainer != null) {
                heldContainers.remove(completedId);
                // Deduct in place. Resources.subtract() (assuming the Hadoop YARN Resources
                // utility) operates on a clone and returns it, so calling it and discarding
                // the result would leave allocatedResources unchanged.
                Resources.subtractFrom(allocatedResources, delayedContainer.getContainer().getResource());
            } else {
                LOG.warn("Held container expected to be not null for a non-AM-released container");
            }
            if (task != null) {
                // completion of a container we have allocated currently
                // an allocated container completed. notify app. This will cause attempt to get killed
                LOG.info("Allocated container completed:" + completedId + " last allocated to task: " + task);
                appContainerStatus.put(task, containerStatus);
                continue;
            }
            // container neither allocated nor released
            if (delayedContainer != null) {
                LOG.info("Delayed container {} completed", containerStatus.getContainerId());
                maybeRescheduleContainerAtPriority(delayedContainer.getContainer().getPriority());
            } else {
                LOG.info("Ignoring unknown container: " + containerStatus.getContainerId());
            }
        }
    }
    // upcall to app must be outside locks
    for (Entry<Object, ContainerStatus> entry : appContainerStatus.entrySet()) {
        getContext().containerCompleted(entry.getKey(), entry.getValue());
    }
}
Also used : ContainerStatus(org.apache.hadoop.yarn.api.records.ContainerStatus) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) ContainerId(org.apache.hadoop.yarn.api.records.ContainerId)

Example 70 with ContainerStatus

use of org.apache.hadoop.yarn.api.records.ContainerStatus in project apex-core by apache.

the class StreamingAppMasterService method execute.

/**
 * Main run function for the application master
 *
 * @throws YarnException
 */
@SuppressWarnings("SleepWhileInLoop")
private void execute() throws YarnException, IOException {
    LOG.info("Starting ApplicationMaster");
    final Configuration conf = getConfig();
    if (UserGroupInformation.isSecurityEnabled()) {
        tokenRenewer = new TokenRenewer(dag, true, conf, appAttemptID.getApplicationId().toString());
    }
    // Register self with ResourceManager
    RegisterApplicationMasterResponse response = amRmClient.registerApplicationMaster(appMasterHostname, 0, appMasterTrackingUrl);
    // Dump out information about cluster capability as seen by the resource manager
    int maxMem = response.getMaximumResourceCapability().getMemory();
    int maxVcores = response.getMaximumResourceCapability().getVirtualCores();
    int minMem = conf.getInt("yarn.scheduler.minimum-allocation-mb", 0);
    int minVcores = conf.getInt("yarn.scheduler.minimum-allocation-vcores", 0);
    LOG.info("Max mem {}m, Min mem {}m, Max vcores {} and Min vcores {} capabililty of resources in this cluster ", maxMem, minMem, maxVcores, minVcores);
    long blacklistRemovalTime = dag.getValue(DAGContext.BLACKLISTED_NODE_REMOVAL_TIME_MILLIS);
    int maxConsecutiveContainerFailures = dag.getValue(DAGContext.MAX_CONSECUTIVE_CONTAINER_FAILURES_FOR_BLACKLIST);
    LOG.info("Blacklist removal time in millis = {}, max consecutive node failure count = {}", blacklistRemovalTime, maxConsecutiveContainerFailures);
    // for locality relaxation fall back
    Map<StreamingContainerAgent.ContainerStartRequest, MutablePair<Integer, ContainerRequest>> requestedResources = Maps.newHashMap();
    // Setup heartbeat emitter
    // TODO poll RM every now and then with an empty request to let RM know that we are alive
    // The heartbeat interval after which an AM is timed out by the RM is defined by a config setting:
    // RM_AM_EXPIRY_INTERVAL_MS with default defined by DEFAULT_RM_AM_EXPIRY_INTERVAL_MS
    // The allocate calls to the RM count as heartbeat so, for now, this additional heartbeat emitter
    // is not required.
    int loopCounter = -1;
    long nodeReportUpdateTime = 0;
    // keep track of already requested containers to not request them again while waiting for allocation
    int numRequestedContainers = 0;
    int numReleasedContainers = 0;
    int nextRequestPriority = 0;
    // Use override for resource requestor in case of cloudera distribution, to handle host specific requests
    ResourceRequestHandler resourceRequestor = System.getenv().containsKey("CDH_HADOOP_BIN") ? new BlacklistBasedResourceRequestHandler() : new ResourceRequestHandler();
    List<ContainerStartRequest> pendingContainerStartRequests = new LinkedList<>();
    try (YarnClient clientRMService = StramClientUtils.createYarnClient(conf)) {
        try {
            // YARN-435
            // we need getClusterNodes to populate the initial node list,
            // subsequent updates come through the heartbeat response
            ApplicationReport ar = StramClientUtils.getStartedAppInstanceByName(clientRMService, dag.getAttributes().get(DAG.APPLICATION_NAME), UserGroupInformation.getLoginUser().getUserName(), dag.getAttributes().get(DAG.APPLICATION_ID));
            if (ar != null) {
                appDone = true;
                dnmgr.shutdownDiagnosticsMessage = String.format("Application master failed due to application %s with duplicate application name \"%s\" by the same user \"%s\" is already started.", ar.getApplicationId().toString(), ar.getName(), ar.getUser());
                LOG.info("Forced shutdown due to {}", dnmgr.shutdownDiagnosticsMessage);
                finishApplication(FinalApplicationStatus.FAILED);
                return;
            }
            resourceRequestor.updateNodeReports(clientRMService.getNodeReports());
            nodeReportUpdateTime = System.currentTimeMillis() + UPDATE_NODE_REPORTS_INTERVAL;
        } catch (Exception e) {
            throw new RuntimeException("Failed to retrieve cluster nodes report.", e);
        }
        List<Container> containers = response.getContainersFromPreviousAttempts();
        // Running containers might take a while to register with the new app master and send the heartbeat signal.
        int waitForRecovery = containers.size() > 0 ? dag.getValue(LogicalPlan.HEARTBEAT_TIMEOUT_MILLIS) / 1000 : 0;
        List<ContainerId> releasedContainers = previouslyAllocatedContainers(containers);
        FinalApplicationStatus finalStatus = FinalApplicationStatus.SUCCEEDED;
        final InetSocketAddress rmAddress = conf.getSocketAddr(YarnConfiguration.RM_ADDRESS, YarnConfiguration.DEFAULT_RM_ADDRESS, YarnConfiguration.DEFAULT_RM_PORT);
        while (!appDone) {
            loopCounter++;
            final long currentTimeMillis = System.currentTimeMillis();
            if (tokenRenewer != null) {
                tokenRenewer.checkAndRenew();
            }
            if (currentTimeMillis > nodeReportUpdateTime) {
                resourceRequestor.updateNodeReports(clientRMService.getNodeReports());
                nodeReportUpdateTime = currentTimeMillis + UPDATE_NODE_REPORTS_INTERVAL;
            }
            Runnable r;
            while ((r = this.pendingTasks.poll()) != null) {
                r.run();
            }
            // need not have any available containers
            try {
                sleep(1000);
            } catch (InterruptedException e) {
                LOG.info("Sleep interrupted", e);
            }
            // Setup request to be sent to RM to allocate containers
            List<ContainerRequest> containerRequests = new ArrayList<>();
            List<ContainerRequest> removedContainerRequests = new ArrayList<>();
            // request containers for pending deploy requests
            if (!dnmgr.containerStartRequests.isEmpty()) {
                StreamingContainerAgent.ContainerStartRequest csr;
                while ((csr = dnmgr.containerStartRequests.poll()) != null) {
                    if (csr.container.getRequiredMemoryMB() > maxMem) {
                        LOG.warn("Container memory {}m above max threshold of cluster. Using max value {}m.", csr.container.getRequiredMemoryMB(), maxMem);
                        csr.container.setRequiredMemoryMB(maxMem);
                    }
                    if (csr.container.getRequiredMemoryMB() < minMem) {
                        csr.container.setRequiredMemoryMB(minMem);
                    }
                    if (csr.container.getRequiredVCores() > maxVcores) {
                        LOG.warn("Container vcores {} above max threshold of cluster. Using max value {}.", csr.container.getRequiredVCores(), maxVcores);
                        csr.container.setRequiredVCores(maxVcores);
                    }
                    if (csr.container.getRequiredVCores() < minVcores) {
                        csr.container.setRequiredVCores(minVcores);
                    }
                    csr.container.setResourceRequestPriority(nextRequestPriority++);
                    ContainerRequest cr = resourceRequestor.createContainerRequest(csr, true);
                    if (cr == null) {
                        pendingContainerStartRequests.add(csr);
                    } else {
                        resourceRequestor.addContainerRequest(requestedResources, loopCounter, containerRequests, csr, cr);
                    }
                }
            }
            // If all other requests are allocated, retry pending requests which need host availability
            if (containerRequests.isEmpty() && !pendingContainerStartRequests.isEmpty()) {
                List<ContainerStartRequest> removalList = new LinkedList<>();
                for (ContainerStartRequest csr : pendingContainerStartRequests) {
                    ContainerRequest cr = resourceRequestor.createContainerRequest(csr, true);
                    if (cr != null) {
                        resourceRequestor.addContainerRequest(requestedResources, loopCounter, containerRequests, csr, cr);
                        removalList.add(csr);
                    }
                }
                pendingContainerStartRequests.removeAll(removalList);
            }
            resourceRequestor.reissueContainerRequests(amRmClient, requestedResources, loopCounter, resourceRequestor, containerRequests, removedContainerRequests);
            /* Remove nodes from blacklist after timeout */
            List<String> blacklistRemovals = new ArrayList<>();
            for (String hostname : failedBlackListedNodes) {
                Long timeDiff = currentTimeMillis - failedContainerNodesMap.get(hostname).blackListAdditionTime;
                if (timeDiff >= blacklistRemovalTime) {
                    blacklistRemovals.add(hostname);
                    failedContainerNodesMap.remove(hostname);
                }
            }
            if (!blacklistRemovals.isEmpty()) {
                amRmClient.updateBlacklist(null, blacklistRemovals);
                LOG.info("Removing nodes {} from blacklist: time elapsed since last blacklisting due to failure is greater than specified timeout", blacklistRemovals.toString());
                failedBlackListedNodes.removeAll(blacklistRemovals);
            }
            numRequestedContainers += containerRequests.size() - removedContainerRequests.size();
            AllocateResponse amResp = sendContainerAskToRM(containerRequests, removedContainerRequests, releasedContainers);
            if (amResp.getAMCommand() != null) {
                LOG.info(" statement executed:{}", amResp.getAMCommand());
                switch(amResp.getAMCommand()) {
                    case AM_RESYNC:
                    case AM_SHUTDOWN:
                        throw new YarnRuntimeException("Received the " + amResp.getAMCommand() + " command from RM");
                    default:
                        throw new YarnRuntimeException("Received the " + amResp.getAMCommand() + " command from RM");
                }
            }
            releasedContainers.clear();
            // Retrieve list of allocated containers from the response
            List<Container> newAllocatedContainers = amResp.getAllocatedContainers();
            // LOG.info("Got response from RM for container ask, allocatedCnt=" + newAllocatedContainers.size());
            numRequestedContainers -= newAllocatedContainers.size();
            long timestamp = System.currentTimeMillis();
            for (Container allocatedContainer : newAllocatedContainers) {
                LOG.info("Got new container." + ", containerId=" + allocatedContainer.getId() + ", containerNode=" + allocatedContainer.getNodeId() + ", containerNodeURI=" + allocatedContainer.getNodeHttpAddress() + ", containerResourceMemory" + allocatedContainer.getResource().getMemory() + ", priority" + allocatedContainer.getPriority());
                // + ", containerToken" + allocatedContainer.getContainerToken().getIdentifier().toString());
                boolean alreadyAllocated = true;
                StreamingContainerAgent.ContainerStartRequest csr = null;
                for (Map.Entry<StreamingContainerAgent.ContainerStartRequest, MutablePair<Integer, ContainerRequest>> entry : requestedResources.entrySet()) {
                    if (entry.getKey().container.getResourceRequestPriority() == allocatedContainer.getPriority().getPriority()) {
                        alreadyAllocated = false;
                        csr = entry.getKey();
                        break;
                    }
                }
                if (alreadyAllocated) {
                    LOG.info("Releasing {} as resource with priority {} was already assigned", allocatedContainer.getId(), allocatedContainer.getPriority());
                    releasedContainers.add(allocatedContainer.getId());
                    numReleasedContainers++;
                    // undo the decrement above for this allocated container
                    numRequestedContainers++;
                    continue;
                }
                if (csr != null) {
                    requestedResources.remove(csr);
                }
                // allocate resource to container
                ContainerResource resource = new ContainerResource(allocatedContainer.getPriority().getPriority(), allocatedContainer.getId().toString(), allocatedContainer.getNodeId().toString(), allocatedContainer.getResource().getMemory(), allocatedContainer.getResource().getVirtualCores(), allocatedContainer.getNodeHttpAddress());
                StreamingContainerAgent sca = dnmgr.assignContainer(resource, null);
                if (sca == null) {
                    // allocated container no longer needed, add release request
                    LOG.warn("Container {} allocated but nothing to deploy, going to release this container.", allocatedContainer.getId());
                    releasedContainers.add(allocatedContainer.getId());
                } else {
                    AllocatedContainer allocatedContainerHolder = new AllocatedContainer(allocatedContainer);
                    this.allocatedContainers.put(allocatedContainer.getId().toString(), allocatedContainerHolder);
                    ByteBuffer tokens = null;
                    if (UserGroupInformation.isSecurityEnabled()) {
                        UserGroupInformation ugi = UserGroupInformation.getLoginUser();
                        Token<StramDelegationTokenIdentifier> delegationToken = allocateDelegationToken(ugi.getUserName(), heartbeatListener.getAddress());
                        allocatedContainerHolder.delegationToken = delegationToken;
                        // ByteBuffer tokens = LaunchContainerRunnable.getTokens(delegationTokenManager, heartbeatListener.getAddress());
                        tokens = LaunchContainerRunnable.getTokens(ugi, delegationToken);
                    }
                    LaunchContainerRunnable launchContainer = new LaunchContainerRunnable(allocatedContainer, nmClient, sca, tokens);
                    // Thread launchThread = new Thread(runnableLaunchContainer);
                    // launchThreads.add(launchThread);
                    // launchThread.start();
                    // communication with NMs is now async
                    launchContainer.run();
                    // record container start event
                    StramEvent ev = new StramEvent.StartContainerEvent(allocatedContainer.getId().toString(), allocatedContainer.getNodeId().toString(), groupingManager.getEventGroupIdForAffectedContainer(allocatedContainer.getId().toString()));
                    ev.setTimestamp(timestamp);
                    dnmgr.recordEventAsync(ev);
                }
            }
            // track node updates for future locality constraint allocations
            // TODO: it seems 2.0.4-alpha doesn't give us any updates
            resourceRequestor.updateNodeReports(amResp.getUpdatedNodes());
            // Check the completed containers
            List<ContainerStatus> completedContainers = amResp.getCompletedContainersStatuses();
            // LOG.debug("Got response from RM for container ask, completedCnt=" + completedContainers.size());
            List<String> blacklistAdditions = new ArrayList<>();
            for (ContainerStatus containerStatus : completedContainers) {
                LOG.info("Completed containerId=" + containerStatus.getContainerId() + ", state=" + containerStatus.getState() + ", exitStatus=" + containerStatus.getExitStatus() + ", diagnostics=" + containerStatus.getDiagnostics());
                // non complete containers should not be here
                assert (containerStatus.getState() == ContainerState.COMPLETE);
                AllocatedContainer allocatedContainer = allocatedContainers.remove(containerStatus.getContainerId().toString());
                if (allocatedContainer != null && allocatedContainer.delegationToken != null) {
                    UserGroupInformation ugi = UserGroupInformation.getLoginUser();
                    delegationTokenManager.cancelToken(allocatedContainer.delegationToken, ugi.getUserName());
                }
                EventGroupId groupId = null;
                int exitStatus = containerStatus.getExitStatus();
                if (0 != exitStatus) {
                    if (allocatedContainer != null) {
                        numFailedContainers.incrementAndGet();
                        if (exitStatus != 1 && maxConsecutiveContainerFailures != Integer.MAX_VALUE) {
                            // If container failure due to framework
                            String hostname = allocatedContainer.container.getNodeId().getHost();
                            if (!failedBlackListedNodes.contains(hostname)) {
                                // Blacklist the node if not already blacklisted
                                if (failedContainerNodesMap.containsKey(hostname)) {
                                    NodeFailureStats stats = failedContainerNodesMap.get(hostname);
                                    long timeStamp = System.currentTimeMillis();
                                    if (timeStamp - stats.lastFailureTimeStamp >= blacklistRemovalTime) {
                                        // Reset failure count if last failure was before Blacklist removal time
                                        stats.failureCount = 1;
                                        stats.lastFailureTimeStamp = timeStamp;
                                    } else {
                                        stats.lastFailureTimeStamp = timeStamp;
                                        stats.failureCount++;
                                        if (stats.failureCount >= maxConsecutiveContainerFailures) {
                                            LOG.info("Node {} failed {} times consecutively within {} minutes, marking the node blacklisted", hostname, stats.failureCount, blacklistRemovalTime / (60 * 1000));
                                            blacklistAdditions.add(hostname);
                                            failedBlackListedNodes.add(hostname);
                                        }
                                    }
                                } else {
                                    failedContainerNodesMap.put(hostname, new NodeFailureStats(System.currentTimeMillis(), 1));
                                }
                            }
                        }
                    }
                    // if (exitStatus == 1) {
                    // // non-recoverable StreamingContainer failure
                    // appDone = true;
                    // finalStatus = FinalApplicationStatus.FAILED;
                    // dnmgr.shutdownDiagnosticsMessage = "Unrecoverable failure " + containerStatus.getContainerId();
                    // LOG.info("Exiting due to: {}", dnmgr.shutdownDiagnosticsMessage);
                    // }
                    // else {
                    // Recoverable failure or process killed (externally or via stop request by AM)
                    // also occurs when a container was released by the application but never assigned/launched
                    LOG.debug("Container {} failed or killed.", containerStatus.getContainerId());
                    String containerIdStr = containerStatus.getContainerId().toString();
                    dnmgr.scheduleContainerRestart(containerIdStr);
                    groupId = groupingManager.getEventGroupIdForAffectedContainer(containerIdStr);
                // }
                } else {
                    // container completed successfully
                    numCompletedContainers.incrementAndGet();
                    LOG.info("Container completed successfully." + ", containerId=" + containerStatus.getContainerId());
                    // Reset counter for node failure, if exists
                    String hostname = allocatedContainer.container.getNodeId().getHost();
                    NodeFailureStats stats = failedContainerNodesMap.get(hostname);
                    if (stats != null) {
                        stats.failureCount = 0;
                    }
                }
                String containerIdStr = containerStatus.getContainerId().toString();
                dnmgr.removeContainerAgent(containerIdStr);
                // record container stop event
                StramEvent ev = new StramEvent.StopContainerEvent(containerIdStr, containerStatus.getExitStatus(), groupId);
                ev.setReason(containerStatus.getDiagnostics());
                dnmgr.recordEventAsync(ev);
            }
            if (!blacklistAdditions.isEmpty()) {
                amRmClient.updateBlacklist(blacklistAdditions, null);
                long timeStamp = System.currentTimeMillis();
                for (String hostname : blacklistAdditions) {
                    NodeFailureStats stats = failedContainerNodesMap.get(hostname);
                    stats.blackListAdditionTime = timeStamp;
                }
            }
            if (dnmgr.forcedShutdown) {
                LOG.info("Forced shutdown due to {}", dnmgr.shutdownDiagnosticsMessage);
                finalStatus = FinalApplicationStatus.FAILED;
                appDone = true;
            } else if (allocatedContainers.isEmpty() && numRequestedContainers == 0 && dnmgr.containerStartRequests.isEmpty()) {
                LOG.debug("Exiting as no more containers are allocated or requested");
                finalStatus = FinalApplicationStatus.SUCCEEDED;
                appDone = true;
            }
            LOG.debug("Current application state: loop={}, appDone={}, requested={}, released={}, completed={}, failed={}, currentAllocated={}, dnmgr.containerStartRequests={}", loopCounter, appDone, numRequestedContainers, numReleasedContainers, numCompletedContainers, numFailedContainers, allocatedContainers.size(), dnmgr.containerStartRequests);
            // monitor child containers
            dnmgr.monitorHeartbeat(waitForRecovery > 0);
            waitForRecovery = Math.max(waitForRecovery - 1, 0);
        }
        finishApplication(finalStatus);
    }
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) YarnConfiguration(org.apache.hadoop.yarn.conf.YarnConfiguration) FinalApplicationStatus(org.apache.hadoop.yarn.api.records.FinalApplicationStatus) InetSocketAddress(java.net.InetSocketAddress) ArrayList(java.util.ArrayList) AllocateResponse(org.apache.hadoop.yarn.api.protocolrecords.AllocateResponse) PTContainer(com.datatorrent.stram.plan.physical.PTContainer) Container(org.apache.hadoop.yarn.api.records.Container) StreamingContainer(com.datatorrent.stram.engine.StreamingContainer) ContainerStatus(org.apache.hadoop.yarn.api.records.ContainerStatus) ContainerStartRequest(com.datatorrent.stram.StreamingContainerAgent.ContainerStartRequest) ContainerId(org.apache.hadoop.yarn.api.records.ContainerId) ContainerRequest(org.apache.hadoop.yarn.client.api.AMRMClient.ContainerRequest) UserGroupInformation(org.apache.hadoop.security.UserGroupInformation) ContainerStartRequest(com.datatorrent.stram.StreamingContainerAgent.ContainerStartRequest) LinkedList(java.util.LinkedList) ApplicationReport(org.apache.hadoop.yarn.api.records.ApplicationReport) YarnRuntimeException(org.apache.hadoop.yarn.exceptions.YarnRuntimeException) ContainerResource(com.datatorrent.stram.StreamingContainerManager.ContainerResource) RegisterApplicationMasterResponse(org.apache.hadoop.yarn.api.protocolrecords.RegisterApplicationMasterResponse) Map(java.util.Map) HashMap(java.util.HashMap) ConcurrentMap(java.util.concurrent.ConcurrentMap) StramDelegationTokenIdentifier(com.datatorrent.stram.security.StramDelegationTokenIdentifier) StramEvent(com.datatorrent.stram.api.StramEvent) EventGroupId(org.apache.apex.engine.events.grouping.GroupingRequest.EventGroupId) MutablePair(org.apache.commons.lang3.tuple.MutablePair) YarnRuntimeException(org.apache.hadoop.yarn.exceptions.YarnRuntimeException) TokenRenewer(org.apache.apex.engine.security.TokenRenewer) ByteBuffer(java.nio.ByteBuffer) YarnClient(org.apache.hadoop.yarn.client.api.YarnClient) 
YarnException(org.apache.hadoop.yarn.exceptions.YarnException) IOException(java.io.IOException) YarnRuntimeException(org.apache.hadoop.yarn.exceptions.YarnRuntimeException)

Aggregations

ContainerStatus (org.apache.hadoop.yarn.api.records.ContainerStatus)144 ContainerId (org.apache.hadoop.yarn.api.records.ContainerId)76 Test (org.junit.Test)75 ArrayList (java.util.ArrayList)58 Container (org.apache.hadoop.yarn.api.records.Container)40 NMContainerStatus (org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus)28 NodeId (org.apache.hadoop.yarn.api.records.NodeId)26 HashMap (java.util.HashMap)25 ApplicationId (org.apache.hadoop.yarn.api.records.ApplicationId)25 ApplicationAttemptId (org.apache.hadoop.yarn.api.records.ApplicationAttemptId)23 Configuration (org.apache.hadoop.conf.Configuration)21 ContainerLaunchContext (org.apache.hadoop.yarn.api.records.ContainerLaunchContext)21 Resource (org.apache.hadoop.yarn.api.records.Resource)21 GetContainerStatusesRequest (org.apache.hadoop.yarn.api.protocolrecords.GetContainerStatusesRequest)20 RMApp (org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp)20 StartContainerRequest (org.apache.hadoop.yarn.api.protocolrecords.StartContainerRequest)19 StartContainersRequest (org.apache.hadoop.yarn.api.protocolrecords.StartContainersRequest)18 YarnConfiguration (org.apache.hadoop.yarn.conf.YarnConfiguration)18 AllocateResponse (org.apache.hadoop.yarn.api.protocolrecords.AllocateResponse)17 RMContainer (org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer)14