Search in sources :

Example 16 with NMContainerStatus

use of org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus in project hadoop by apache.

the class TestNodeManagerResync method testNMSentContainerStatusOnResync.

// This is to test when NM gets the resync response from last heart beat, it
// should be able to send the already-sent-via-last-heart-beat container
// statuses again when it re-register with RM.
@Test
public void testNMSentContainerStatusOnResync() throws Exception {
    final ContainerStatus testCompleteContainer = TestNodeStatusUpdater.createContainerStatus(2, ContainerState.COMPLETE);
    final Container container = TestNodeStatusUpdater.getMockContainer(testCompleteContainer);
    NMContainerStatus report = createNMContainerStatus(2, ContainerState.COMPLETE);
    when(container.getNMContainerStatus()).thenReturn(report);
    NodeManager nm = new NodeManager() {

        int registerCount = 0;

        @Override
        protected NodeStatusUpdater createNodeStatusUpdater(Context context, Dispatcher dispatcher, NodeHealthCheckerService healthChecker) {
            return new TestNodeStatusUpdaterResync(context, dispatcher, healthChecker, metrics) {

                @Override
                protected ResourceTracker createResourceTracker() {
                    return new MockResourceTracker() {

                        @Override
                        public RegisterNodeManagerResponse registerNodeManager(RegisterNodeManagerRequest request) throws YarnException, IOException {
                            if (registerCount == 0) {
                                // first register, no containers info.
                                try {
                                    Assert.assertEquals(0, request.getNMContainerStatuses().size());
                                } catch (AssertionError error) {
                                    error.printStackTrace();
                                    assertionFailedInThread.set(true);
                                }
                                // put the completed container into the context
                                getNMContext().getContainers().put(testCompleteContainer.getContainerId(), container);
                                getNMContext().getApplications().put(testCompleteContainer.getContainerId().getApplicationAttemptId().getApplicationId(), mock(Application.class));
                            } else {
                                // second register contains the completed container info.
                                List<NMContainerStatus> statuses = request.getNMContainerStatuses();
                                try {
                                    Assert.assertEquals(1, statuses.size());
                                    Assert.assertEquals(testCompleteContainer.getContainerId(), statuses.get(0).getContainerId());
                                } catch (AssertionError error) {
                                    error.printStackTrace();
                                    assertionFailedInThread.set(true);
                                }
                            }
                            registerCount++;
                            return super.registerNodeManager(request);
                        }

                        @Override
                        public NodeHeartbeatResponse nodeHeartbeat(NodeHeartbeatRequest request) {
                            // first heartBeat contains the completed container info
                            List<ContainerStatus> statuses = request.getNodeStatus().getContainersStatuses();
                            try {
                                Assert.assertEquals(1, statuses.size());
                                Assert.assertEquals(testCompleteContainer.getContainerId(), statuses.get(0).getContainerId());
                            } catch (AssertionError error) {
                                error.printStackTrace();
                                assertionFailedInThread.set(true);
                            }
                            // notify RESYNC on first heartbeat.
                            return YarnServerBuilderUtils.newNodeHeartbeatResponse(1, NodeAction.RESYNC, null, null, null, null, 1000L);
                        }
                    };
                }
            };
        }
    };
    YarnConfiguration conf = createNMConfig();
    nm.init(conf);
    nm.start();
    try {
        syncBarrier.await();
    } catch (BrokenBarrierException e) {
    }
    Assert.assertFalse(assertionFailedInThread.get());
    nm.stop();
}
Also used : FileContext(org.apache.hadoop.fs.FileContext) ContainerLaunchContext(org.apache.hadoop.yarn.api.records.ContainerLaunchContext) BrokenBarrierException(java.util.concurrent.BrokenBarrierException) Dispatcher(org.apache.hadoop.yarn.event.Dispatcher) RegisterNodeManagerRequest(org.apache.hadoop.yarn.server.api.protocolrecords.RegisterNodeManagerRequest) NMContainerStatus(org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus) ContainerStatus(org.apache.hadoop.yarn.api.records.ContainerStatus) Container(org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container) YarnConfiguration(org.apache.hadoop.yarn.conf.YarnConfiguration) NMContainerStatus(org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus) NodeHeartbeatRequest(org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatRequest) Application(org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application) BaseContainerManagerTest(org.apache.hadoop.yarn.server.nodemanager.containermanager.BaseContainerManagerTest) Test(org.junit.Test)

Example 17 with NMContainerStatus

use of org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus in project hadoop by apache.

the class TestNodeManagerResync method createNMContainerStatus.

public static NMContainerStatus createNMContainerStatus(int id, ContainerState containerState) {
    ApplicationId applicationId = ApplicationId.newInstance(0, 1);
    ApplicationAttemptId applicationAttemptId = ApplicationAttemptId.newInstance(applicationId, 1);
    ContainerId containerId = ContainerId.newContainerId(applicationAttemptId, id);
    NMContainerStatus containerReport = NMContainerStatus.newInstance(containerId, 0, containerState, Resource.newInstance(1024, 1), "recover container", 0, Priority.newInstance(10), 0);
    return containerReport;
}
Also used : ContainerId(org.apache.hadoop.yarn.api.records.ContainerId) NMContainerStatus(org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus) ApplicationAttemptId(org.apache.hadoop.yarn.api.records.ApplicationAttemptId) ApplicationId(org.apache.hadoop.yarn.api.records.ApplicationId)

Example 18 with NMContainerStatus

use of org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus in project hadoop by apache.

the class AbstractYarnScheduler method recoverContainersOnNode.

public void recoverContainersOnNode(List<NMContainerStatus> containerReports, RMNode nm) {
    try {
        writeLock.lock();
        if (!rmContext.isWorkPreservingRecoveryEnabled() || containerReports == null || (containerReports != null && containerReports.isEmpty())) {
            return;
        }
        for (NMContainerStatus container : containerReports) {
            ApplicationId appId = container.getContainerId().getApplicationAttemptId().getApplicationId();
            RMApp rmApp = rmContext.getRMApps().get(appId);
            if (rmApp == null) {
                LOG.error("Skip recovering container " + container + " for unknown application.");
                killOrphanContainerOnNode(nm, container);
                continue;
            }
            SchedulerApplication<T> schedulerApp = applications.get(appId);
            if (schedulerApp == null) {
                LOG.info("Skip recovering container  " + container + " for unknown SchedulerApplication. " + "Application current state is " + rmApp.getState());
                killOrphanContainerOnNode(nm, container);
                continue;
            }
            LOG.info("Recovering container " + container);
            SchedulerApplicationAttempt schedulerAttempt = schedulerApp.getCurrentAppAttempt();
            if (!rmApp.getApplicationSubmissionContext().getKeepContainersAcrossApplicationAttempts()) {
                // Do not recover containers for stopped attempt or previous attempt.
                if (schedulerAttempt.isStopped() || !schedulerAttempt.getApplicationAttemptId().equals(container.getContainerId().getApplicationAttemptId())) {
                    LOG.info("Skip recovering container " + container + " for already stopped attempt.");
                    killOrphanContainerOnNode(nm, container);
                    continue;
                }
            }
            // create container
            RMContainer rmContainer = recoverAndCreateContainer(container, nm);
            // recover RMContainer
            rmContainer.handle(new RMContainerRecoverEvent(container.getContainerId(), container));
            // recover scheduler node
            SchedulerNode schedulerNode = nodeTracker.getNode(nm.getNodeID());
            schedulerNode.recoverContainer(rmContainer);
            // recover queue: update headroom etc.
            Queue queue = schedulerAttempt.getQueue();
            queue.recoverContainer(getClusterResource(), schedulerAttempt, rmContainer);
            // recover scheduler attempt
            schedulerAttempt.recoverContainer(schedulerNode, rmContainer);
            // set master container for the current running AMContainer for this
            // attempt.
            RMAppAttempt appAttempt = rmApp.getCurrentAppAttempt();
            if (appAttempt != null) {
                Container masterContainer = appAttempt.getMasterContainer();
                // container ID stored in AppAttempt.
                if (masterContainer != null && masterContainer.getId().equals(rmContainer.getContainerId())) {
                    ((RMContainerImpl) rmContainer).setAMContainer(true);
                }
            }
            if (schedulerAttempt.getPendingRelease().remove(container.getContainerId())) {
                // release the container
                rmContainer.handle(new RMContainerFinishedEvent(container.getContainerId(), SchedulerUtils.createAbnormalContainerStatus(container.getContainerId(), SchedulerUtils.RELEASED_CONTAINER), RMContainerEventType.RELEASED));
                LOG.info(container.getContainerId() + " is released by application.");
            }
        }
    } finally {
        writeLock.unlock();
    }
}
Also used : RMApp(org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp) RMAppAttempt(org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt) RMContainer(org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer) RMContainerRecoverEvent(org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerRecoverEvent) RMContainer(org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer) Container(org.apache.hadoop.yarn.api.records.Container) RMContainerImpl(org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerImpl) NMContainerStatus(org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus) RMContainerFinishedEvent(org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerFinishedEvent) ApplicationId(org.apache.hadoop.yarn.api.records.ApplicationId)

Example 19 with NMContainerStatus

use of org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus in project hadoop by apache.

the class TestRMRestart method testQueueMetricsOnRMRestart.

@SuppressWarnings("resource")
@Test(timeout = 60000)
public void testQueueMetricsOnRMRestart() throws Exception {
    conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, YarnConfiguration.DEFAULT_RM_AM_MAX_ATTEMPTS);
    MemoryRMStateStore memStore = new MemoryRMStateStore();
    memStore.init(conf);
    // PHASE 1: create state in an RM
    // start RM
    MockRM rm1 = createMockRM(conf, memStore);
    rm1.start();
    MockNM nm1 = new MockNM("127.0.0.1:1234", 15120, rm1.getResourceTrackerService());
    nm1.registerNode();
    QueueMetrics qm1 = rm1.getResourceScheduler().getRootQueueMetrics();
    resetQueueMetrics(qm1);
    assertQueueMetrics(qm1, 0, 0, 0, 0);
    // create app that gets launched and does allocate before RM restart
    RMApp app1 = rm1.submitApp(200);
    // Need to wait first for AppAttempt to be started (RMAppState.ACCEPTED)
    // and then for it to reach RMAppAttemptState.SCHEDULED
    // inorder to ensure appsPending metric is incremented
    rm1.waitForState(app1.getApplicationId(), RMAppState.ACCEPTED);
    RMAppAttempt attempt1 = app1.getCurrentAppAttempt();
    ApplicationAttemptId attemptId1 = attempt1.getAppAttemptId();
    rm1.waitForState(attemptId1, RMAppAttemptState.SCHEDULED);
    assertQueueMetrics(qm1, 1, 1, 0, 0);
    nm1.nodeHeartbeat(true);
    rm1.waitForState(attemptId1, RMAppAttemptState.ALLOCATED);
    MockAM am1 = rm1.sendAMLaunched(attempt1.getAppAttemptId());
    am1.registerAppAttempt();
    am1.allocate("127.0.0.1", 1000, 1, new ArrayList<ContainerId>());
    nm1.nodeHeartbeat(true);
    List<Container> conts = am1.allocate(new ArrayList<ResourceRequest>(), new ArrayList<ContainerId>()).getAllocatedContainers();
    while (conts.size() == 0) {
        nm1.nodeHeartbeat(true);
        conts.addAll(am1.allocate(new ArrayList<ResourceRequest>(), new ArrayList<ContainerId>()).getAllocatedContainers());
        Thread.sleep(500);
    }
    assertQueueMetrics(qm1, 1, 0, 1, 0);
    // PHASE 2: create new RM and start from old state
    // create new RM to represent restart and recover state
    MockRM rm2 = createMockRM(conf, memStore);
    QueueMetrics qm2 = rm2.getResourceScheduler().getRootQueueMetrics();
    resetQueueMetrics(qm2);
    assertQueueMetrics(qm2, 0, 0, 0, 0);
    rm2.start();
    nm1.setResourceTrackerService(rm2.getResourceTrackerService());
    // recover app
    RMApp loadedApp1 = rm2.getRMContext().getRMApps().get(app1.getApplicationId());
    nm1.nodeHeartbeat(true);
    nm1 = new MockNM("127.0.0.1:1234", 15120, rm2.getResourceTrackerService());
    NMContainerStatus status = TestRMRestart.createNMContainerStatus(loadedApp1.getCurrentAppAttempt().getAppAttemptId(), 1, ContainerState.COMPLETE);
    nm1.registerNode(Arrays.asList(status), null);
    while (loadedApp1.getAppAttempts().size() != 2) {
        Thread.sleep(200);
    }
    attempt1 = loadedApp1.getCurrentAppAttempt();
    attemptId1 = attempt1.getAppAttemptId();
    rm2.waitForState(attemptId1, RMAppAttemptState.SCHEDULED);
    assertQueueMetrics(qm2, 1, 1, 0, 0);
    nm1.nodeHeartbeat(true);
    rm2.waitForState(attemptId1, RMAppAttemptState.ALLOCATED);
    assertQueueMetrics(qm2, 1, 0, 1, 0);
    am1 = rm2.sendAMLaunched(attempt1.getAppAttemptId());
    am1.registerAppAttempt();
    am1.allocate("127.0.0.1", 1000, 3, new ArrayList<ContainerId>());
    nm1.nodeHeartbeat(true);
    conts = am1.allocate(new ArrayList<ResourceRequest>(), new ArrayList<ContainerId>()).getAllocatedContainers();
    while (conts.size() == 0) {
        nm1.nodeHeartbeat(true);
        conts.addAll(am1.allocate(new ArrayList<ResourceRequest>(), new ArrayList<ContainerId>()).getAllocatedContainers());
        Thread.sleep(500);
    }
    // finish the AMs
    finishApplicationMaster(loadedApp1, rm2, nm1, am1);
    // now AppAttempt and App becomes FINISHED,
    // we should also grant APP_ATTEMPT_REMOVE/APP_REMOVE event
    // had processed by scheduler
    rm2.waitForAppRemovedFromScheduler(loadedApp1.getApplicationId());
    assertQueueMetrics(qm2, 1, 0, 0, 1);
}
Also used : RMApp(org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp) RMAppAttempt(org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt) ArrayList(java.util.ArrayList) ApplicationAttemptId(org.apache.hadoop.yarn.api.records.ApplicationAttemptId) QueueMetrics(org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics) Container(org.apache.hadoop.yarn.api.records.Container) MemoryRMStateStore(org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore) ContainerId(org.apache.hadoop.yarn.api.records.ContainerId) NMContainerStatus(org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus) ResourceRequest(org.apache.hadoop.yarn.api.records.ResourceRequest) Test(org.junit.Test)

Example 20 with NMContainerStatus

use of org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus in project hadoop by apache.

the class TestRMRestart method testRMRestart.

@SuppressWarnings("rawtypes")
@Test(timeout = 180000)
public void testRMRestart() throws Exception {
    conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, YarnConfiguration.DEFAULT_RM_AM_MAX_ATTEMPTS);
    MemoryRMStateStore memStore = new MemoryRMStateStore();
    memStore.init(conf);
    RMState rmState = memStore.getState();
    Map<ApplicationId, ApplicationStateData> rmAppState = rmState.getApplicationState();
    // PHASE 1: create state in an RM
    // start RM
    MockRM rm1 = createMockRM(conf, memStore);
    // start like normal because state is empty
    rm1.start();
    MockNM nm1 = new MockNM("127.0.0.1:1234", 15120, rm1.getResourceTrackerService());
    MockNM nm2 = new MockNM("127.0.0.2:5678", 15120, rm1.getResourceTrackerService());
    nm1.registerNode();
    // nm2 will not heartbeat with RM1
    nm2.registerNode();
    // create app that will finish and the final state should be saved.
    RMApp app0 = rm1.submitApp(200);
    RMAppAttempt attempt0 = app0.getCurrentAppAttempt();
    // spot check that app is saved
    Assert.assertEquals(1, rmAppState.size());
    nm1.nodeHeartbeat(true);
    MockAM am0 = rm1.sendAMLaunched(attempt0.getAppAttemptId());
    am0.registerAppAttempt();
    finishApplicationMaster(app0, rm1, nm1, am0);
    // create app that gets launched and does allocate before RM restart
    RMApp app1 = rm1.submitApp(200);
    // assert app1 info is saved
    ApplicationStateData appState = rmAppState.get(app1.getApplicationId());
    Assert.assertNotNull(appState);
    Assert.assertEquals(0, appState.getAttemptCount());
    Assert.assertEquals(appState.getApplicationSubmissionContext().getApplicationId(), app1.getApplicationSubmissionContext().getApplicationId());
    //kick the scheduling to allocate AM container
    nm1.nodeHeartbeat(true);
    // assert app1 attempt is saved
    RMAppAttempt attempt1 = app1.getCurrentAppAttempt();
    ApplicationAttemptId attemptId1 = attempt1.getAppAttemptId();
    rm1.waitForState(attemptId1, RMAppAttemptState.ALLOCATED);
    Assert.assertEquals(1, appState.getAttemptCount());
    ApplicationAttemptStateData attemptState = appState.getAttempt(attemptId1);
    Assert.assertNotNull(attemptState);
    Assert.assertEquals(BuilderUtils.newContainerId(attemptId1, 1), attemptState.getMasterContainer().getId());
    // launch the AM
    MockAM am1 = rm1.sendAMLaunched(attempt1.getAppAttemptId());
    am1.registerAppAttempt();
    // AM request for containers
    am1.allocate("127.0.0.1", 1000, 1, new ArrayList<ContainerId>());
    // kick the scheduler
    nm1.nodeHeartbeat(true);
    List<Container> conts = am1.allocate(new ArrayList<ResourceRequest>(), new ArrayList<ContainerId>()).getAllocatedContainers();
    while (conts.size() == 0) {
        nm1.nodeHeartbeat(true);
        conts.addAll(am1.allocate(new ArrayList<ResourceRequest>(), new ArrayList<ContainerId>()).getAllocatedContainers());
        Thread.sleep(500);
    }
    // create app that does not get launched by RM before RM restart
    RMApp app2 = rm1.submitApp(200);
    // assert app2 info is saved
    appState = rmAppState.get(app2.getApplicationId());
    Assert.assertNotNull(appState);
    Assert.assertEquals(0, appState.getAttemptCount());
    Assert.assertEquals(appState.getApplicationSubmissionContext().getApplicationId(), app2.getApplicationSubmissionContext().getApplicationId());
    // create unmanaged app
    RMApp appUnmanaged = rm1.submitApp(200, "someApp", "someUser", null, true, null, conf.getInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, YarnConfiguration.DEFAULT_RM_AM_MAX_ATTEMPTS), null);
    ApplicationAttemptId unmanagedAttemptId = appUnmanaged.getCurrentAppAttempt().getAppAttemptId();
    // assert appUnmanaged info is saved
    ApplicationId unmanagedAppId = appUnmanaged.getApplicationId();
    appState = rmAppState.get(unmanagedAppId);
    Assert.assertNotNull(appState);
    // wait for attempt to reach LAUNCHED state 
    rm1.waitForState(unmanagedAttemptId, RMAppAttemptState.LAUNCHED);
    rm1.waitForState(unmanagedAppId, RMAppState.ACCEPTED);
    // assert unmanaged attempt info is saved
    Assert.assertEquals(1, appState.getAttemptCount());
    Assert.assertEquals(appState.getApplicationSubmissionContext().getApplicationId(), appUnmanaged.getApplicationSubmissionContext().getApplicationId());
    // PHASE 2: create new RM and start from old state
    // create new RM to represent restart and recover state
    MockRM rm2 = createMockRM(conf, memStore);
    // start new RM
    rm2.start();
    // change NM to point to new RM
    nm1.setResourceTrackerService(rm2.getResourceTrackerService());
    nm2.setResourceTrackerService(rm2.getResourceTrackerService());
    // verify load of old state
    // 4 apps are loaded.
    // FINISHED app and attempt is also loaded back.
    // Unmanaged app state is still loaded back but it cannot be restarted by
    // the RM. this will change with work preserving RM restart in which AMs/NMs
    // are not rebooted.
    Assert.assertEquals(4, rm2.getRMContext().getRMApps().size());
    // check that earlier finished app and attempt is also loaded back and move
    // to finished state.
    rm2.waitForState(app0.getApplicationId(), RMAppState.FINISHED);
    rm2.waitForState(am0.getApplicationAttemptId(), RMAppAttemptState.FINISHED);
    // verify correct number of attempts and other data
    RMApp loadedApp1 = rm2.getRMContext().getRMApps().get(app1.getApplicationId());
    Assert.assertNotNull(loadedApp1);
    Assert.assertEquals(1, loadedApp1.getAppAttempts().size());
    Assert.assertEquals(app1.getApplicationSubmissionContext().getApplicationId(), loadedApp1.getApplicationSubmissionContext().getApplicationId());
    RMApp loadedApp2 = rm2.getRMContext().getRMApps().get(app2.getApplicationId());
    Assert.assertNotNull(loadedApp2);
    //Assert.assertEquals(0, loadedApp2.getAppAttempts().size());
    Assert.assertEquals(app2.getApplicationSubmissionContext().getApplicationId(), loadedApp2.getApplicationSubmissionContext().getApplicationId());
    // verify state machine kicked into expected states
    rm2.waitForState(loadedApp1.getApplicationId(), RMAppState.ACCEPTED);
    rm2.waitForState(loadedApp2.getApplicationId(), RMAppState.ACCEPTED);
    // verify attempts for apps
    // The app for which AM was started will wait for previous am
    // container finish event to arrive. However for an application for which
    // no am container was running will start new application attempt.
    Assert.assertEquals(1, loadedApp1.getAppAttempts().size());
    Assert.assertEquals(1, loadedApp2.getAppAttempts().size());
    // verify old AM is not accepted
    // change running AM to talk to new RM
    am1.setAMRMProtocol(rm2.getApplicationMasterService(), rm2.getRMContext());
    try {
        am1.allocate(new ArrayList<ResourceRequest>(), new ArrayList<ContainerId>());
        Assert.fail();
    } catch (ApplicationAttemptNotFoundException e) {
        Assert.assertTrue(e instanceof ApplicationAttemptNotFoundException);
    }
    // NM should be rebooted on heartbeat, even first heartbeat for nm2
    NodeHeartbeatResponse hbResponse = nm1.nodeHeartbeat(true);
    Assert.assertEquals(NodeAction.RESYNC, hbResponse.getNodeAction());
    hbResponse = nm2.nodeHeartbeat(true);
    Assert.assertEquals(NodeAction.RESYNC, hbResponse.getNodeAction());
    // new NM to represent NM re-register
    nm1 = new MockNM("127.0.0.1:1234", 15120, rm2.getResourceTrackerService());
    nm2 = new MockNM("127.0.0.2:5678", 15120, rm2.getResourceTrackerService());
    NMContainerStatus status = TestRMRestart.createNMContainerStatus(loadedApp1.getCurrentAppAttempt().getAppAttemptId(), 1, ContainerState.COMPLETE);
    nm1.registerNode(Arrays.asList(status), null);
    nm2.registerNode();
    rm2.waitForState(loadedApp1.getApplicationId(), RMAppState.ACCEPTED);
    // wait for the 2nd attempt to be started.
    int timeoutSecs = 0;
    while (loadedApp1.getAppAttempts().size() != 2 && timeoutSecs++ < 40) {
        ;
        Thread.sleep(200);
    }
    // verify no more reboot response sent
    hbResponse = nm1.nodeHeartbeat(true);
    Assert.assertTrue(NodeAction.RESYNC != hbResponse.getNodeAction());
    hbResponse = nm2.nodeHeartbeat(true);
    Assert.assertTrue(NodeAction.RESYNC != hbResponse.getNodeAction());
    // assert app1 attempt is saved
    attempt1 = loadedApp1.getCurrentAppAttempt();
    attemptId1 = attempt1.getAppAttemptId();
    rm2.waitForState(attemptId1, RMAppAttemptState.ALLOCATED);
    appState = rmAppState.get(loadedApp1.getApplicationId());
    attemptState = appState.getAttempt(attemptId1);
    Assert.assertNotNull(attemptState);
    Assert.assertEquals(BuilderUtils.newContainerId(attemptId1, 1), attemptState.getMasterContainer().getId());
    // Nodes on which the AM's run 
    MockNM am1Node = nm1;
    if (attemptState.getMasterContainer().getNodeId().toString().contains("127.0.0.2")) {
        am1Node = nm2;
    }
    // assert app2 attempt is saved
    RMAppAttempt attempt2 = loadedApp2.getCurrentAppAttempt();
    ApplicationAttemptId attemptId2 = attempt2.getAppAttemptId();
    rm2.waitForState(attemptId2, RMAppAttemptState.ALLOCATED);
    appState = rmAppState.get(loadedApp2.getApplicationId());
    attemptState = appState.getAttempt(attemptId2);
    Assert.assertNotNull(attemptState);
    Assert.assertEquals(BuilderUtils.newContainerId(attemptId2, 1), attemptState.getMasterContainer().getId());
    MockNM am2Node = nm1;
    if (attemptState.getMasterContainer().getNodeId().toString().contains("127.0.0.2")) {
        am2Node = nm2;
    }
    // start the AM's
    am1 = rm2.sendAMLaunched(attempt1.getAppAttemptId());
    am1.registerAppAttempt();
    MockAM am2 = rm2.sendAMLaunched(attempt2.getAppAttemptId());
    am2.registerAppAttempt();
    //request for containers
    am1.allocate("127.0.0.1", 1000, 3, new ArrayList<ContainerId>());
    am2.allocate("127.0.0.2", 1000, 1, new ArrayList<ContainerId>());
    // verify container allocate continues to work
    nm1.nodeHeartbeat(true);
    nm2.nodeHeartbeat(true);
    conts = am1.allocate(new ArrayList<ResourceRequest>(), new ArrayList<ContainerId>()).getAllocatedContainers();
    while (conts.size() == 0) {
        nm1.nodeHeartbeat(true);
        nm2.nodeHeartbeat(true);
        conts.addAll(am1.allocate(new ArrayList<ResourceRequest>(), new ArrayList<ContainerId>()).getAllocatedContainers());
        Thread.sleep(500);
    }
    // finish the AMs
    finishApplicationMaster(loadedApp1, rm2, am1Node, am1);
    finishApplicationMaster(loadedApp2, rm2, am2Node, am2);
    // stop RM's
    rm2.stop();
    rm1.stop();
    // completed apps are not removed immediately after app finish
    // And finished app is also loaded back.
    Assert.assertEquals(4, rmAppState.size());
}
Also used : RMApp(org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp) RMAppAttempt(org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt) NodeHeartbeatResponse(org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse) ArrayList(java.util.ArrayList) ApplicationStateData(org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationStateData) ApplicationAttemptId(org.apache.hadoop.yarn.api.records.ApplicationAttemptId) ApplicationAttemptNotFoundException(org.apache.hadoop.yarn.exceptions.ApplicationAttemptNotFoundException) Container(org.apache.hadoop.yarn.api.records.Container) MemoryRMStateStore(org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore) ContainerId(org.apache.hadoop.yarn.api.records.ContainerId) NMContainerStatus(org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus) ResourceRequest(org.apache.hadoop.yarn.api.records.ResourceRequest) ApplicationId(org.apache.hadoop.yarn.api.records.ApplicationId) RMState(org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore.RMState) ApplicationAttemptStateData(org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationAttemptStateData) Test(org.junit.Test)

Aggregations

NMContainerStatus (org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus)39 RMApp (org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp)24 Test (org.junit.Test)24 MemoryRMStateStore (org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore)22 ContainerId (org.apache.hadoop.yarn.api.records.ContainerId)17 ApplicationId (org.apache.hadoop.yarn.api.records.ApplicationId)11 TestSecurityMockRM (org.apache.hadoop.yarn.server.resourcemanager.TestRMRestart.TestSecurityMockRM)11 ArrayList (java.util.ArrayList)10 RMAppAttempt (org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt)10 AbstractYarnScheduler (org.apache.hadoop.yarn.server.resourcemanager.scheduler.AbstractYarnScheduler)9 ApplicationAttemptId (org.apache.hadoop.yarn.api.records.ApplicationAttemptId)8 Container (org.apache.hadoop.yarn.api.records.Container)8 Resource (org.apache.hadoop.yarn.api.records.Resource)7 ContainerStatus (org.apache.hadoop.yarn.api.records.ContainerStatus)6 YarnConfiguration (org.apache.hadoop.yarn.conf.YarnConfiguration)6 MockNM (org.apache.hadoop.yarn.server.resourcemanager.MockNM)6 ResourceRequest (org.apache.hadoop.yarn.api.records.ResourceRequest)4 NodeHeartbeatResponse (org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse)4 ApplicationStateData (org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationStateData)4 RMNodeImpl (org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeImpl)4