Search in sources :

Example 1 with RMNodeImpl

use of org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeImpl in project hadoop by apache.

the class TestWorkPreservingRMRestart method testDynamicQueueRecovery.

// Test work preserving recovery of apps running under reservation.
// This involves:
// 1. Setting up a dynamic reservable queue,
// 2. Submitting an app to it,
// 3. Failing over RM,
// 4. Validating that the app is recovered post failover,
// 5. Check if all running containers are recovered,
// 6. Verify the scheduler state like attempt info,
// 7. Verify the queue/user metrics for the dynamic reservable queue.
@Test(timeout = 30000)
public void testDynamicQueueRecovery() throws Exception {
    conf.setBoolean(CapacitySchedulerConfiguration.ENABLE_USER_METRICS, true);
    conf.set(CapacitySchedulerConfiguration.RESOURCE_CALCULATOR_CLASS, DominantResourceCalculator.class.getName());
    // 1. Set up dynamic reservable queue.
    Configuration schedulerConf = getSchedulerDynamicConfiguration();
    int containerMemory = 1024;
    Resource containerResource = Resource.newInstance(containerMemory, 1);
    MemoryRMStateStore memStore = new MemoryRMStateStore();
    memStore.init(schedulerConf);
    rm1 = new MockRM(schedulerConf, memStore);
    rm1.start();
    MockNM nm1 = new MockNM("127.0.0.1:1234", 8192, rm1.getResourceTrackerService());
    nm1.registerNode();
    // 2. Run plan follower to update the added node & then submit app to
    // dynamic queue.
    rm1.getRMContext().getReservationSystem().synchronizePlan(ReservationSystemTestUtil.reservationQ, true);
    RMApp app1 = rm1.submitApp(200, "dynamicQApp", UserGroupInformation.getCurrentUser().getShortUserName(), null, ReservationSystemTestUtil.getReservationQueueName());
    MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
    // clear queue metrics
    rm1.clearQueueMetrics(app1);
    // 3. Fail over (restart) RM.
    rm2 = new MockRM(schedulerConf, memStore);
    rm2.start();
    nm1.setResourceTrackerService(rm2.getResourceTrackerService());
    // 4. Validate app is recovered post failover.
    RMApp recoveredApp1 = rm2.getRMContext().getRMApps().get(app1.getApplicationId());
    RMAppAttempt loadedAttempt1 = recoveredApp1.getCurrentAppAttempt();
    NMContainerStatus amContainer = TestRMRestart.createNMContainerStatus(am1.getApplicationAttemptId(), 1, ContainerState.RUNNING);
    NMContainerStatus runningContainer = TestRMRestart.createNMContainerStatus(am1.getApplicationAttemptId(), 2, ContainerState.RUNNING);
    NMContainerStatus completedContainer = TestRMRestart.createNMContainerStatus(am1.getApplicationAttemptId(), 3, ContainerState.COMPLETE);
    nm1.registerNode(Arrays.asList(amContainer, runningContainer, completedContainer), null);
    // Wait for RM to settle down on recovering containers.
    waitForNumContainersToRecover(2, rm2, am1.getApplicationAttemptId());
    Set<ContainerId> launchedContainers = ((RMNodeImpl) rm2.getRMContext().getRMNodes().get(nm1.getNodeId())).getLaunchedContainers();
    assertTrue(launchedContainers.contains(amContainer.getContainerId()));
    assertTrue(launchedContainers.contains(runningContainer.getContainerId()));
    // 5. Check RMContainers are re-recreated and the container state is
    // correct.
    rm2.waitForState(nm1, amContainer.getContainerId(), RMContainerState.RUNNING);
    rm2.waitForState(nm1, runningContainer.getContainerId(), RMContainerState.RUNNING);
    rm2.waitForContainerToComplete(loadedAttempt1, completedContainer);
    AbstractYarnScheduler scheduler = (AbstractYarnScheduler) rm2.getResourceScheduler();
    SchedulerNode schedulerNode1 = scheduler.getSchedulerNode(nm1.getNodeId());
    // ********* check scheduler node state.*******
    // 2 running containers.
    Resource usedResources = Resources.multiply(containerResource, 2);
    Resource nmResource = Resource.newInstance(nm1.getMemory(), nm1.getvCores());
    assertTrue(schedulerNode1.isValidContainer(amContainer.getContainerId()));
    assertTrue(schedulerNode1.isValidContainer(runningContainer.getContainerId()));
    assertFalse(schedulerNode1.isValidContainer(completedContainer.getContainerId()));
    // 2 launched containers, 1 completed container
    assertEquals(2, schedulerNode1.getNumContainers());
    assertEquals(Resources.subtract(nmResource, usedResources), schedulerNode1.getUnallocatedResource());
    assertEquals(usedResources, schedulerNode1.getAllocatedResource());
    Resource availableResources = Resources.subtract(nmResource, usedResources);
    // 6. Verify the scheduler state like attempt info.
    Map<ApplicationId, SchedulerApplication<SchedulerApplicationAttempt>> sa = ((AbstractYarnScheduler) rm2.getResourceScheduler()).getSchedulerApplications();
    SchedulerApplication<SchedulerApplicationAttempt> schedulerApp = sa.get(recoveredApp1.getApplicationId());
    // 7. Verify the queue/user metrics for the dynamic reservable queue.
    if (getSchedulerType() == SchedulerType.CAPACITY) {
        checkCSQueue(rm2, schedulerApp, nmResource, nmResource, usedResources, 2);
    } else {
        checkFSQueue(rm2, schedulerApp, usedResources, availableResources);
    }
    // *********** check scheduler attempt state.********
    SchedulerApplicationAttempt schedulerAttempt = schedulerApp.getCurrentAppAttempt();
    assertTrue(schedulerAttempt.getLiveContainers().contains(scheduler.getRMContainer(amContainer.getContainerId())));
    assertTrue(schedulerAttempt.getLiveContainers().contains(scheduler.getRMContainer(runningContainer.getContainerId())));
    assertEquals(schedulerAttempt.getCurrentConsumption(), usedResources);
    // *********** check appSchedulingInfo state ***********
    assertEquals((1L << 40) + 1L, schedulerAttempt.getNewContainerId());
}
Also used : RMApp(org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp) RMAppAttempt(org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt) CapacitySchedulerConfiguration(org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacitySchedulerConfiguration) FairSchedulerConfiguration(org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairSchedulerConfiguration) Configuration(org.apache.hadoop.conf.Configuration) YarnConfiguration(org.apache.hadoop.yarn.conf.YarnConfiguration) AbstractYarnScheduler(org.apache.hadoop.yarn.server.resourcemanager.scheduler.AbstractYarnScheduler) SchedulerNode(org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerNode) SchedulerApplication(org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplication) DominantResourceCalculator(org.apache.hadoop.yarn.util.resource.DominantResourceCalculator) Resource(org.apache.hadoop.yarn.api.records.Resource) TestSecurityMockRM(org.apache.hadoop.yarn.server.resourcemanager.TestRMRestart.TestSecurityMockRM) MemoryRMStateStore(org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore) ContainerId(org.apache.hadoop.yarn.api.records.ContainerId) NMContainerStatus(org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus) RMNodeImpl(org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeImpl) ApplicationId(org.apache.hadoop.yarn.api.records.ApplicationId) SchedulerApplicationAttempt(org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplicationAttempt) Test(org.junit.Test)

Example 2 with RMNodeImpl

use of org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeImpl in project hadoop by apache.

the class TestContainerResizing method testSimpleDecreaseContainer.

@Test
public void testSimpleDecreaseContainer() throws Exception {
    /**
     * Application has a container running, try to decrease the container and
     * check queue's usage and container resource will be updated.
     */
    MockRM rm1 = new MockRM() {

        @Override
        public RMNodeLabelsManager createNodeLabelManager() {
            return mgr;
        }
    };
    rm1.start();
    MockNM nm1 = rm1.registerNode("h1:1234", 20 * GB);
    // app1 -> a1
    RMApp app1 = rm1.submitApp(3 * GB, "app", "user", null, "default");
    MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
    FiCaSchedulerApp app = TestUtils.getFiCaSchedulerApp(rm1, app1.getApplicationId());
    checkUsedResource(rm1, "default", 3 * GB, null);
    Assert.assertEquals(3 * GB, app.getAppAttemptResourceUsage().getUsed().getMemorySize());
    ContainerId containerId1 = ContainerId.newContainerId(am1.getApplicationAttemptId(), 1);
    sentRMContainerLaunched(rm1, containerId1);
    // am1 asks to change its AM container from 1GB to 3GB
    AllocateResponse response = am1.sendContainerResizingRequest(Arrays.asList(UpdateContainerRequest.newInstance(0, containerId1, ContainerUpdateType.DECREASE_RESOURCE, Resources.createResource(1 * GB), null)));
    verifyContainerDecreased(response, containerId1, 1 * GB);
    checkUsedResource(rm1, "default", 1 * GB, null);
    Assert.assertEquals(1 * GB, app.getAppAttemptResourceUsage().getUsed().getMemorySize());
    // Check if decreased containers added to RMNode
    RMNodeImpl rmNode = (RMNodeImpl) rm1.getRMContext().getRMNodes().get(nm1.getNodeId());
    Collection<Container> decreasedContainers = rmNode.getToBeDecreasedContainers();
    boolean rmNodeReceivedDecreaseContainer = false;
    for (Container c : decreasedContainers) {
        if (c.getId().equals(containerId1) && c.getResource().equals(Resources.createResource(1 * GB))) {
            rmNodeReceivedDecreaseContainer = true;
        }
    }
    Assert.assertTrue(rmNodeReceivedDecreaseContainer);
    rm1.close();
}
Also used : AllocateResponse(org.apache.hadoop.yarn.api.protocolrecords.AllocateResponse) RMApp(org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp) UpdatedContainer(org.apache.hadoop.yarn.api.records.UpdatedContainer) RMContainer(org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer) Container(org.apache.hadoop.yarn.api.records.Container) ContainerId(org.apache.hadoop.yarn.api.records.ContainerId) MockNM(org.apache.hadoop.yarn.server.resourcemanager.MockNM) FiCaSchedulerApp(org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp) MockAM(org.apache.hadoop.yarn.server.resourcemanager.MockAM) MockRM(org.apache.hadoop.yarn.server.resourcemanager.MockRM) RMNodeImpl(org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeImpl) Test(org.junit.Test)

Example 3 with RMNodeImpl

use of org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeImpl in project hadoop by apache.

the class TestOpportunisticContainerAllocatorAMService method testContainerPromoteAndDemoteBeforeContainerStart.

@Test(timeout = 600000)
public void testContainerPromoteAndDemoteBeforeContainerStart() throws Exception {
    HashMap<NodeId, MockNM> nodes = new HashMap<>();
    MockNM nm1 = new MockNM("h1:1234", 4096, rm.getResourceTrackerService());
    nodes.put(nm1.getNodeId(), nm1);
    MockNM nm2 = new MockNM("h1:4321", 4096, rm.getResourceTrackerService());
    nodes.put(nm2.getNodeId(), nm2);
    MockNM nm3 = new MockNM("h2:1234", 4096, rm.getResourceTrackerService());
    nodes.put(nm3.getNodeId(), nm3);
    MockNM nm4 = new MockNM("h2:4321", 4096, rm.getResourceTrackerService());
    nodes.put(nm4.getNodeId(), nm4);
    nm1.registerNode();
    nm2.registerNode();
    nm3.registerNode();
    nm4.registerNode();
    OpportunisticContainerAllocatorAMService amservice = (OpportunisticContainerAllocatorAMService) rm.getApplicationMasterService();
    RMApp app1 = rm.submitApp(1 * GB, "app", "user", null, "default");
    ApplicationAttemptId attemptId = app1.getCurrentAppAttempt().getAppAttemptId();
    MockAM am1 = MockRM.launchAndRegisterAM(app1, rm, nm2);
    ResourceScheduler scheduler = rm.getResourceScheduler();
    RMNode rmNode1 = rm.getRMContext().getRMNodes().get(nm1.getNodeId());
    RMNode rmNode2 = rm.getRMContext().getRMNodes().get(nm2.getNodeId());
    RMNode rmNode3 = rm.getRMContext().getRMNodes().get(nm3.getNodeId());
    RMNode rmNode4 = rm.getRMContext().getRMNodes().get(nm4.getNodeId());
    nm1.nodeHeartbeat(true);
    nm2.nodeHeartbeat(true);
    nm3.nodeHeartbeat(true);
    nm4.nodeHeartbeat(true);
    ((RMNodeImpl) rmNode1).setOpportunisticContainersStatus(getOppurtunisticStatus(-1, 100));
    ((RMNodeImpl) rmNode2).setOpportunisticContainersStatus(getOppurtunisticStatus(-1, 100));
    ((RMNodeImpl) rmNode3).setOpportunisticContainersStatus(getOppurtunisticStatus(-1, 100));
    ((RMNodeImpl) rmNode4).setOpportunisticContainersStatus(getOppurtunisticStatus(-1, 100));
    OpportunisticContainerContext ctxt = ((CapacityScheduler) scheduler).getApplicationAttempt(attemptId).getOpportunisticContainerContext();
    // Send add and update node events to AM Service.
    amservice.handle(new NodeAddedSchedulerEvent(rmNode1));
    amservice.handle(new NodeAddedSchedulerEvent(rmNode2));
    amservice.handle(new NodeAddedSchedulerEvent(rmNode3));
    amservice.handle(new NodeAddedSchedulerEvent(rmNode4));
    amservice.handle(new NodeUpdateSchedulerEvent(rmNode1));
    amservice.handle(new NodeUpdateSchedulerEvent(rmNode2));
    amservice.handle(new NodeUpdateSchedulerEvent(rmNode3));
    amservice.handle(new NodeUpdateSchedulerEvent(rmNode4));
    // All nodes 1 - 4 will be applicable for scheduling.
    nm1.nodeHeartbeat(true);
    nm2.nodeHeartbeat(true);
    nm3.nodeHeartbeat(true);
    nm4.nodeHeartbeat(true);
    Thread.sleep(1000);
    QueueMetrics metrics = ((CapacityScheduler) scheduler).getRootQueue().getMetrics();
    // Verify Metrics
    verifyMetrics(metrics, 15360, 15, 1024, 1, 1);
    AllocateResponse allocateResponse = am1.allocate(Arrays.asList(ResourceRequest.newInstance(Priority.newInstance(1), "*", Resources.createResource(1 * GB), 2, true, null, ExecutionTypeRequest.newInstance(ExecutionType.OPPORTUNISTIC, true))), null);
    List<Container> allocatedContainers = allocateResponse.getAllocatedContainers();
    Assert.assertEquals(2, allocatedContainers.size());
    Container container = allocatedContainers.get(0);
    MockNM allocNode = nodes.get(container.getNodeId());
    MockNM sameHostDiffNode = null;
    for (NodeId n : nodes.keySet()) {
        if (n.getHost().equals(allocNode.getNodeId().getHost()) && n.getPort() != allocNode.getNodeId().getPort()) {
            sameHostDiffNode = nodes.get(n);
        }
    }
    // Verify Metrics After OPP allocation (Nothing should change)
    verifyMetrics(metrics, 15360, 15, 1024, 1, 1);
    am1.sendContainerUpdateRequest(Arrays.asList(UpdateContainerRequest.newInstance(0, container.getId(), ContainerUpdateType.PROMOTE_EXECUTION_TYPE, null, ExecutionType.GUARANTEED)));
    // Node on same host should not result in allocation
    sameHostDiffNode.nodeHeartbeat(true);
    Thread.sleep(200);
    allocateResponse = am1.allocate(new ArrayList<>(), new ArrayList<>());
    Assert.assertEquals(0, allocateResponse.getUpdatedContainers().size());
    // Verify Metrics After OPP allocation (Nothing should change again)
    verifyMetrics(metrics, 15360, 15, 1024, 1, 1);
    // Send Promotion req again... this should result in update error
    allocateResponse = am1.sendContainerUpdateRequest(Arrays.asList(UpdateContainerRequest.newInstance(0, container.getId(), ContainerUpdateType.PROMOTE_EXECUTION_TYPE, null, ExecutionType.GUARANTEED)));
    Assert.assertEquals(0, allocateResponse.getUpdatedContainers().size());
    Assert.assertEquals(1, allocateResponse.getUpdateErrors().size());
    Assert.assertEquals("UPDATE_OUTSTANDING_ERROR", allocateResponse.getUpdateErrors().get(0).getReason());
    Assert.assertEquals(container.getId(), allocateResponse.getUpdateErrors().get(0).getUpdateContainerRequest().getContainerId());
    // Send Promotion req again with incorrect version...
    // this should also result in update error
    allocateResponse = am1.sendContainerUpdateRequest(Arrays.asList(UpdateContainerRequest.newInstance(1, container.getId(), ContainerUpdateType.PROMOTE_EXECUTION_TYPE, null, ExecutionType.GUARANTEED)));
    Assert.assertEquals(0, allocateResponse.getUpdatedContainers().size());
    Assert.assertEquals(1, allocateResponse.getUpdateErrors().size());
    Assert.assertEquals("INCORRECT_CONTAINER_VERSION_ERROR", allocateResponse.getUpdateErrors().get(0).getReason());
    Assert.assertEquals(0, allocateResponse.getUpdateErrors().get(0).getCurrentContainerVersion());
    Assert.assertEquals(container.getId(), allocateResponse.getUpdateErrors().get(0).getUpdateContainerRequest().getContainerId());
    // Ensure after correct node heartbeats, we should get the allocation
    allocNode.nodeHeartbeat(true);
    Thread.sleep(200);
    allocateResponse = am1.allocate(new ArrayList<>(), new ArrayList<>());
    Assert.assertEquals(1, allocateResponse.getUpdatedContainers().size());
    Container uc = allocateResponse.getUpdatedContainers().get(0).getContainer();
    Assert.assertEquals(ExecutionType.GUARANTEED, uc.getExecutionType());
    Assert.assertEquals(uc.getId(), container.getId());
    Assert.assertEquals(uc.getVersion(), container.getVersion() + 1);
    // Verify Metrics After OPP allocation :
    // Allocated cores+mem should have increased, available should decrease
    verifyMetrics(metrics, 14336, 14, 2048, 2, 2);
    nm1.nodeHeartbeat(true);
    nm2.nodeHeartbeat(true);
    nm3.nodeHeartbeat(true);
    nm4.nodeHeartbeat(true);
    Thread.sleep(200);
    // Verify that the container is still in ACQUIRED state wrt the RM.
    RMContainer rmContainer = ((CapacityScheduler) scheduler).getApplicationAttempt(uc.getId().getApplicationAttemptId()).getRMContainer(uc.getId());
    Assert.assertEquals(RMContainerState.ACQUIRED, rmContainer.getState());
    // Now demote the container back..
    allocateResponse = am1.sendContainerUpdateRequest(Arrays.asList(UpdateContainerRequest.newInstance(uc.getVersion(), uc.getId(), ContainerUpdateType.DEMOTE_EXECUTION_TYPE, null, ExecutionType.OPPORTUNISTIC)));
    // This should happen in the same heartbeat..
    Assert.assertEquals(1, allocateResponse.getUpdatedContainers().size());
    uc = allocateResponse.getUpdatedContainers().get(0).getContainer();
    Assert.assertEquals(ExecutionType.OPPORTUNISTIC, uc.getExecutionType());
    Assert.assertEquals(uc.getId(), container.getId());
    Assert.assertEquals(uc.getVersion(), container.getVersion() + 2);
    // Verify Metrics After OPP allocation :
    // Everything should have reverted to what it was
    verifyMetrics(metrics, 15360, 15, 1024, 1, 1);
}
Also used : RMApp(org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp) NodeUpdateSchedulerEvent(org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeUpdateSchedulerEvent) NodeAddedSchedulerEvent(org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeAddedSchedulerEvent) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) ApplicationAttemptId(org.apache.hadoop.yarn.api.records.ApplicationAttemptId) RMContainer(org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer) DistributedSchedulingAllocateResponse(org.apache.hadoop.yarn.server.api.protocolrecords.DistributedSchedulingAllocateResponse) AllocateResponse(org.apache.hadoop.yarn.api.protocolrecords.AllocateResponse) QueueMetrics(org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics) RMNode(org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode) RMContainer(org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer) Container(org.apache.hadoop.yarn.api.records.Container) NodeId(org.apache.hadoop.yarn.api.records.NodeId) ResourceScheduler(org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceScheduler) OpportunisticContainerContext(org.apache.hadoop.yarn.server.scheduler.OpportunisticContainerContext) RMNodeImpl(org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeImpl) Test(org.junit.Test)

Example 4 with RMNodeImpl

use of org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeImpl in project hadoop by apache.

the class TestApplicationLifetimeMonitor method testApplicationLifetimeOnRMRestart.

@Test(timeout = 180000)
public void testApplicationLifetimeOnRMRestart() throws Exception {
    conf.setBoolean(YarnConfiguration.RECOVERY_ENABLED, true);
    conf.setBoolean(YarnConfiguration.RM_WORK_PRESERVING_RECOVERY_ENABLED, true);
    conf.set(YarnConfiguration.RM_STORE, MemoryRMStateStore.class.getName());
    MemoryRMStateStore memStore = new MemoryRMStateStore();
    memStore.init(conf);
    MockRM rm1 = new MockRM(conf, memStore);
    rm1.start();
    MockNM nm1 = new MockNM("127.0.0.1:1234", 8192, rm1.getResourceTrackerService());
    nm1.registerNode();
    nm1.nodeHeartbeat(true);
    long appLifetime = 60L;
    Map<ApplicationTimeoutType, Long> timeouts = new HashMap<ApplicationTimeoutType, Long>();
    timeouts.put(ApplicationTimeoutType.LIFETIME, appLifetime);
    RMApp app1 = rm1.submitApp(200, Priority.newInstance(0), timeouts);
    MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
    // Re-start RM
    MockRM rm2 = new MockRM(conf, memStore);
    // make sure app has been unregistered with old RM else both will trigger
    // Expire event
    rm1.getRMContext().getRMAppLifetimeMonitor().unregisterApp(app1.getApplicationId(), ApplicationTimeoutType.LIFETIME);
    rm2.start();
    nm1.setResourceTrackerService(rm2.getResourceTrackerService());
    // recover app
    RMApp recoveredApp1 = rm2.getRMContext().getRMApps().get(app1.getApplicationId());
    NMContainerStatus amContainer = TestRMRestart.createNMContainerStatus(am1.getApplicationAttemptId(), 1, ContainerState.RUNNING);
    NMContainerStatus runningContainer = TestRMRestart.createNMContainerStatus(am1.getApplicationAttemptId(), 2, ContainerState.RUNNING);
    nm1.registerNode(Arrays.asList(amContainer, runningContainer), null);
    // Wait for RM to settle down on recovering containers;
    TestWorkPreservingRMRestart.waitForNumContainersToRecover(2, rm2, am1.getApplicationAttemptId());
    Set<ContainerId> launchedContainers = ((RMNodeImpl) rm2.getRMContext().getRMNodes().get(nm1.getNodeId())).getLaunchedContainers();
    assertTrue(launchedContainers.contains(amContainer.getContainerId()));
    assertTrue(launchedContainers.contains(runningContainer.getContainerId()));
    // check RMContainers are re-recreated and the container state is correct.
    rm2.waitForState(nm1, amContainer.getContainerId(), RMContainerState.RUNNING);
    rm2.waitForState(nm1, runningContainer.getContainerId(), RMContainerState.RUNNING);
    // re register attempt to rm2
    rm2.waitForState(recoveredApp1.getApplicationId(), RMAppState.ACCEPTED);
    am1.setAMRMProtocol(rm2.getApplicationMasterService(), rm2.getRMContext());
    am1.registerAppAttempt();
    rm2.waitForState(recoveredApp1.getApplicationId(), RMAppState.RUNNING);
    // wait for app life time and application to be in killed state.
    rm2.waitForState(recoveredApp1.getApplicationId(), RMAppState.KILLED);
    Assert.assertTrue("Application killed before lifetime value", recoveredApp1.getFinishTime() > (recoveredApp1.getSubmitTime() + appLifetime * 1000));
}
Also used : HashMap(java.util.HashMap) MockNM(org.apache.hadoop.yarn.server.resourcemanager.MockNM) ApplicationTimeoutType(org.apache.hadoop.yarn.api.records.ApplicationTimeoutType) MockRM(org.apache.hadoop.yarn.server.resourcemanager.MockRM) MemoryRMStateStore(org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore) ContainerId(org.apache.hadoop.yarn.api.records.ContainerId) NMContainerStatus(org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus) MockAM(org.apache.hadoop.yarn.server.resourcemanager.MockAM) RMNodeImpl(org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeImpl) Test(org.junit.Test)

Example 5 with RMNodeImpl

use of org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeImpl in project hadoop by apache.

the class TestWorkPreservingRMRestart method testSchedulerRecovery.

// Test common scheduler state including SchedulerAttempt, SchedulerNode,
// AppSchedulingInfo can be reconstructed via the container recovery reports
// on NM re-registration.
// Also test scheduler specific changes: i.e. Queue recovery-
// CSQueue/FSQueue/FifoQueue recovery respectively.
// Test Strategy: send 3 container recovery reports(AMContainer, running
// container, completed container) on NM re-registration, check the states of
// SchedulerAttempt, SchedulerNode etc. are updated accordingly.
@Test(timeout = 20000)
public void testSchedulerRecovery() throws Exception {
    conf.setBoolean(CapacitySchedulerConfiguration.ENABLE_USER_METRICS, true);
    conf.set(CapacitySchedulerConfiguration.RESOURCE_CALCULATOR_CLASS, DominantResourceCalculator.class.getName());
    int containerMemory = 1024;
    Resource containerResource = Resource.newInstance(containerMemory, 1);
    MemoryRMStateStore memStore = new MemoryRMStateStore();
    memStore.init(conf);
    rm1 = new MockRM(conf, memStore);
    rm1.start();
    MockNM nm1 = new MockNM("127.0.0.1:1234", 8192, rm1.getResourceTrackerService());
    nm1.registerNode();
    RMApp app1 = rm1.submitApp(200);
    MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
    // clear queue metrics
    rm1.clearQueueMetrics(app1);
    // Re-start RM
    rm2 = new MockRM(conf, memStore);
    rm2.start();
    nm1.setResourceTrackerService(rm2.getResourceTrackerService());
    // recover app
    RMApp recoveredApp1 = rm2.getRMContext().getRMApps().get(app1.getApplicationId());
    RMAppAttempt loadedAttempt1 = recoveredApp1.getCurrentAppAttempt();
    NMContainerStatus amContainer = TestRMRestart.createNMContainerStatus(am1.getApplicationAttemptId(), 1, ContainerState.RUNNING);
    NMContainerStatus runningContainer = TestRMRestart.createNMContainerStatus(am1.getApplicationAttemptId(), 2, ContainerState.RUNNING);
    NMContainerStatus completedContainer = TestRMRestart.createNMContainerStatus(am1.getApplicationAttemptId(), 3, ContainerState.COMPLETE);
    nm1.registerNode(Arrays.asList(amContainer, runningContainer, completedContainer), null);
    // Wait for RM to settle down on recovering containers;
    waitForNumContainersToRecover(2, rm2, am1.getApplicationAttemptId());
    Set<ContainerId> launchedContainers = ((RMNodeImpl) rm2.getRMContext().getRMNodes().get(nm1.getNodeId())).getLaunchedContainers();
    assertTrue(launchedContainers.contains(amContainer.getContainerId()));
    assertTrue(launchedContainers.contains(runningContainer.getContainerId()));
    // check RMContainers are re-recreated and the container state is correct.
    rm2.waitForState(nm1, amContainer.getContainerId(), RMContainerState.RUNNING);
    rm2.waitForState(nm1, runningContainer.getContainerId(), RMContainerState.RUNNING);
    rm2.waitForContainerToComplete(loadedAttempt1, completedContainer);
    AbstractYarnScheduler scheduler = (AbstractYarnScheduler) rm2.getResourceScheduler();
    SchedulerNode schedulerNode1 = scheduler.getSchedulerNode(nm1.getNodeId());
    assertTrue("SchedulerNode#toString is not in expected format", schedulerNode1.toString().contains(schedulerNode1.getUnallocatedResource().toString()));
    assertTrue("SchedulerNode#toString is not in expected format", schedulerNode1.toString().contains(schedulerNode1.getAllocatedResource().toString()));
    // ********* check scheduler node state.*******
    // 2 running containers.
    Resource usedResources = Resources.multiply(containerResource, 2);
    Resource nmResource = Resource.newInstance(nm1.getMemory(), nm1.getvCores());
    assertTrue(schedulerNode1.isValidContainer(amContainer.getContainerId()));
    assertTrue(schedulerNode1.isValidContainer(runningContainer.getContainerId()));
    assertFalse(schedulerNode1.isValidContainer(completedContainer.getContainerId()));
    // 2 launched containers, 1 completed container
    assertEquals(2, schedulerNode1.getNumContainers());
    assertEquals(Resources.subtract(nmResource, usedResources), schedulerNode1.getUnallocatedResource());
    assertEquals(usedResources, schedulerNode1.getAllocatedResource());
    Resource availableResources = Resources.subtract(nmResource, usedResources);
    // ***** check queue state based on the underlying scheduler ********
    Map<ApplicationId, SchedulerApplication> schedulerApps = ((AbstractYarnScheduler) rm2.getResourceScheduler()).getSchedulerApplications();
    SchedulerApplication schedulerApp = schedulerApps.get(recoveredApp1.getApplicationId());
    if (getSchedulerType() == SchedulerType.CAPACITY) {
        checkCSQueue(rm2, schedulerApp, nmResource, nmResource, usedResources, 2);
    } else {
        checkFSQueue(rm2, schedulerApp, usedResources, availableResources);
    }
    // *********** check scheduler attempt state.********
    SchedulerApplicationAttempt schedulerAttempt = schedulerApp.getCurrentAppAttempt();
    assertTrue(schedulerAttempt.getLiveContainers().contains(scheduler.getRMContainer(amContainer.getContainerId())));
    assertTrue(schedulerAttempt.getLiveContainers().contains(scheduler.getRMContainer(runningContainer.getContainerId())));
    assertEquals(schedulerAttempt.getCurrentConsumption(), usedResources);
    // *********** check appSchedulingInfo state ***********
    assertEquals((1L << 40) + 1L, schedulerAttempt.getNewContainerId());
}
Also used : RMApp(org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp) RMAppAttempt(org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt) AbstractYarnScheduler(org.apache.hadoop.yarn.server.resourcemanager.scheduler.AbstractYarnScheduler) SchedulerNode(org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerNode) SchedulerApplication(org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplication) DominantResourceCalculator(org.apache.hadoop.yarn.util.resource.DominantResourceCalculator) Resource(org.apache.hadoop.yarn.api.records.Resource) TestSecurityMockRM(org.apache.hadoop.yarn.server.resourcemanager.TestRMRestart.TestSecurityMockRM) MemoryRMStateStore(org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore) ContainerId(org.apache.hadoop.yarn.api.records.ContainerId) NMContainerStatus(org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus) RMNodeImpl(org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeImpl) ApplicationId(org.apache.hadoop.yarn.api.records.ApplicationId) SchedulerApplicationAttempt(org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplicationAttempt) Test(org.junit.Test)

Aggregations

RMNodeImpl (org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeImpl)61 Test (org.junit.Test)45 NodeId (org.apache.hadoop.yarn.api.records.NodeId)20 RMNodeEvent (org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeEvent)20 RMApp (org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp)12 Resource (org.apache.hadoop.yarn.api.records.Resource)11 RMNodeStatusEvent (org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeStatusEvent)11 ContainerId (org.apache.hadoop.yarn.api.records.ContainerId)9 RMNode (org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode)9 ApplicationId (org.apache.hadoop.yarn.api.records.ApplicationId)8 ContainerStatus (org.apache.hadoop.yarn.api.records.ContainerStatus)8 ArrayList (java.util.ArrayList)7 NodeStatus (org.apache.hadoop.yarn.server.api.records.NodeStatus)7 RMNodeStartedEvent (org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeStartedEvent)7 NodeHealthStatus (org.apache.hadoop.yarn.server.api.records.NodeHealthStatus)6 ApplicationAttemptId (org.apache.hadoop.yarn.api.records.ApplicationAttemptId)5 NodeAddedSchedulerEvent (org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeAddedSchedulerEvent)5 NodeUpdateSchedulerEvent (org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeUpdateSchedulerEvent)5 ClientResponse (com.sun.jersey.api.client.ClientResponse)4 WebResource (com.sun.jersey.api.client.WebResource)4