Search in sources :

Example 1 with NMContainerStatus

use of org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus in project hadoop by apache.

the class TestWorkPreservingRMRestart method testDynamicQueueRecovery.

// Test work preserving recovery of apps running under reservation.
// This involves:
// 1. Setting up a dynamic reservable queue,
// 2. Submitting an app to it,
// 3. Failing over RM,
// 4. Validating that the app is recovered post failover,
// 5. Check if all running containers are recovered,
// 6. Verify the scheduler state like attempt info,
// 7. Verify the queue/user metrics for the dynamic reservable queue.
@Test(timeout = 30000)
public void testDynamicQueueRecovery() throws Exception {
    conf.setBoolean(CapacitySchedulerConfiguration.ENABLE_USER_METRICS, true);
    conf.set(CapacitySchedulerConfiguration.RESOURCE_CALCULATOR_CLASS, DominantResourceCalculator.class.getName());
    // 1. Set up dynamic reservable queue.
    Configuration schedulerConf = getSchedulerDynamicConfiguration();
    int containerMemory = 1024;
    Resource containerResource = Resource.newInstance(containerMemory, 1);
    MemoryRMStateStore memStore = new MemoryRMStateStore();
    memStore.init(schedulerConf);
    rm1 = new MockRM(schedulerConf, memStore);
    rm1.start();
    MockNM nm1 = new MockNM("127.0.0.1:1234", 8192, rm1.getResourceTrackerService());
    nm1.registerNode();
    // 2. Run plan follower to update the added node & then submit app to
    // dynamic queue.
    rm1.getRMContext().getReservationSystem().synchronizePlan(ReservationSystemTestUtil.reservationQ, true);
    RMApp app1 = rm1.submitApp(200, "dynamicQApp", UserGroupInformation.getCurrentUser().getShortUserName(), null, ReservationSystemTestUtil.getReservationQueueName());
    MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
    // clear queue metrics
    rm1.clearQueueMetrics(app1);
    // 3. Fail over (restart) RM.
    rm2 = new MockRM(schedulerConf, memStore);
    rm2.start();
    nm1.setResourceTrackerService(rm2.getResourceTrackerService());
    // 4. Validate app is recovered post failover.
    RMApp recoveredApp1 = rm2.getRMContext().getRMApps().get(app1.getApplicationId());
    RMAppAttempt loadedAttempt1 = recoveredApp1.getCurrentAppAttempt();
    NMContainerStatus amContainer = TestRMRestart.createNMContainerStatus(am1.getApplicationAttemptId(), 1, ContainerState.RUNNING);
    NMContainerStatus runningContainer = TestRMRestart.createNMContainerStatus(am1.getApplicationAttemptId(), 2, ContainerState.RUNNING);
    NMContainerStatus completedContainer = TestRMRestart.createNMContainerStatus(am1.getApplicationAttemptId(), 3, ContainerState.COMPLETE);
    nm1.registerNode(Arrays.asList(amContainer, runningContainer, completedContainer), null);
    // Wait for RM to settle down on recovering containers.
    waitForNumContainersToRecover(2, rm2, am1.getApplicationAttemptId());
    Set<ContainerId> launchedContainers = ((RMNodeImpl) rm2.getRMContext().getRMNodes().get(nm1.getNodeId())).getLaunchedContainers();
    assertTrue(launchedContainers.contains(amContainer.getContainerId()));
    assertTrue(launchedContainers.contains(runningContainer.getContainerId()));
    // 5. Check RMContainers are re-recreated and the container state is
    // correct.
    rm2.waitForState(nm1, amContainer.getContainerId(), RMContainerState.RUNNING);
    rm2.waitForState(nm1, runningContainer.getContainerId(), RMContainerState.RUNNING);
    rm2.waitForContainerToComplete(loadedAttempt1, completedContainer);
    AbstractYarnScheduler scheduler = (AbstractYarnScheduler) rm2.getResourceScheduler();
    SchedulerNode schedulerNode1 = scheduler.getSchedulerNode(nm1.getNodeId());
    // ********* check scheduler node state.*******
    // 2 running containers.
    Resource usedResources = Resources.multiply(containerResource, 2);
    Resource nmResource = Resource.newInstance(nm1.getMemory(), nm1.getvCores());
    assertTrue(schedulerNode1.isValidContainer(amContainer.getContainerId()));
    assertTrue(schedulerNode1.isValidContainer(runningContainer.getContainerId()));
    assertFalse(schedulerNode1.isValidContainer(completedContainer.getContainerId()));
    // 2 launched containers, 1 completed container
    assertEquals(2, schedulerNode1.getNumContainers());
    assertEquals(Resources.subtract(nmResource, usedResources), schedulerNode1.getUnallocatedResource());
    assertEquals(usedResources, schedulerNode1.getAllocatedResource());
    Resource availableResources = Resources.subtract(nmResource, usedResources);
    // 6. Verify the scheduler state like attempt info.
    Map<ApplicationId, SchedulerApplication<SchedulerApplicationAttempt>> sa = ((AbstractYarnScheduler) rm2.getResourceScheduler()).getSchedulerApplications();
    SchedulerApplication<SchedulerApplicationAttempt> schedulerApp = sa.get(recoveredApp1.getApplicationId());
    // 7. Verify the queue/user metrics for the dynamic reservable queue.
    if (getSchedulerType() == SchedulerType.CAPACITY) {
        checkCSQueue(rm2, schedulerApp, nmResource, nmResource, usedResources, 2);
    } else {
        checkFSQueue(rm2, schedulerApp, usedResources, availableResources);
    }
    // *********** check scheduler attempt state.********
    SchedulerApplicationAttempt schedulerAttempt = schedulerApp.getCurrentAppAttempt();
    assertTrue(schedulerAttempt.getLiveContainers().contains(scheduler.getRMContainer(amContainer.getContainerId())));
    assertTrue(schedulerAttempt.getLiveContainers().contains(scheduler.getRMContainer(runningContainer.getContainerId())));
    assertEquals(schedulerAttempt.getCurrentConsumption(), usedResources);
    // *********** check appSchedulingInfo state ***********
    assertEquals((1L << 40) + 1L, schedulerAttempt.getNewContainerId());
}
Also used : RMApp(org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp) RMAppAttempt(org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt) CapacitySchedulerConfiguration(org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacitySchedulerConfiguration) FairSchedulerConfiguration(org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairSchedulerConfiguration) Configuration(org.apache.hadoop.conf.Configuration) YarnConfiguration(org.apache.hadoop.yarn.conf.YarnConfiguration) AbstractYarnScheduler(org.apache.hadoop.yarn.server.resourcemanager.scheduler.AbstractYarnScheduler) SchedulerNode(org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerNode) SchedulerApplication(org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplication) DominantResourceCalculator(org.apache.hadoop.yarn.util.resource.DominantResourceCalculator) Resource(org.apache.hadoop.yarn.api.records.Resource) TestSecurityMockRM(org.apache.hadoop.yarn.server.resourcemanager.TestRMRestart.TestSecurityMockRM) MemoryRMStateStore(org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore) ContainerId(org.apache.hadoop.yarn.api.records.ContainerId) NMContainerStatus(org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus) RMNodeImpl(org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeImpl) ApplicationId(org.apache.hadoop.yarn.api.records.ApplicationId) SchedulerApplicationAttempt(org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplicationAttempt) Test(org.junit.Test)

Example 2 with NMContainerStatus

use of org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus in project hadoop by apache.

the class TestWorkPreservingRMRestart method testAppFailedToRenewTokenOnRecovery.

@Test(timeout = 30000)
public void testAppFailedToRenewTokenOnRecovery() throws Exception {
    conf.set(CommonConfigurationKeysPublic.HADOOP_SECURITY_AUTHENTICATION, "kerberos");
    conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, 1);
    UserGroupInformation.setConfiguration(conf);
    MemoryRMStateStore memStore = new MemoryRMStateStore();
    memStore.init(conf);
    MockRM rm1 = new TestSecurityMockRM(conf, memStore);
    rm1.start();
    MockNM nm1 = new MockNM("127.0.0.1:1234", 8192, rm1.getResourceTrackerService());
    nm1.registerNode();
    RMApp app1 = rm1.submitApp(200);
    MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
    MockRM rm2 = new TestSecurityMockRM(conf, memStore) {

        protected DelegationTokenRenewer createDelegationTokenRenewer() {
            return new DelegationTokenRenewer() {

                @Override
                public void addApplicationSync(ApplicationId applicationId, Credentials ts, boolean shouldCancelAtEnd, String user) throws IOException {
                    throw new IOException("Token renew failed !!");
                }
            };
        }
    };
    nm1.setResourceTrackerService(rm2.getResourceTrackerService());
    rm2.start();
    NMContainerStatus containerStatus = TestRMRestart.createNMContainerStatus(am1.getApplicationAttemptId(), 1, ContainerState.RUNNING);
    nm1.registerNode(Arrays.asList(containerStatus), null);
    // am re-register
    rm2.waitForState(app1.getApplicationId(), RMAppState.ACCEPTED);
    am1.setAMRMProtocol(rm2.getApplicationMasterService(), rm2.getRMContext());
    am1.registerAppAttempt(true);
    rm2.waitForState(app1.getApplicationId(), RMAppState.RUNNING);
    // Because the token expired, am could crash.
    nm1.nodeHeartbeat(am1.getApplicationAttemptId(), 1, ContainerState.COMPLETE);
    rm2.waitForState(am1.getApplicationAttemptId(), RMAppAttemptState.FAILED);
    rm2.waitForState(app1.getApplicationId(), RMAppState.FAILED);
}
Also used : RMApp(org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp) DelegationTokenRenewer(org.apache.hadoop.yarn.server.resourcemanager.security.DelegationTokenRenewer) MemoryRMStateStore(org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore) NMContainerStatus(org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus) TestSecurityMockRM(org.apache.hadoop.yarn.server.resourcemanager.TestRMRestart.TestSecurityMockRM) IOException(java.io.IOException) ApplicationId(org.apache.hadoop.yarn.api.records.ApplicationId) Credentials(org.apache.hadoop.security.Credentials) TestSecurityMockRM(org.apache.hadoop.yarn.server.resourcemanager.TestRMRestart.TestSecurityMockRM) Test(org.junit.Test)

Example 3 with NMContainerStatus

use of org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus in project hadoop by apache.

the class TestWorkPreservingRMRestart method createNMContainerStatusForApp.

// create 3 container reports for AM
public static List<NMContainerStatus> createNMContainerStatusForApp(MockAM am) {
    List<NMContainerStatus> list = new ArrayList<NMContainerStatus>();
    NMContainerStatus amContainer = TestRMRestart.createNMContainerStatus(am.getApplicationAttemptId(), 1, ContainerState.RUNNING);
    NMContainerStatus runningContainer = TestRMRestart.createNMContainerStatus(am.getApplicationAttemptId(), 2, ContainerState.RUNNING);
    NMContainerStatus completedContainer = TestRMRestart.createNMContainerStatus(am.getApplicationAttemptId(), 3, ContainerState.COMPLETE);
    list.add(amContainer);
    list.add(runningContainer);
    list.add(completedContainer);
    return list;
}
Also used : NMContainerStatus(org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus) ArrayList(java.util.ArrayList)

Example 4 with NMContainerStatus

use of org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus in project hadoop by apache.

the class TestApplicationCleanup method createNMContainerStatus.

public static NMContainerStatus createNMContainerStatus(ApplicationAttemptId appAttemptId, int id, ContainerState containerState, int memory) {
    ContainerId containerId = ContainerId.newContainerId(appAttemptId, id);
    NMContainerStatus containerReport = NMContainerStatus.newInstance(containerId, 0, containerState, Resource.newInstance(memory, 1), "recover container", 0, Priority.newInstance(0), 0);
    return containerReport;
}
Also used : ContainerId(org.apache.hadoop.yarn.api.records.ContainerId) NMContainerStatus(org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus)

Example 5 with NMContainerStatus

use of org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus in project hadoop by apache.

the class TestApplicationCleanup method testProcessingNMContainerStatusesOnNMRestart.

// The test verifies processing of NMContainerStatuses which are sent during
// NM registration.
// 1. Start the cluster-RM,NM,Submit app with 1024MB,Launch & register AM
// 2. AM sends ResourceRequest for 1 container with memory 2048MB.
// 3. Verify for number of container allocated by RM
// 4. Verify Memory Usage by cluster, it should be 3072. AM memory + requested
// memory. 1024 + 2048=3072
// 5. Re-register NM by sending completed container status
// 6. Verify for Memory Used, it should be 1024
// 7. Send AM heatbeat to RM. Allocated response should contain completed
// container.
@Test(timeout = 60000)
public void testProcessingNMContainerStatusesOnNMRestart() throws Exception {
    conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, 1);
    MemoryRMStateStore memStore = new MemoryRMStateStore();
    memStore.init(conf);
    // 1. Start the cluster-RM,NM,Submit app with 1024MB,Launch & register AM
    MockRM rm1 = new MockRM(conf, memStore);
    rm1.start();
    int nmMemory = 8192;
    int amMemory = 1024;
    int containerMemory = 2048;
    MockNM nm1 = new MockNM("127.0.0.1:1234", nmMemory, rm1.getResourceTrackerService());
    nm1.registerNode();
    RMApp app0 = rm1.submitApp(amMemory);
    MockAM am0 = MockRM.launchAndRegisterAM(app0, rm1, nm1);
    // 2. AM sends ResourceRequest for 1 container with memory 2048MB.
    int noOfContainers = 1;
    List<Container> allocateContainers = am0.allocateAndWaitForContainers(noOfContainers, containerMemory, nm1);
    // 3. Verify for number of container allocated by RM
    Assert.assertEquals(noOfContainers, allocateContainers.size());
    Container container = allocateContainers.get(0);
    nm1.nodeHeartbeat(am0.getApplicationAttemptId(), 1, ContainerState.RUNNING);
    nm1.nodeHeartbeat(am0.getApplicationAttemptId(), container.getId().getContainerId(), ContainerState.RUNNING);
    rm1.waitForState(app0.getApplicationId(), RMAppState.RUNNING);
    // 4. Verify Memory Usage by cluster, it should be 3072. AM memory +
    // requested memory. 1024 + 2048=3072
    ResourceScheduler rs = rm1.getRMContext().getScheduler();
    long allocatedMB = rs.getRootQueueMetrics().getAllocatedMB();
    Assert.assertEquals(amMemory + containerMemory, allocatedMB);
    // 5. Re-register NM by sending completed container status
    List<NMContainerStatus> nMContainerStatusForApp = createNMContainerStatusForApp(am0);
    nm1.registerNode(nMContainerStatusForApp, Arrays.asList(app0.getApplicationId()));
    waitForClusterMemory(nm1, rs, amMemory);
    // 6. Verify for Memory Used, it should be 1024
    Assert.assertEquals(amMemory, rs.getRootQueueMetrics().getAllocatedMB());
    // 7. Send AM heatbeat to RM. Allocated response should contain completed
    // container
    AllocateRequest req = AllocateRequest.newInstance(0, 0F, new ArrayList<ResourceRequest>(), new ArrayList<ContainerId>(), null);
    AllocateResponse allocate = am0.allocate(req);
    List<ContainerStatus> completedContainersStatuses = allocate.getCompletedContainersStatuses();
    Assert.assertEquals(noOfContainers, completedContainersStatuses.size());
    // Application clean up should happen Cluster memory used is 0
    nm1.nodeHeartbeat(am0.getApplicationAttemptId(), 1, ContainerState.COMPLETE);
    waitForClusterMemory(nm1, rs, 0);
    rm1.stop();
}
Also used : RMApp(org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp) AllocateRequest(org.apache.hadoop.yarn.api.protocolrecords.AllocateRequest) AllocateResponse(org.apache.hadoop.yarn.api.protocolrecords.AllocateResponse) Container(org.apache.hadoop.yarn.api.records.Container) NMContainerStatus(org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus) ContainerStatus(org.apache.hadoop.yarn.api.records.ContainerStatus) MemoryRMStateStore(org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore) ContainerId(org.apache.hadoop.yarn.api.records.ContainerId) NMContainerStatus(org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus) ResourceScheduler(org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceScheduler) ResourceRequest(org.apache.hadoop.yarn.api.records.ResourceRequest) Test(org.junit.Test)

Aggregations

NMContainerStatus (org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus)39 RMApp (org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp)24 Test (org.junit.Test)24 MemoryRMStateStore (org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore)22 ContainerId (org.apache.hadoop.yarn.api.records.ContainerId)17 ApplicationId (org.apache.hadoop.yarn.api.records.ApplicationId)11 TestSecurityMockRM (org.apache.hadoop.yarn.server.resourcemanager.TestRMRestart.TestSecurityMockRM)11 ArrayList (java.util.ArrayList)10 RMAppAttempt (org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt)10 AbstractYarnScheduler (org.apache.hadoop.yarn.server.resourcemanager.scheduler.AbstractYarnScheduler)9 ApplicationAttemptId (org.apache.hadoop.yarn.api.records.ApplicationAttemptId)8 Container (org.apache.hadoop.yarn.api.records.Container)8 Resource (org.apache.hadoop.yarn.api.records.Resource)7 ContainerStatus (org.apache.hadoop.yarn.api.records.ContainerStatus)6 YarnConfiguration (org.apache.hadoop.yarn.conf.YarnConfiguration)6 MockNM (org.apache.hadoop.yarn.server.resourcemanager.MockNM)6 ResourceRequest (org.apache.hadoop.yarn.api.records.ResourceRequest)4 NodeHeartbeatResponse (org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse)4 ApplicationStateData (org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationStateData)4 RMNodeImpl (org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeImpl)4