use of org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus in project hadoop by apache.
the class TestWorkPreservingRMRestart method testDynamicQueueRecovery.
// Test work preserving recovery of apps running under reservation.
// This involves:
// 1. Setting up a dynamic reservable queue,
// 2. Submitting an app to it,
// 3. Failing over RM,
// 4. Validating that the app is recovered post failover,
// 5. Check if all running containers are recovered,
// 6. Verify the scheduler state like attempt info,
// 7. Verify the queue/user metrics for the dynamic reservable queue.
@Test(timeout = 30000)
public void testDynamicQueueRecovery() throws Exception {
conf.setBoolean(CapacitySchedulerConfiguration.ENABLE_USER_METRICS, true);
conf.set(CapacitySchedulerConfiguration.RESOURCE_CALCULATOR_CLASS, DominantResourceCalculator.class.getName());
// 1. Set up dynamic reservable queue.
Configuration schedulerConf = getSchedulerDynamicConfiguration();
int containerMemory = 1024;
Resource containerResource = Resource.newInstance(containerMemory, 1);
MemoryRMStateStore memStore = new MemoryRMStateStore();
memStore.init(schedulerConf);
rm1 = new MockRM(schedulerConf, memStore);
rm1.start();
MockNM nm1 = new MockNM("127.0.0.1:1234", 8192, rm1.getResourceTrackerService());
nm1.registerNode();
// 2. Run plan follower to update the added node & then submit app to
// dynamic queue.
rm1.getRMContext().getReservationSystem().synchronizePlan(ReservationSystemTestUtil.reservationQ, true);
RMApp app1 = rm1.submitApp(200, "dynamicQApp", UserGroupInformation.getCurrentUser().getShortUserName(), null, ReservationSystemTestUtil.getReservationQueueName());
MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
// clear queue metrics
rm1.clearQueueMetrics(app1);
// 3. Fail over (restart) RM.
rm2 = new MockRM(schedulerConf, memStore);
rm2.start();
nm1.setResourceTrackerService(rm2.getResourceTrackerService());
// 4. Validate app is recovered post failover.
RMApp recoveredApp1 = rm2.getRMContext().getRMApps().get(app1.getApplicationId());
RMAppAttempt loadedAttempt1 = recoveredApp1.getCurrentAppAttempt();
NMContainerStatus amContainer = TestRMRestart.createNMContainerStatus(am1.getApplicationAttemptId(), 1, ContainerState.RUNNING);
NMContainerStatus runningContainer = TestRMRestart.createNMContainerStatus(am1.getApplicationAttemptId(), 2, ContainerState.RUNNING);
NMContainerStatus completedContainer = TestRMRestart.createNMContainerStatus(am1.getApplicationAttemptId(), 3, ContainerState.COMPLETE);
nm1.registerNode(Arrays.asList(amContainer, runningContainer, completedContainer), null);
// Wait for RM to settle down on recovering containers.
waitForNumContainersToRecover(2, rm2, am1.getApplicationAttemptId());
Set<ContainerId> launchedContainers = ((RMNodeImpl) rm2.getRMContext().getRMNodes().get(nm1.getNodeId())).getLaunchedContainers();
assertTrue(launchedContainers.contains(amContainer.getContainerId()));
assertTrue(launchedContainers.contains(runningContainer.getContainerId()));
// 5. Check RMContainers are re-recreated and the container state is
// correct.
rm2.waitForState(nm1, amContainer.getContainerId(), RMContainerState.RUNNING);
rm2.waitForState(nm1, runningContainer.getContainerId(), RMContainerState.RUNNING);
rm2.waitForContainerToComplete(loadedAttempt1, completedContainer);
AbstractYarnScheduler scheduler = (AbstractYarnScheduler) rm2.getResourceScheduler();
SchedulerNode schedulerNode1 = scheduler.getSchedulerNode(nm1.getNodeId());
// ********* check scheduler node state.*******
// 2 running containers.
Resource usedResources = Resources.multiply(containerResource, 2);
Resource nmResource = Resource.newInstance(nm1.getMemory(), nm1.getvCores());
assertTrue(schedulerNode1.isValidContainer(amContainer.getContainerId()));
assertTrue(schedulerNode1.isValidContainer(runningContainer.getContainerId()));
assertFalse(schedulerNode1.isValidContainer(completedContainer.getContainerId()));
// 2 launched containers, 1 completed container
assertEquals(2, schedulerNode1.getNumContainers());
assertEquals(Resources.subtract(nmResource, usedResources), schedulerNode1.getUnallocatedResource());
assertEquals(usedResources, schedulerNode1.getAllocatedResource());
Resource availableResources = Resources.subtract(nmResource, usedResources);
// 6. Verify the scheduler state like attempt info.
Map<ApplicationId, SchedulerApplication<SchedulerApplicationAttempt>> sa = ((AbstractYarnScheduler) rm2.getResourceScheduler()).getSchedulerApplications();
SchedulerApplication<SchedulerApplicationAttempt> schedulerApp = sa.get(recoveredApp1.getApplicationId());
// 7. Verify the queue/user metrics for the dynamic reservable queue.
if (getSchedulerType() == SchedulerType.CAPACITY) {
checkCSQueue(rm2, schedulerApp, nmResource, nmResource, usedResources, 2);
} else {
checkFSQueue(rm2, schedulerApp, usedResources, availableResources);
}
// *********** check scheduler attempt state.********
SchedulerApplicationAttempt schedulerAttempt = schedulerApp.getCurrentAppAttempt();
assertTrue(schedulerAttempt.getLiveContainers().contains(scheduler.getRMContainer(amContainer.getContainerId())));
assertTrue(schedulerAttempt.getLiveContainers().contains(scheduler.getRMContainer(runningContainer.getContainerId())));
assertEquals(schedulerAttempt.getCurrentConsumption(), usedResources);
// *********** check appSchedulingInfo state ***********
assertEquals((1L << 40) + 1L, schedulerAttempt.getNewContainerId());
}
use of org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus in project hadoop by apache.
the class TestWorkPreservingRMRestart method testAppFailedToRenewTokenOnRecovery.
@Test(timeout = 30000)
public void testAppFailedToRenewTokenOnRecovery() throws Exception {
conf.set(CommonConfigurationKeysPublic.HADOOP_SECURITY_AUTHENTICATION, "kerberos");
conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, 1);
UserGroupInformation.setConfiguration(conf);
MemoryRMStateStore memStore = new MemoryRMStateStore();
memStore.init(conf);
MockRM rm1 = new TestSecurityMockRM(conf, memStore);
rm1.start();
MockNM nm1 = new MockNM("127.0.0.1:1234", 8192, rm1.getResourceTrackerService());
nm1.registerNode();
RMApp app1 = rm1.submitApp(200);
MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
MockRM rm2 = new TestSecurityMockRM(conf, memStore) {
protected DelegationTokenRenewer createDelegationTokenRenewer() {
return new DelegationTokenRenewer() {
@Override
public void addApplicationSync(ApplicationId applicationId, Credentials ts, boolean shouldCancelAtEnd, String user) throws IOException {
throw new IOException("Token renew failed !!");
}
};
}
};
nm1.setResourceTrackerService(rm2.getResourceTrackerService());
rm2.start();
NMContainerStatus containerStatus = TestRMRestart.createNMContainerStatus(am1.getApplicationAttemptId(), 1, ContainerState.RUNNING);
nm1.registerNode(Arrays.asList(containerStatus), null);
// am re-register
rm2.waitForState(app1.getApplicationId(), RMAppState.ACCEPTED);
am1.setAMRMProtocol(rm2.getApplicationMasterService(), rm2.getRMContext());
am1.registerAppAttempt(true);
rm2.waitForState(app1.getApplicationId(), RMAppState.RUNNING);
// Because the token expired, am could crash.
nm1.nodeHeartbeat(am1.getApplicationAttemptId(), 1, ContainerState.COMPLETE);
rm2.waitForState(am1.getApplicationAttemptId(), RMAppAttemptState.FAILED);
rm2.waitForState(app1.getApplicationId(), RMAppState.FAILED);
}
use of org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus in project hadoop by apache.
the class TestWorkPreservingRMRestart method createNMContainerStatusForApp.
// create 3 container reports for AM
public static List<NMContainerStatus> createNMContainerStatusForApp(MockAM am) {
List<NMContainerStatus> list = new ArrayList<NMContainerStatus>();
NMContainerStatus amContainer = TestRMRestart.createNMContainerStatus(am.getApplicationAttemptId(), 1, ContainerState.RUNNING);
NMContainerStatus runningContainer = TestRMRestart.createNMContainerStatus(am.getApplicationAttemptId(), 2, ContainerState.RUNNING);
NMContainerStatus completedContainer = TestRMRestart.createNMContainerStatus(am.getApplicationAttemptId(), 3, ContainerState.COMPLETE);
list.add(amContainer);
list.add(runningContainer);
list.add(completedContainer);
return list;
}
use of org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus in project hadoop by apache.
the class TestApplicationCleanup method createNMContainerStatus.
public static NMContainerStatus createNMContainerStatus(ApplicationAttemptId appAttemptId, int id, ContainerState containerState, int memory) {
ContainerId containerId = ContainerId.newContainerId(appAttemptId, id);
NMContainerStatus containerReport = NMContainerStatus.newInstance(containerId, 0, containerState, Resource.newInstance(memory, 1), "recover container", 0, Priority.newInstance(0), 0);
return containerReport;
}
use of org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus in project hadoop by apache.
the class TestApplicationCleanup method testProcessingNMContainerStatusesOnNMRestart.
// The test verifies processing of NMContainerStatuses which are sent during
// NM registration.
// 1. Start the cluster-RM,NM,Submit app with 1024MB,Launch & register AM
// 2. AM sends ResourceRequest for 1 container with memory 2048MB.
// 3. Verify for number of container allocated by RM
// 4. Verify Memory Usage by cluster, it should be 3072. AM memory + requested
// memory. 1024 + 2048=3072
// 5. Re-register NM by sending completed container status
// 6. Verify for Memory Used, it should be 1024
// 7. Send AM heatbeat to RM. Allocated response should contain completed
// container.
@Test(timeout = 60000)
public void testProcessingNMContainerStatusesOnNMRestart() throws Exception {
conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, 1);
MemoryRMStateStore memStore = new MemoryRMStateStore();
memStore.init(conf);
// 1. Start the cluster-RM,NM,Submit app with 1024MB,Launch & register AM
MockRM rm1 = new MockRM(conf, memStore);
rm1.start();
int nmMemory = 8192;
int amMemory = 1024;
int containerMemory = 2048;
MockNM nm1 = new MockNM("127.0.0.1:1234", nmMemory, rm1.getResourceTrackerService());
nm1.registerNode();
RMApp app0 = rm1.submitApp(amMemory);
MockAM am0 = MockRM.launchAndRegisterAM(app0, rm1, nm1);
// 2. AM sends ResourceRequest for 1 container with memory 2048MB.
int noOfContainers = 1;
List<Container> allocateContainers = am0.allocateAndWaitForContainers(noOfContainers, containerMemory, nm1);
// 3. Verify for number of container allocated by RM
Assert.assertEquals(noOfContainers, allocateContainers.size());
Container container = allocateContainers.get(0);
nm1.nodeHeartbeat(am0.getApplicationAttemptId(), 1, ContainerState.RUNNING);
nm1.nodeHeartbeat(am0.getApplicationAttemptId(), container.getId().getContainerId(), ContainerState.RUNNING);
rm1.waitForState(app0.getApplicationId(), RMAppState.RUNNING);
// 4. Verify Memory Usage by cluster, it should be 3072. AM memory +
// requested memory. 1024 + 2048=3072
ResourceScheduler rs = rm1.getRMContext().getScheduler();
long allocatedMB = rs.getRootQueueMetrics().getAllocatedMB();
Assert.assertEquals(amMemory + containerMemory, allocatedMB);
// 5. Re-register NM by sending completed container status
List<NMContainerStatus> nMContainerStatusForApp = createNMContainerStatusForApp(am0);
nm1.registerNode(nMContainerStatusForApp, Arrays.asList(app0.getApplicationId()));
waitForClusterMemory(nm1, rs, amMemory);
// 6. Verify for Memory Used, it should be 1024
Assert.assertEquals(amMemory, rs.getRootQueueMetrics().getAllocatedMB());
// 7. Send AM heatbeat to RM. Allocated response should contain completed
// container
AllocateRequest req = AllocateRequest.newInstance(0, 0F, new ArrayList<ResourceRequest>(), new ArrayList<ContainerId>(), null);
AllocateResponse allocate = am0.allocate(req);
List<ContainerStatus> completedContainersStatuses = allocate.getCompletedContainersStatuses();
Assert.assertEquals(noOfContainers, completedContainersStatuses.size());
// Application clean up should happen Cluster memory used is 0
nm1.nodeHeartbeat(am0.getApplicationAttemptId(), 1, ContainerState.COMPLETE);
waitForClusterMemory(nm1, rs, 0);
rm1.stop();
}
Aggregations