use of org.apache.hadoop.yarn.server.resourcemanager.scheduler.AbstractYarnScheduler in project hadoop by apache.
the class TestWorkPreservingRMRestart method testDynamicQueueRecovery.
// Test work preserving recovery of apps running under reservation.
// This involves:
// 1. Setting up a dynamic reservable queue,
// 2. Submitting an app to it,
// 3. Failing over RM,
// 4. Validating that the app is recovered post failover,
// 5. Check if all running containers are recovered,
// 6. Verify the scheduler state like attempt info,
// 7. Verify the queue/user metrics for the dynamic reservable queue.
@Test(timeout = 30000)
public void testDynamicQueueRecovery() throws Exception {
conf.setBoolean(CapacitySchedulerConfiguration.ENABLE_USER_METRICS, true);
conf.set(CapacitySchedulerConfiguration.RESOURCE_CALCULATOR_CLASS, DominantResourceCalculator.class.getName());
// 1. Set up dynamic reservable queue.
Configuration schedulerConf = getSchedulerDynamicConfiguration();
int containerMemory = 1024;
Resource containerResource = Resource.newInstance(containerMemory, 1);
MemoryRMStateStore memStore = new MemoryRMStateStore();
memStore.init(schedulerConf);
rm1 = new MockRM(schedulerConf, memStore);
rm1.start();
MockNM nm1 = new MockNM("127.0.0.1:1234", 8192, rm1.getResourceTrackerService());
nm1.registerNode();
// 2. Run plan follower to update the added node & then submit app to
// dynamic queue.
rm1.getRMContext().getReservationSystem().synchronizePlan(ReservationSystemTestUtil.reservationQ, true);
RMApp app1 = rm1.submitApp(200, "dynamicQApp", UserGroupInformation.getCurrentUser().getShortUserName(), null, ReservationSystemTestUtil.getReservationQueueName());
MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
// clear queue metrics
rm1.clearQueueMetrics(app1);
// 3. Fail over (restart) RM.
rm2 = new MockRM(schedulerConf, memStore);
rm2.start();
nm1.setResourceTrackerService(rm2.getResourceTrackerService());
// 4. Validate app is recovered post failover.
RMApp recoveredApp1 = rm2.getRMContext().getRMApps().get(app1.getApplicationId());
RMAppAttempt loadedAttempt1 = recoveredApp1.getCurrentAppAttempt();
NMContainerStatus amContainer = TestRMRestart.createNMContainerStatus(am1.getApplicationAttemptId(), 1, ContainerState.RUNNING);
NMContainerStatus runningContainer = TestRMRestart.createNMContainerStatus(am1.getApplicationAttemptId(), 2, ContainerState.RUNNING);
NMContainerStatus completedContainer = TestRMRestart.createNMContainerStatus(am1.getApplicationAttemptId(), 3, ContainerState.COMPLETE);
nm1.registerNode(Arrays.asList(amContainer, runningContainer, completedContainer), null);
// Wait for RM to settle down on recovering containers.
waitForNumContainersToRecover(2, rm2, am1.getApplicationAttemptId());
Set<ContainerId> launchedContainers = ((RMNodeImpl) rm2.getRMContext().getRMNodes().get(nm1.getNodeId())).getLaunchedContainers();
assertTrue(launchedContainers.contains(amContainer.getContainerId()));
assertTrue(launchedContainers.contains(runningContainer.getContainerId()));
// 5. Check RMContainers are re-recreated and the container state is
// correct.
rm2.waitForState(nm1, amContainer.getContainerId(), RMContainerState.RUNNING);
rm2.waitForState(nm1, runningContainer.getContainerId(), RMContainerState.RUNNING);
rm2.waitForContainerToComplete(loadedAttempt1, completedContainer);
AbstractYarnScheduler scheduler = (AbstractYarnScheduler) rm2.getResourceScheduler();
SchedulerNode schedulerNode1 = scheduler.getSchedulerNode(nm1.getNodeId());
// ********* check scheduler node state.*******
// 2 running containers.
Resource usedResources = Resources.multiply(containerResource, 2);
Resource nmResource = Resource.newInstance(nm1.getMemory(), nm1.getvCores());
assertTrue(schedulerNode1.isValidContainer(amContainer.getContainerId()));
assertTrue(schedulerNode1.isValidContainer(runningContainer.getContainerId()));
assertFalse(schedulerNode1.isValidContainer(completedContainer.getContainerId()));
// 2 launched containers, 1 completed container
assertEquals(2, schedulerNode1.getNumContainers());
assertEquals(Resources.subtract(nmResource, usedResources), schedulerNode1.getUnallocatedResource());
assertEquals(usedResources, schedulerNode1.getAllocatedResource());
Resource availableResources = Resources.subtract(nmResource, usedResources);
// 6. Verify the scheduler state like attempt info.
Map<ApplicationId, SchedulerApplication<SchedulerApplicationAttempt>> sa = ((AbstractYarnScheduler) rm2.getResourceScheduler()).getSchedulerApplications();
SchedulerApplication<SchedulerApplicationAttempt> schedulerApp = sa.get(recoveredApp1.getApplicationId());
// 7. Verify the queue/user metrics for the dynamic reservable queue.
if (getSchedulerType() == SchedulerType.CAPACITY) {
checkCSQueue(rm2, schedulerApp, nmResource, nmResource, usedResources, 2);
} else {
checkFSQueue(rm2, schedulerApp, usedResources, availableResources);
}
// *********** check scheduler attempt state.********
SchedulerApplicationAttempt schedulerAttempt = schedulerApp.getCurrentAppAttempt();
assertTrue(schedulerAttempt.getLiveContainers().contains(scheduler.getRMContainer(amContainer.getContainerId())));
assertTrue(schedulerAttempt.getLiveContainers().contains(scheduler.getRMContainer(runningContainer.getContainerId())));
assertEquals(schedulerAttempt.getCurrentConsumption(), usedResources);
// *********** check appSchedulingInfo state ***********
assertEquals((1L << 40) + 1L, schedulerAttempt.getNewContainerId());
}
use of org.apache.hadoop.yarn.server.resourcemanager.scheduler.AbstractYarnScheduler in project hadoop by apache.
the class TestNMReconnect method testCompareRMNodeAfterReconnect.
@Test
public void testCompareRMNodeAfterReconnect() throws Exception {
AbstractYarnScheduler scheduler = getScheduler();
Configuration yarnConf = new YarnConfiguration();
ConfigurationProvider configurationProvider = ConfigurationProviderFactory.getConfigurationProvider(yarnConf);
configurationProvider.init(yarnConf);
context.setConfigurationProvider(configurationProvider);
RMNodeLabelsManager nlm = new RMNodeLabelsManager();
nlm.init(yarnConf);
nlm.start();
context.setNodeLabelManager(nlm);
scheduler.setRMContext(context);
scheduler.init(yarnConf);
scheduler.start();
dispatcher.register(SchedulerEventType.class, scheduler);
String hostname1 = "localhost1";
Resource capability = BuilderUtils.newResource(4096, 4);
RegisterNodeManagerRequest request1 = recordFactory.newRecordInstance(RegisterNodeManagerRequest.class);
NodeId nodeId1 = NodeId.newInstance(hostname1, 0);
request1.setNodeId(nodeId1);
request1.setHttpPort(0);
request1.setResource(capability);
resourceTrackerService.registerNodeManager(request1);
Assert.assertNotNull(context.getRMNodes().get(nodeId1));
// verify Scheduler and RMContext use same RMNode reference.
Assert.assertTrue(scheduler.getSchedulerNode(nodeId1).getRMNode() == context.getRMNodes().get(nodeId1));
Assert.assertEquals(context.getRMNodes().get(nodeId1).getTotalCapability(), capability);
Resource capability1 = BuilderUtils.newResource(2048, 2);
request1.setResource(capability1);
resourceTrackerService.registerNodeManager(request1);
Assert.assertNotNull(context.getRMNodes().get(nodeId1));
// verify Scheduler and RMContext use same RMNode reference
// after reconnect.
Assert.assertTrue(scheduler.getSchedulerNode(nodeId1).getRMNode() == context.getRMNodes().get(nodeId1));
// verify RMNode's capability is changed.
Assert.assertEquals(context.getRMNodes().get(nodeId1).getTotalCapability(), capability1);
nlm.stop();
scheduler.stop();
}
use of org.apache.hadoop.yarn.server.resourcemanager.scheduler.AbstractYarnScheduler in project hadoop by apache.
the class TestWorkPreservingRMRestart method testUAMRecoveryOnRMWorkPreservingRestart.
@Test(timeout = 600000)
public void testUAMRecoveryOnRMWorkPreservingRestart() throws Exception {
conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, 1);
MemoryRMStateStore memStore = new MemoryRMStateStore();
memStore.init(conf);
// start RM
rm1 = new MockRM(conf, memStore);
rm1.start();
MockNM nm1 = new MockNM("127.0.0.1:1234", 15120, rm1.getResourceTrackerService());
nm1.registerNode();
// create app and launch the UAM
RMApp app0 = rm1.submitApp(200, true);
MockAM am0 = MockRM.launchUAM(app0, rm1, nm1);
am0.registerAppAttempt();
// Allocate containers to UAM
int numContainers = 2;
am0.allocate("127.0.0.1", 1000, numContainers, new ArrayList<ContainerId>());
nm1.nodeHeartbeat(true);
List<Container> conts = am0.allocate(new ArrayList<ResourceRequest>(), new ArrayList<ContainerId>()).getAllocatedContainers();
Assert.assertTrue(conts.isEmpty());
while (conts.size() == 0) {
nm1.nodeHeartbeat(true);
conts.addAll(am0.allocate(new ArrayList<ResourceRequest>(), new ArrayList<ContainerId>()).getAllocatedContainers());
Thread.sleep(500);
}
Assert.assertFalse(conts.isEmpty());
// start new RM
rm2 = new MockRM(conf, memStore);
rm2.start();
rm2.waitForState(app0.getApplicationId(), RMAppState.ACCEPTED);
rm2.waitForState(am0.getApplicationAttemptId(), RMAppAttemptState.LAUNCHED);
// recover app
nm1.setResourceTrackerService(rm2.getResourceTrackerService());
RMApp recoveredApp = rm2.getRMContext().getRMApps().get(app0.getApplicationId());
NMContainerStatus container1 = TestRMRestart.createNMContainerStatus(am0.getApplicationAttemptId(), 1, ContainerState.RUNNING);
NMContainerStatus container2 = TestRMRestart.createNMContainerStatus(am0.getApplicationAttemptId(), 2, ContainerState.RUNNING);
nm1.registerNode(Arrays.asList(container1, container2), null);
// Wait for RM to settle down on recovering containers;
waitForNumContainersToRecover(2, rm2, am0.getApplicationAttemptId());
// retry registerApplicationMaster() after RM restart.
am0.setAMRMProtocol(rm2.getApplicationMasterService(), rm2.getRMContext());
am0.registerAppAttempt(true);
// Check if UAM is correctly recovered on restart
rm2.waitForState(app0.getApplicationId(), RMAppState.RUNNING);
rm2.waitForState(am0.getApplicationAttemptId(), RMAppAttemptState.RUNNING);
// Check if containers allocated to UAM are recovered
Map<ApplicationId, SchedulerApplication> schedulerApps = ((AbstractYarnScheduler) rm2.getResourceScheduler()).getSchedulerApplications();
SchedulerApplication schedulerApp = schedulerApps.get(recoveredApp.getApplicationId());
SchedulerApplicationAttempt schedulerAttempt = schedulerApp.getCurrentAppAttempt();
Assert.assertEquals(numContainers, schedulerAttempt.getLiveContainers().size());
// Check if UAM is able to heart beat
Assert.assertNotNull(am0.doHeartbeat());
// Complete the UAM
am0.unregisterAppAttempt(false);
rm2.waitForState(am0.getApplicationAttemptId(), RMAppAttemptState.FINISHED);
rm2.waitForState(app0.getApplicationId(), RMAppState.FINISHED);
Assert.assertEquals(FinalApplicationStatus.SUCCEEDED, recoveredApp.getFinalApplicationStatus());
// Restart RM once more to check UAM is not re-run
MockRM rm3 = new MockRM(conf, memStore);
rm3.start();
recoveredApp = rm3.getRMContext().getRMApps().get(app0.getApplicationId());
Assert.assertEquals(RMAppState.FINISHED, recoveredApp.getState());
}
use of org.apache.hadoop.yarn.server.resourcemanager.scheduler.AbstractYarnScheduler in project hadoop by apache.
the class TestWorkPreservingRMRestart method testSchedulerRecovery.
// Test common scheduler state including SchedulerAttempt, SchedulerNode,
// AppSchedulingInfo can be reconstructed via the container recovery reports
// on NM re-registration.
// Also test scheduler specific changes: i.e. Queue recovery-
// CSQueue/FSQueue/FifoQueue recovery respectively.
// Test Strategy: send 3 container recovery reports(AMContainer, running
// container, completed container) on NM re-registration, check the states of
// SchedulerAttempt, SchedulerNode etc. are updated accordingly.
@Test(timeout = 20000)
public void testSchedulerRecovery() throws Exception {
conf.setBoolean(CapacitySchedulerConfiguration.ENABLE_USER_METRICS, true);
conf.set(CapacitySchedulerConfiguration.RESOURCE_CALCULATOR_CLASS, DominantResourceCalculator.class.getName());
int containerMemory = 1024;
Resource containerResource = Resource.newInstance(containerMemory, 1);
MemoryRMStateStore memStore = new MemoryRMStateStore();
memStore.init(conf);
rm1 = new MockRM(conf, memStore);
rm1.start();
MockNM nm1 = new MockNM("127.0.0.1:1234", 8192, rm1.getResourceTrackerService());
nm1.registerNode();
RMApp app1 = rm1.submitApp(200);
MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
// clear queue metrics
rm1.clearQueueMetrics(app1);
// Re-start RM
rm2 = new MockRM(conf, memStore);
rm2.start();
nm1.setResourceTrackerService(rm2.getResourceTrackerService());
// recover app
RMApp recoveredApp1 = rm2.getRMContext().getRMApps().get(app1.getApplicationId());
RMAppAttempt loadedAttempt1 = recoveredApp1.getCurrentAppAttempt();
NMContainerStatus amContainer = TestRMRestart.createNMContainerStatus(am1.getApplicationAttemptId(), 1, ContainerState.RUNNING);
NMContainerStatus runningContainer = TestRMRestart.createNMContainerStatus(am1.getApplicationAttemptId(), 2, ContainerState.RUNNING);
NMContainerStatus completedContainer = TestRMRestart.createNMContainerStatus(am1.getApplicationAttemptId(), 3, ContainerState.COMPLETE);
nm1.registerNode(Arrays.asList(amContainer, runningContainer, completedContainer), null);
// Wait for RM to settle down on recovering containers;
waitForNumContainersToRecover(2, rm2, am1.getApplicationAttemptId());
Set<ContainerId> launchedContainers = ((RMNodeImpl) rm2.getRMContext().getRMNodes().get(nm1.getNodeId())).getLaunchedContainers();
assertTrue(launchedContainers.contains(amContainer.getContainerId()));
assertTrue(launchedContainers.contains(runningContainer.getContainerId()));
// check RMContainers are re-recreated and the container state is correct.
rm2.waitForState(nm1, amContainer.getContainerId(), RMContainerState.RUNNING);
rm2.waitForState(nm1, runningContainer.getContainerId(), RMContainerState.RUNNING);
rm2.waitForContainerToComplete(loadedAttempt1, completedContainer);
AbstractYarnScheduler scheduler = (AbstractYarnScheduler) rm2.getResourceScheduler();
SchedulerNode schedulerNode1 = scheduler.getSchedulerNode(nm1.getNodeId());
assertTrue("SchedulerNode#toString is not in expected format", schedulerNode1.toString().contains(schedulerNode1.getUnallocatedResource().toString()));
assertTrue("SchedulerNode#toString is not in expected format", schedulerNode1.toString().contains(schedulerNode1.getAllocatedResource().toString()));
// ********* check scheduler node state.*******
// 2 running containers.
Resource usedResources = Resources.multiply(containerResource, 2);
Resource nmResource = Resource.newInstance(nm1.getMemory(), nm1.getvCores());
assertTrue(schedulerNode1.isValidContainer(amContainer.getContainerId()));
assertTrue(schedulerNode1.isValidContainer(runningContainer.getContainerId()));
assertFalse(schedulerNode1.isValidContainer(completedContainer.getContainerId()));
// 2 launched containers, 1 completed container
assertEquals(2, schedulerNode1.getNumContainers());
assertEquals(Resources.subtract(nmResource, usedResources), schedulerNode1.getUnallocatedResource());
assertEquals(usedResources, schedulerNode1.getAllocatedResource());
Resource availableResources = Resources.subtract(nmResource, usedResources);
// ***** check queue state based on the underlying scheduler ********
Map<ApplicationId, SchedulerApplication> schedulerApps = ((AbstractYarnScheduler) rm2.getResourceScheduler()).getSchedulerApplications();
SchedulerApplication schedulerApp = schedulerApps.get(recoveredApp1.getApplicationId());
if (getSchedulerType() == SchedulerType.CAPACITY) {
checkCSQueue(rm2, schedulerApp, nmResource, nmResource, usedResources, 2);
} else {
checkFSQueue(rm2, schedulerApp, usedResources, availableResources);
}
// *********** check scheduler attempt state.********
SchedulerApplicationAttempt schedulerAttempt = schedulerApp.getCurrentAppAttempt();
assertTrue(schedulerAttempt.getLiveContainers().contains(scheduler.getRMContainer(amContainer.getContainerId())));
assertTrue(schedulerAttempt.getLiveContainers().contains(scheduler.getRMContainer(runningContainer.getContainerId())));
assertEquals(schedulerAttempt.getCurrentConsumption(), usedResources);
// *********** check appSchedulingInfo state ***********
assertEquals((1L << 40) + 1L, schedulerAttempt.getNewContainerId());
}
use of org.apache.hadoop.yarn.server.resourcemanager.scheduler.AbstractYarnScheduler in project hadoop by apache.
the class TestFifoScheduler method testAddAndRemoveAppFromFiFoScheduler.
@Test
public void testAddAndRemoveAppFromFiFoScheduler() throws Exception {
Configuration conf = new Configuration();
conf.setClass(YarnConfiguration.RM_SCHEDULER, FifoScheduler.class, ResourceScheduler.class);
MockRM rm = new MockRM(conf);
@SuppressWarnings("unchecked") AbstractYarnScheduler<SchedulerApplicationAttempt, SchedulerNode> fs = (AbstractYarnScheduler<SchedulerApplicationAttempt, SchedulerNode>) rm.getResourceScheduler();
TestSchedulerUtils.verifyAppAddedAndRemovedFromScheduler(fs.getSchedulerApplications(), fs, "queue");
}
Aggregations