use of org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler in project hadoop by apache.
the class TestRMWebServices method checkSchedulerLogFileAndCleanup.
private void checkSchedulerLogFileAndCleanup() {
String targetFile;
ResourceScheduler scheduler = rm.getResourceScheduler();
if (scheduler instanceof FairScheduler) {
targetFile = "yarn-fair-scheduler-debug.log";
} else if (scheduler instanceof CapacityScheduler) {
targetFile = "yarn-capacity-scheduler-debug.log";
} else {
targetFile = "yarn-scheduler-debug.log";
}
File logFile = new File(System.getProperty("yarn.log.dir"), targetFile);
assertTrue("scheduler log file doesn't exist", logFile.exists());
FileUtils.deleteQuietly(logFile);
}
use of org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler in project hadoop by apache.
the class TestRMWebApp method mockCapacityScheduler.
public static CapacityScheduler mockCapacityScheduler() throws IOException {
// stolen from TestCapacityScheduler
CapacitySchedulerConfiguration conf = new CapacitySchedulerConfiguration();
setupQueueConfiguration(conf);
CapacityScheduler cs = new CapacityScheduler();
cs.setConf(new YarnConfiguration());
RMContext rmContext = new RMContextImpl(null, null, null, null, null, null, new RMContainerTokenSecretManager(conf), new NMTokenSecretManagerInRM(conf), new ClientToAMTokenSecretManagerInRM(), null);
rmContext.setNodeLabelManager(new NullRMNodeLabelsManager());
cs.setRMContext(rmContext);
cs.init(conf);
return cs;
}
use of org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler in project hadoop by apache.
the class TestAMRestart method testPreemptedAMRestartOnRMRestart.
// Test RM restarts after AM container is preempted, new RM should not count
// AM preemption failure towards the max-retry-account and should be able to
// re-launch the AM.
@Test(timeout = 20000)
public void testPreemptedAMRestartOnRMRestart() throws Exception {
YarnConfiguration conf = new YarnConfiguration();
conf.setClass(YarnConfiguration.RM_SCHEDULER, CapacityScheduler.class, ResourceScheduler.class);
conf.setBoolean(YarnConfiguration.RECOVERY_ENABLED, true);
conf.setBoolean(YarnConfiguration.RM_WORK_PRESERVING_RECOVERY_ENABLED, false);
conf.set(YarnConfiguration.RM_STORE, MemoryRMStateStore.class.getName());
// explicitly set max-am-retry count as 1.
conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, 1);
MemoryRMStateStore memStore = new MemoryRMStateStore();
memStore.init(conf);
MockRM rm1 = new MockRM(conf, memStore);
rm1.start();
MockNM nm1 = new MockNM("127.0.0.1:1234", 8000, rm1.getResourceTrackerService());
nm1.registerNode();
RMApp app1 = rm1.submitApp(200);
RMAppAttempt attempt1 = app1.getCurrentAppAttempt();
MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
CapacityScheduler scheduler = (CapacityScheduler) rm1.getResourceScheduler();
ContainerId amContainer = ContainerId.newContainerId(am1.getApplicationAttemptId(), 1);
// Forcibly preempt the am container;
scheduler.markContainerForKillable(scheduler.getRMContainer(amContainer));
rm1.waitForState(am1.getApplicationAttemptId(), RMAppAttemptState.FAILED);
Assert.assertTrue(!attempt1.shouldCountTowardsMaxAttemptRetry());
rm1.waitForState(app1.getApplicationId(), RMAppState.ACCEPTED);
// state store has 1 attempt stored.
ApplicationStateData appState = memStore.getState().getApplicationState().get(app1.getApplicationId());
Assert.assertEquals(1, appState.getAttemptCount());
// attempt stored has the preempted container exit status.
Assert.assertEquals(ContainerExitStatus.PREEMPTED, appState.getAttempt(am1.getApplicationAttemptId()).getAMContainerExitStatus());
// Restart rm.
MockRM rm2 = new MockRM(conf, memStore);
nm1.setResourceTrackerService(rm2.getResourceTrackerService());
nm1.registerNode();
rm2.start();
// Restarted RM should re-launch the am.
MockAM am2 = rm2.waitForNewAMToLaunchAndRegister(app1.getApplicationId(), 2, nm1);
MockRM.finishAMAndVerifyAppState(app1, rm2, nm1, am2);
RMAppAttempt attempt2 = rm2.getRMContext().getRMApps().get(app1.getApplicationId()).getCurrentAppAttempt();
Assert.assertTrue(attempt2.shouldCountTowardsMaxAttemptRetry());
Assert.assertEquals(ContainerExitStatus.INVALID, appState.getAttempt(am2.getApplicationAttemptId()).getAMContainerExitStatus());
rm1.stop();
rm2.stop();
}
use of org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler in project hadoop by apache.
the class TestAMRestart method testShouldNotCountFailureToMaxAttemptRetry.
// AM container preempted, nm disk failure
// should not be counted towards AM max retry count.
@Test(timeout = 100000)
public void testShouldNotCountFailureToMaxAttemptRetry() throws Exception {
YarnConfiguration conf = new YarnConfiguration();
conf.setClass(YarnConfiguration.RM_SCHEDULER, CapacityScheduler.class, ResourceScheduler.class);
// explicitly set max-am-retry count as 1.
conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, 1);
conf.setBoolean(YarnConfiguration.RECOVERY_ENABLED, true);
conf.set(YarnConfiguration.RM_STORE, MemoryRMStateStore.class.getName());
MemoryRMStateStore memStore = new MemoryRMStateStore();
memStore.init(conf);
MockRM rm1 = new MockRM(conf, memStore);
rm1.start();
MockNM nm1 = new MockNM("127.0.0.1:1234", 8000, rm1.getResourceTrackerService());
nm1.registerNode();
RMApp app1 = rm1.submitApp(200);
RMAppAttempt attempt1 = app1.getCurrentAppAttempt();
MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
CapacityScheduler scheduler = (CapacityScheduler) rm1.getResourceScheduler();
ContainerId amContainer = ContainerId.newContainerId(am1.getApplicationAttemptId(), 1);
// Preempt the first attempt;
scheduler.markContainerForKillable(scheduler.getRMContainer(amContainer));
rm1.waitForState(am1.getApplicationAttemptId(), RMAppAttemptState.FAILED);
TestSchedulerUtils.waitSchedulerApplicationAttemptStopped(scheduler, am1.getApplicationAttemptId());
Assert.assertTrue(!attempt1.shouldCountTowardsMaxAttemptRetry());
rm1.waitForState(app1.getApplicationId(), RMAppState.ACCEPTED);
ApplicationStateData appState = memStore.getState().getApplicationState().get(app1.getApplicationId());
// AM should be restarted even though max-am-attempt is 1.
MockAM am2 = rm1.waitForNewAMToLaunchAndRegister(app1.getApplicationId(), 2, nm1);
RMAppAttempt attempt2 = app1.getCurrentAppAttempt();
// Preempt the second attempt.
ContainerId amContainer2 = ContainerId.newContainerId(am2.getApplicationAttemptId(), 1);
scheduler.markContainerForKillable(scheduler.getRMContainer(amContainer2));
rm1.waitForState(am2.getApplicationAttemptId(), RMAppAttemptState.FAILED);
TestSchedulerUtils.waitSchedulerApplicationAttemptStopped(scheduler, am2.getApplicationAttemptId());
Assert.assertTrue(!attempt2.shouldCountTowardsMaxAttemptRetry());
rm1.waitForState(app1.getApplicationId(), RMAppState.ACCEPTED);
MockAM am3 = rm1.waitForNewAMToLaunchAndRegister(app1.getApplicationId(), 3, nm1);
RMAppAttempt attempt3 = app1.getCurrentAppAttempt();
// mimic NM disk_failure
ContainerStatus containerStatus = Records.newRecord(ContainerStatus.class);
containerStatus.setContainerId(attempt3.getMasterContainer().getId());
containerStatus.setDiagnostics("mimic NM disk_failure");
containerStatus.setState(ContainerState.COMPLETE);
containerStatus.setExitStatus(ContainerExitStatus.DISKS_FAILED);
Map<ApplicationId, List<ContainerStatus>> conts = new HashMap<ApplicationId, List<ContainerStatus>>();
conts.put(app1.getApplicationId(), Collections.singletonList(containerStatus));
nm1.nodeHeartbeat(conts, true);
rm1.waitForState(am3.getApplicationAttemptId(), RMAppAttemptState.FAILED);
TestSchedulerUtils.waitSchedulerApplicationAttemptStopped(scheduler, am3.getApplicationAttemptId());
Assert.assertTrue(!attempt3.shouldCountTowardsMaxAttemptRetry());
Assert.assertEquals(ContainerExitStatus.DISKS_FAILED, appState.getAttempt(am3.getApplicationAttemptId()).getAMContainerExitStatus());
rm1.waitForState(app1.getApplicationId(), RMAppState.ACCEPTED);
MockAM am4 = rm1.waitForNewAMToLaunchAndRegister(app1.getApplicationId(), 4, nm1);
RMAppAttempt attempt4 = app1.getCurrentAppAttempt();
// create second NM, and register to rm1
MockNM nm2 = new MockNM("127.0.0.1:2234", 8000, rm1.getResourceTrackerService());
nm2.registerNode();
// nm1 heartbeats to report unhealthy
// This will mimic ContainerExitStatus.ABORT
nm1.nodeHeartbeat(false);
rm1.waitForState(am4.getApplicationAttemptId(), RMAppAttemptState.FAILED);
TestSchedulerUtils.waitSchedulerApplicationAttemptStopped(scheduler, am4.getApplicationAttemptId());
Assert.assertTrue(!attempt4.shouldCountTowardsMaxAttemptRetry());
Assert.assertEquals(ContainerExitStatus.ABORTED, appState.getAttempt(am4.getApplicationAttemptId()).getAMContainerExitStatus());
// launch next AM in nm2
MockAM am5 = rm1.waitForNewAMToLaunchAndRegister(app1.getApplicationId(), 5, nm2);
RMAppAttempt attempt5 = app1.getCurrentAppAttempt();
// fail the AM normally
nm2.nodeHeartbeat(am5.getApplicationAttemptId(), 1, ContainerState.COMPLETE);
rm1.waitForState(am5.getApplicationAttemptId(), RMAppAttemptState.FAILED);
TestSchedulerUtils.waitSchedulerApplicationAttemptStopped(scheduler, am5.getApplicationAttemptId());
Assert.assertTrue(attempt5.shouldCountTowardsMaxAttemptRetry());
// AM should not be restarted.
rm1.waitForState(app1.getApplicationId(), RMAppState.FAILED);
Assert.assertEquals(5, app1.getAppAttempts().size());
rm1.stop();
}
use of org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler in project hadoop by apache.
the class TestSchedulerHealth method testCapacitySchedulerAllocation.
@Test
public void testCapacitySchedulerAllocation() throws Exception {
setup();
boolean isCapacityScheduler = resourceManager.getResourceScheduler() instanceof CapacityScheduler;
assumeTrue("This test is only supported on Capacity Scheduler", isCapacityScheduler);
// Register node1
String host_0 = "host_0";
NodeManager nm_0 = registerNode(host_0, 1234, 2345, NetworkTopology.DEFAULT_RACK, Resources.createResource(5 * 1024, 1));
// ResourceRequest priorities
Priority priority_0 = Priority.newInstance(0);
Priority priority_1 = Priority.newInstance(1);
// Submit an application
Application application_0 = new Application("user_0", "default", resourceManager);
application_0.submit();
application_0.addNodeManager(host_0, 1234, nm_0);
Resource capability_0_0 = Resources.createResource(1024, 1);
application_0.addResourceRequestSpec(priority_1, capability_0_0);
Resource capability_0_1 = Resources.createResource(2 * 1024, 1);
application_0.addResourceRequestSpec(priority_0, capability_0_1);
Task task_0_0 = new Task(application_0, priority_1, new String[] { host_0 });
application_0.addTask(task_0_0);
Task task_0_1 = new Task(application_0, priority_0, new String[] { host_0 });
application_0.addTask(task_0_1);
// Send resource requests to the scheduler
application_0.schedule();
// Send a heartbeat to kick the tires on the Scheduler
nodeUpdate(nm_0);
SchedulerHealth sh = ((CapacityScheduler) resourceManager.getResourceScheduler()).getSchedulerHealth();
// Now SchedulerHealth records last container allocated, aggregated
// allocation account will not be changed
Assert.assertEquals(1, sh.getAllocationCount().longValue());
Assert.assertEquals(Resource.newInstance(1 * 1024, 1), sh.getResourcesAllocated());
Assert.assertEquals(2, sh.getAggregateAllocationCount().longValue());
Assert.assertEquals("host_0:1234", sh.getLastAllocationDetails().getNodeId().toString());
Assert.assertEquals("root.default", sh.getLastAllocationDetails().getQueue());
Task task_0_2 = new Task(application_0, priority_0, new String[] { host_0 });
application_0.addTask(task_0_2);
application_0.schedule();
nodeUpdate(nm_0);
Assert.assertEquals(1, sh.getAllocationCount().longValue());
Assert.assertEquals(Resource.newInstance(2 * 1024, 1), sh.getResourcesAllocated());
Assert.assertEquals(3, sh.getAggregateAllocationCount().longValue());
Assert.assertEquals("host_0:1234", sh.getLastAllocationDetails().getNodeId().toString());
Assert.assertEquals("root.default", sh.getLastAllocationDetails().getQueue());
}
Aggregations