use of org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore.RMState in project hadoop by apache.
the class TestApplicationPriority method testRMRestartWithChangeInPriority.
@Test(timeout = 180000)
public void testRMRestartWithChangeInPriority() throws Exception {
conf.setBoolean(YarnConfiguration.RECOVERY_ENABLED, true);
conf.setBoolean(YarnConfiguration.RM_WORK_PRESERVING_RECOVERY_ENABLED, false);
conf.set(YarnConfiguration.RM_STORE, MemoryRMStateStore.class.getName());
conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, YarnConfiguration.DEFAULT_RM_AM_MAX_ATTEMPTS);
conf.setInt(YarnConfiguration.MAX_CLUSTER_LEVEL_APPLICATION_PRIORITY, 10);
MemoryRMStateStore memStore = new MemoryRMStateStore();
memStore.init(conf);
RMState rmState = memStore.getState();
Map<ApplicationId, ApplicationStateData> rmAppState = rmState.getApplicationState();
// PHASE 1: create state in an RM
// start RM
MockRM rm1 = new MockRM(conf, memStore);
rm1.start();
MockNM nm1 = new MockNM("127.0.0.1:1234", 15120, rm1.getResourceTrackerService());
nm1.registerNode();
Priority appPriority1 = Priority.newInstance(5);
RMApp app1 = rm1.submitApp(1 * GB, appPriority1);
// kick the scheduler, 1 GB given to AM1, remaining 15GB on nm1
MockAM am1 = MockRM.launchAM(app1, rm1, nm1);
am1.registerAppAttempt();
// get scheduler
CapacityScheduler cs = (CapacityScheduler) rm1.getResourceScheduler();
// Change the priority of App1 to 8
Priority appPriority2 = Priority.newInstance(8);
UserGroupInformation ugi = UserGroupInformation.createRemoteUser(app1.getUser());
cs.updateApplicationPriority(appPriority2, app1.getApplicationId(), null, ugi);
// let things settle down
Thread.sleep(1000);
// create new RM to represent restart and recover state
MockRM rm2 = new MockRM(conf, memStore);
// start new RM
rm2.start();
// change NM to point to new RM
nm1.setResourceTrackerService(rm2.getResourceTrackerService());
// Verify RM Apps after this restart
Assert.assertEquals(1, rm2.getRMContext().getRMApps().size());
// get scheduler app
RMApp loadedApp = rm2.getRMContext().getRMApps().get(app1.getApplicationId());
// Verify whether priority 15 is reset to 10
Assert.assertEquals(appPriority2, loadedApp.getApplicationPriority());
rm2.stop();
rm1.stop();
}
use of org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore.RMState in project hadoop by apache.
the class TestRMRestart method testRMRestartAppRunningAMFailed.
@Test(timeout = 60000)
public void testRMRestartAppRunningAMFailed() throws Exception {
conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, YarnConfiguration.DEFAULT_RM_AM_MAX_ATTEMPTS);
MemoryRMStateStore memStore = new MemoryRMStateStore();
memStore.init(conf);
RMState rmState = memStore.getState();
Map<ApplicationId, ApplicationStateData> rmAppState = rmState.getApplicationState();
// start RM
MockRM rm1 = createMockRM(conf, memStore);
rm1.start();
MockNM nm1 = new MockNM("127.0.0.1:1234", 15120, rm1.getResourceTrackerService());
nm1.registerNode();
// create app and launch the AM
RMApp app0 = rm1.submitApp(200, "name", "user", new HashMap<ApplicationAccessType, String>(), false, "default", -1, null, "MAPREDUCE", true, true);
MockAM am0 = launchAM(app0, rm1, nm1);
// fail the AM by sending CONTAINER_FINISHED event without registering.
nm1.nodeHeartbeat(am0.getApplicationAttemptId(), 1, ContainerState.COMPLETE);
rm1.waitForState(am0.getApplicationAttemptId(), RMAppAttemptState.FAILED);
ApplicationStateData appState = rmAppState.get(app0.getApplicationId());
// assert the AM failed state is saved.
Assert.assertEquals(RMAppAttemptState.FAILED, appState.getAttempt(am0.getApplicationAttemptId()).getState());
// assert app state has not been saved.
Assert.assertNull(rmAppState.get(app0.getApplicationId()).getState());
// new AM started but not registered, app still stays at ACCECPTED state.
rm1.waitForState(app0.getApplicationId(), RMAppState.ACCEPTED);
// start new RM
MockRM rm2 = createMockRM(conf, memStore);
rm2.start();
// assert the previous AM state is loaded back on RM recovery.
rm2.waitForState(am0.getApplicationAttemptId(), RMAppAttemptState.FAILED);
}
use of org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore.RMState in project hadoop by apache.
the class TestRMRestart method testRMRestart.
@SuppressWarnings("rawtypes")
@Test(timeout = 180000)
public void testRMRestart() throws Exception {
conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, YarnConfiguration.DEFAULT_RM_AM_MAX_ATTEMPTS);
MemoryRMStateStore memStore = new MemoryRMStateStore();
memStore.init(conf);
RMState rmState = memStore.getState();
Map<ApplicationId, ApplicationStateData> rmAppState = rmState.getApplicationState();
// PHASE 1: create state in an RM
// start RM
MockRM rm1 = createMockRM(conf, memStore);
// start like normal because state is empty
rm1.start();
MockNM nm1 = new MockNM("127.0.0.1:1234", 15120, rm1.getResourceTrackerService());
MockNM nm2 = new MockNM("127.0.0.2:5678", 15120, rm1.getResourceTrackerService());
nm1.registerNode();
// nm2 will not heartbeat with RM1
nm2.registerNode();
// create app that will finish and the final state should be saved.
RMApp app0 = rm1.submitApp(200);
RMAppAttempt attempt0 = app0.getCurrentAppAttempt();
// spot check that app is saved
Assert.assertEquals(1, rmAppState.size());
nm1.nodeHeartbeat(true);
MockAM am0 = rm1.sendAMLaunched(attempt0.getAppAttemptId());
am0.registerAppAttempt();
finishApplicationMaster(app0, rm1, nm1, am0);
// create app that gets launched and does allocate before RM restart
RMApp app1 = rm1.submitApp(200);
// assert app1 info is saved
ApplicationStateData appState = rmAppState.get(app1.getApplicationId());
Assert.assertNotNull(appState);
Assert.assertEquals(0, appState.getAttemptCount());
Assert.assertEquals(appState.getApplicationSubmissionContext().getApplicationId(), app1.getApplicationSubmissionContext().getApplicationId());
//kick the scheduling to allocate AM container
nm1.nodeHeartbeat(true);
// assert app1 attempt is saved
RMAppAttempt attempt1 = app1.getCurrentAppAttempt();
ApplicationAttemptId attemptId1 = attempt1.getAppAttemptId();
rm1.waitForState(attemptId1, RMAppAttemptState.ALLOCATED);
Assert.assertEquals(1, appState.getAttemptCount());
ApplicationAttemptStateData attemptState = appState.getAttempt(attemptId1);
Assert.assertNotNull(attemptState);
Assert.assertEquals(BuilderUtils.newContainerId(attemptId1, 1), attemptState.getMasterContainer().getId());
// launch the AM
MockAM am1 = rm1.sendAMLaunched(attempt1.getAppAttemptId());
am1.registerAppAttempt();
// AM request for containers
am1.allocate("127.0.0.1", 1000, 1, new ArrayList<ContainerId>());
// kick the scheduler
nm1.nodeHeartbeat(true);
List<Container> conts = am1.allocate(new ArrayList<ResourceRequest>(), new ArrayList<ContainerId>()).getAllocatedContainers();
while (conts.size() == 0) {
nm1.nodeHeartbeat(true);
conts.addAll(am1.allocate(new ArrayList<ResourceRequest>(), new ArrayList<ContainerId>()).getAllocatedContainers());
Thread.sleep(500);
}
// create app that does not get launched by RM before RM restart
RMApp app2 = rm1.submitApp(200);
// assert app2 info is saved
appState = rmAppState.get(app2.getApplicationId());
Assert.assertNotNull(appState);
Assert.assertEquals(0, appState.getAttemptCount());
Assert.assertEquals(appState.getApplicationSubmissionContext().getApplicationId(), app2.getApplicationSubmissionContext().getApplicationId());
// create unmanaged app
RMApp appUnmanaged = rm1.submitApp(200, "someApp", "someUser", null, true, null, conf.getInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, YarnConfiguration.DEFAULT_RM_AM_MAX_ATTEMPTS), null);
ApplicationAttemptId unmanagedAttemptId = appUnmanaged.getCurrentAppAttempt().getAppAttemptId();
// assert appUnmanaged info is saved
ApplicationId unmanagedAppId = appUnmanaged.getApplicationId();
appState = rmAppState.get(unmanagedAppId);
Assert.assertNotNull(appState);
// wait for attempt to reach LAUNCHED state
rm1.waitForState(unmanagedAttemptId, RMAppAttemptState.LAUNCHED);
rm1.waitForState(unmanagedAppId, RMAppState.ACCEPTED);
// assert unmanaged attempt info is saved
Assert.assertEquals(1, appState.getAttemptCount());
Assert.assertEquals(appState.getApplicationSubmissionContext().getApplicationId(), appUnmanaged.getApplicationSubmissionContext().getApplicationId());
// PHASE 2: create new RM and start from old state
// create new RM to represent restart and recover state
MockRM rm2 = createMockRM(conf, memStore);
// start new RM
rm2.start();
// change NM to point to new RM
nm1.setResourceTrackerService(rm2.getResourceTrackerService());
nm2.setResourceTrackerService(rm2.getResourceTrackerService());
// verify load of old state
// 4 apps are loaded.
// FINISHED app and attempt is also loaded back.
// Unmanaged app state is still loaded back but it cannot be restarted by
// the RM. this will change with work preserving RM restart in which AMs/NMs
// are not rebooted.
Assert.assertEquals(4, rm2.getRMContext().getRMApps().size());
// check that earlier finished app and attempt is also loaded back and move
// to finished state.
rm2.waitForState(app0.getApplicationId(), RMAppState.FINISHED);
rm2.waitForState(am0.getApplicationAttemptId(), RMAppAttemptState.FINISHED);
// verify correct number of attempts and other data
RMApp loadedApp1 = rm2.getRMContext().getRMApps().get(app1.getApplicationId());
Assert.assertNotNull(loadedApp1);
Assert.assertEquals(1, loadedApp1.getAppAttempts().size());
Assert.assertEquals(app1.getApplicationSubmissionContext().getApplicationId(), loadedApp1.getApplicationSubmissionContext().getApplicationId());
RMApp loadedApp2 = rm2.getRMContext().getRMApps().get(app2.getApplicationId());
Assert.assertNotNull(loadedApp2);
//Assert.assertEquals(0, loadedApp2.getAppAttempts().size());
Assert.assertEquals(app2.getApplicationSubmissionContext().getApplicationId(), loadedApp2.getApplicationSubmissionContext().getApplicationId());
// verify state machine kicked into expected states
rm2.waitForState(loadedApp1.getApplicationId(), RMAppState.ACCEPTED);
rm2.waitForState(loadedApp2.getApplicationId(), RMAppState.ACCEPTED);
// verify attempts for apps
// The app for which AM was started will wait for previous am
// container finish event to arrive. However for an application for which
// no am container was running will start new application attempt.
Assert.assertEquals(1, loadedApp1.getAppAttempts().size());
Assert.assertEquals(1, loadedApp2.getAppAttempts().size());
// verify old AM is not accepted
// change running AM to talk to new RM
am1.setAMRMProtocol(rm2.getApplicationMasterService(), rm2.getRMContext());
try {
am1.allocate(new ArrayList<ResourceRequest>(), new ArrayList<ContainerId>());
Assert.fail();
} catch (ApplicationAttemptNotFoundException e) {
Assert.assertTrue(e instanceof ApplicationAttemptNotFoundException);
}
// NM should be rebooted on heartbeat, even first heartbeat for nm2
NodeHeartbeatResponse hbResponse = nm1.nodeHeartbeat(true);
Assert.assertEquals(NodeAction.RESYNC, hbResponse.getNodeAction());
hbResponse = nm2.nodeHeartbeat(true);
Assert.assertEquals(NodeAction.RESYNC, hbResponse.getNodeAction());
// new NM to represent NM re-register
nm1 = new MockNM("127.0.0.1:1234", 15120, rm2.getResourceTrackerService());
nm2 = new MockNM("127.0.0.2:5678", 15120, rm2.getResourceTrackerService());
NMContainerStatus status = TestRMRestart.createNMContainerStatus(loadedApp1.getCurrentAppAttempt().getAppAttemptId(), 1, ContainerState.COMPLETE);
nm1.registerNode(Arrays.asList(status), null);
nm2.registerNode();
rm2.waitForState(loadedApp1.getApplicationId(), RMAppState.ACCEPTED);
// wait for the 2nd attempt to be started.
int timeoutSecs = 0;
while (loadedApp1.getAppAttempts().size() != 2 && timeoutSecs++ < 40) {
;
Thread.sleep(200);
}
// verify no more reboot response sent
hbResponse = nm1.nodeHeartbeat(true);
Assert.assertTrue(NodeAction.RESYNC != hbResponse.getNodeAction());
hbResponse = nm2.nodeHeartbeat(true);
Assert.assertTrue(NodeAction.RESYNC != hbResponse.getNodeAction());
// assert app1 attempt is saved
attempt1 = loadedApp1.getCurrentAppAttempt();
attemptId1 = attempt1.getAppAttemptId();
rm2.waitForState(attemptId1, RMAppAttemptState.ALLOCATED);
appState = rmAppState.get(loadedApp1.getApplicationId());
attemptState = appState.getAttempt(attemptId1);
Assert.assertNotNull(attemptState);
Assert.assertEquals(BuilderUtils.newContainerId(attemptId1, 1), attemptState.getMasterContainer().getId());
// Nodes on which the AM's run
MockNM am1Node = nm1;
if (attemptState.getMasterContainer().getNodeId().toString().contains("127.0.0.2")) {
am1Node = nm2;
}
// assert app2 attempt is saved
RMAppAttempt attempt2 = loadedApp2.getCurrentAppAttempt();
ApplicationAttemptId attemptId2 = attempt2.getAppAttemptId();
rm2.waitForState(attemptId2, RMAppAttemptState.ALLOCATED);
appState = rmAppState.get(loadedApp2.getApplicationId());
attemptState = appState.getAttempt(attemptId2);
Assert.assertNotNull(attemptState);
Assert.assertEquals(BuilderUtils.newContainerId(attemptId2, 1), attemptState.getMasterContainer().getId());
MockNM am2Node = nm1;
if (attemptState.getMasterContainer().getNodeId().toString().contains("127.0.0.2")) {
am2Node = nm2;
}
// start the AM's
am1 = rm2.sendAMLaunched(attempt1.getAppAttemptId());
am1.registerAppAttempt();
MockAM am2 = rm2.sendAMLaunched(attempt2.getAppAttemptId());
am2.registerAppAttempt();
//request for containers
am1.allocate("127.0.0.1", 1000, 3, new ArrayList<ContainerId>());
am2.allocate("127.0.0.2", 1000, 1, new ArrayList<ContainerId>());
// verify container allocate continues to work
nm1.nodeHeartbeat(true);
nm2.nodeHeartbeat(true);
conts = am1.allocate(new ArrayList<ResourceRequest>(), new ArrayList<ContainerId>()).getAllocatedContainers();
while (conts.size() == 0) {
nm1.nodeHeartbeat(true);
nm2.nodeHeartbeat(true);
conts.addAll(am1.allocate(new ArrayList<ResourceRequest>(), new ArrayList<ContainerId>()).getAllocatedContainers());
Thread.sleep(500);
}
// finish the AMs
finishApplicationMaster(loadedApp1, rm2, am1Node, am1);
finishApplicationMaster(loadedApp2, rm2, am2Node, am2);
// stop RM's
rm2.stop();
rm1.stop();
// completed apps are not removed immediately after app finish
// And finished app is also loaded back.
Assert.assertEquals(4, rmAppState.size());
}
use of org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore.RMState in project hadoop by apache.
the class TestRMRestart method finishApplicationMaster.
private void finishApplicationMaster(RMApp rmApp, MockRM rm, MockNM nm, MockAM am, FinishApplicationMasterRequest req) throws Exception {
RMState rmState = ((MemoryRMStateStore) rm.getRMContext().getStateStore()).getState();
Map<ApplicationId, ApplicationStateData> rmAppState = rmState.getApplicationState();
am.unregisterAppAttempt(req, true);
rm.waitForState(am.getApplicationAttemptId(), RMAppAttemptState.FINISHING);
nm.nodeHeartbeat(am.getApplicationAttemptId(), 1, ContainerState.COMPLETE);
rm.waitForState(am.getApplicationAttemptId(), RMAppAttemptState.FINISHED);
rm.waitForState(rmApp.getApplicationId(), RMAppState.FINISHED);
// check that app/attempt is saved with the final state
ApplicationStateData appState = rmAppState.get(rmApp.getApplicationId());
Assert.assertEquals(RMAppState.FINISHED, appState.getState());
Assert.assertEquals(RMAppAttemptState.FINISHED, appState.getAttempt(am.getApplicationAttemptId()).getState());
}
use of org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore.RMState in project hadoop by apache.
the class TestRMRestart method testRMRestartWaitForPreviousSucceededAttempt.
// Test RM restarts after previous attempt succeeded and was saved into state
// store but before the RMAppAttempt notifies RMApp that it has succeeded. On
// recovery, RMAppAttempt should send the AttemptFinished event to RMApp so
// that RMApp can recover its state.
@Test(timeout = 60000)
public void testRMRestartWaitForPreviousSucceededAttempt() throws Exception {
conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, 2);
MemoryRMStateStore memStore = new MemoryRMStateStore() {
int count = 0;
@Override
public void updateApplicationStateInternal(ApplicationId appId, ApplicationStateData appStateData) throws Exception {
if (count == 0) {
// do nothing; simulate app final state is not saved.
LOG.info(appId + " final state is not saved.");
count++;
} else {
super.updateApplicationStateInternal(appId, appStateData);
}
}
};
memStore.init(conf);
RMState rmState = memStore.getState();
Map<ApplicationId, ApplicationStateData> rmAppState = rmState.getApplicationState();
// start RM
MockRM rm1 = createMockRM(conf, memStore);
rm1.start();
MockNM nm1 = rm1.registerNode("127.0.0.1:1234", 15120);
RMApp app0 = rm1.submitApp(200);
MockAM am0 = MockRM.launchAndRegisterAM(app0, rm1, nm1);
FinishApplicationMasterRequest req = FinishApplicationMasterRequest.newInstance(FinalApplicationStatus.SUCCEEDED, "", "");
am0.unregisterAppAttempt(req, true);
rm1.waitForState(am0.getApplicationAttemptId(), RMAppAttemptState.FINISHING);
// app final state is not saved. This guarantees that RMApp cannot be
// recovered via its own saved state, but only via the event notification
// from the RMAppAttempt on recovery.
Assert.assertNull(rmAppState.get(app0.getApplicationId()).getState());
// start RM
MockRM rm2 = createMockRM(conf, memStore);
nm1.setResourceTrackerService(rm2.getResourceTrackerService());
rm2.start();
rm2.waitForState(app0.getCurrentAppAttempt().getAppAttemptId(), RMAppAttemptState.FINISHED);
rm2.waitForState(app0.getApplicationId(), RMAppState.FINISHED);
// app final state is saved via the finish event from attempt.
Assert.assertEquals(RMAppState.FINISHED, rmAppState.get(app0.getApplicationId()).getState());
}
Aggregations