use of org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationStateData in project hadoop by apache.
the class TestAMRestart method testRMRestartOrFailoverNotCountedForAMFailures.
// Test regular RM restart/failover, new RM should not count
// AM failure towards the max-retry-account and should be able to
// re-launch the AM.
@Test(timeout = 50000)
public void testRMRestartOrFailoverNotCountedForAMFailures() throws Exception {
YarnConfiguration conf = new YarnConfiguration();
conf.setClass(YarnConfiguration.RM_SCHEDULER, CapacityScheduler.class, ResourceScheduler.class);
conf.setBoolean(YarnConfiguration.RECOVERY_ENABLED, true);
conf.setBoolean(YarnConfiguration.RM_WORK_PRESERVING_RECOVERY_ENABLED, false);
conf.set(YarnConfiguration.RM_STORE, MemoryRMStateStore.class.getName());
// explicitly set max-am-retry count as 1.
conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, 1);
MemoryRMStateStore memStore = new MemoryRMStateStore();
memStore.init(conf);
MockRM rm1 = new MockRM(conf, memStore);
rm1.start();
MockNM nm1 = new MockNM("127.0.0.1:1234", 8000, rm1.getResourceTrackerService());
nm1.registerNode();
RMApp app1 = rm1.submitApp(200);
// AM should be restarted even though max-am-attempt is 1.
MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
RMAppAttempt attempt1 = app1.getCurrentAppAttempt();
// Restart rm.
MockRM rm2 = new MockRM(conf, memStore);
rm2.start();
ApplicationStateData appState = memStore.getState().getApplicationState().get(app1.getApplicationId());
// re-register the NM
nm1.setResourceTrackerService(rm2.getResourceTrackerService());
NMContainerStatus status = Records.newRecord(NMContainerStatus.class);
status.setContainerExitStatus(ContainerExitStatus.KILLED_BY_RESOURCEMANAGER);
status.setContainerId(attempt1.getMasterContainer().getId());
status.setContainerState(ContainerState.COMPLETE);
status.setDiagnostics("");
nm1.registerNode(Collections.singletonList(status), null);
rm2.waitForState(attempt1.getAppAttemptId(), RMAppAttemptState.FAILED);
Assert.assertEquals(ContainerExitStatus.KILLED_BY_RESOURCEMANAGER, appState.getAttempt(am1.getApplicationAttemptId()).getAMContainerExitStatus());
// Will automatically start a new AppAttempt in rm2
rm2.waitForState(app1.getApplicationId(), RMAppState.ACCEPTED);
MockAM am2 = rm2.waitForNewAMToLaunchAndRegister(app1.getApplicationId(), 2, nm1);
MockRM.finishAMAndVerifyAppState(app1, rm2, nm1, am2);
RMAppAttempt attempt3 = rm2.getRMContext().getRMApps().get(app1.getApplicationId()).getCurrentAppAttempt();
Assert.assertTrue(attempt3.shouldCountTowardsMaxAttemptRetry());
Assert.assertEquals(ContainerExitStatus.INVALID, appState.getAttempt(am2.getApplicationAttemptId()).getAMContainerExitStatus());
rm1.stop();
rm2.stop();
}
use of org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationStateData in project hadoop by apache.
the class TestZKRMStateStore method testDuplicateRMAppDeletion.
@Test
public void testDuplicateRMAppDeletion() throws Exception {
TestZKRMStateStoreTester zkTester = new TestZKRMStateStoreTester();
long submitTime = System.currentTimeMillis();
long startTime = System.currentTimeMillis() + 1234;
RMStateStore store = zkTester.getRMStateStore();
TestDispatcher dispatcher = new TestDispatcher();
store.setRMDispatcher(dispatcher);
ApplicationAttemptId attemptIdRemoved = ApplicationAttemptId.fromString("appattempt_1352994193343_0002_000001");
ApplicationId appIdRemoved = attemptIdRemoved.getApplicationId();
storeApp(store, appIdRemoved, submitTime, startTime);
storeAttempt(store, attemptIdRemoved, "container_1352994193343_0002_01_000001", null, null, dispatcher);
ApplicationSubmissionContext context = new ApplicationSubmissionContextPBImpl();
context.setApplicationId(appIdRemoved);
ApplicationStateData appStateRemoved = ApplicationStateData.newInstance(submitTime, startTime, context, "user1");
appStateRemoved.attempts.put(attemptIdRemoved, null);
store.removeApplicationStateInternal(appStateRemoved);
try {
store.removeApplicationStateInternal(appStateRemoved);
} catch (KeeperException.NoNodeException nne) {
Assert.fail("NoNodeException should not happen.");
}
store.close();
}
use of org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationStateData in project hadoop by apache.
the class CapacityScheduler method updateApplicationPriority.
@Override
public Priority updateApplicationPriority(Priority newPriority, ApplicationId applicationId, SettableFuture<Object> future, UserGroupInformation user) throws YarnException {
try {
writeLock.lock();
Priority appPriority = null;
SchedulerApplication<FiCaSchedulerApp> application = applications.get(applicationId);
if (application == null) {
throw new YarnException("Application '" + applicationId + "' is not present, hence could not change priority.");
}
RMApp rmApp = rmContext.getRMApps().get(applicationId);
appPriority = checkAndGetApplicationPriority(newPriority, user, rmApp.getQueue(), applicationId);
if (application.getPriority().equals(appPriority)) {
future.set(null);
return appPriority;
}
// Update new priority in Submission Context to update to StateStore.
rmApp.getApplicationSubmissionContext().setPriority(appPriority);
// Update to state store
ApplicationStateData appState = ApplicationStateData.newInstance(rmApp.getSubmitTime(), rmApp.getStartTime(), rmApp.getApplicationSubmissionContext(), rmApp.getUser(), rmApp.getCallerContext());
appState.setApplicationTimeouts(rmApp.getApplicationTimeouts());
rmContext.getStateStore().updateApplicationStateSynchronously(appState, false, future);
// As we use iterator over a TreeSet for OrderingPolicy, once we change
// priority then reinsert back to make order correct.
LeafQueue queue = (LeafQueue) getQueue(rmApp.getQueue());
queue.updateApplicationPriority(application, appPriority);
LOG.info("Priority '" + appPriority + "' is updated in queue :" + rmApp.getQueue() + " for application: " + applicationId + " for the user: " + rmApp.getUser());
return appPriority;
} finally {
writeLock.unlock();
}
}
use of org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationStateData in project hadoop by apache.
the class TestWorkPreservingRMRestart method testAppStateSavedButAttemptStateNotSaved.
// Test that if application state was saved, but attempt state was not saved.
// RM should start correctly.
@Test(timeout = 20000)
public void testAppStateSavedButAttemptStateNotSaved() throws Exception {
MemoryRMStateStore memStore = new MemoryRMStateStore() {
@Override
public synchronized void updateApplicationAttemptStateInternal(ApplicationAttemptId appAttemptId, ApplicationAttemptStateData attemptState) {
// do nothing;
// simulate the failure that attempt final state is not saved.
}
};
memStore.init(conf);
rm1 = new MockRM(conf, memStore);
rm1.start();
MockNM nm1 = new MockNM("127.0.0.1:1234", 15120, rm1.getResourceTrackerService());
nm1.registerNode();
RMApp app1 = rm1.submitApp(200);
MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
MockRM.finishAMAndVerifyAppState(app1, rm1, nm1, am1);
ApplicationStateData appSavedState = memStore.getState().getApplicationState().get(app1.getApplicationId());
// check that app state is saved.
assertEquals(RMAppState.FINISHED, appSavedState.getState());
// check that attempt state is not saved.
assertNull(appSavedState.getAttempt(am1.getApplicationAttemptId()).getState());
rm2 = new MockRM(conf, memStore);
rm2.start();
RMApp recoveredApp1 = rm2.getRMContext().getRMApps().get(app1.getApplicationId());
assertEquals(RMAppState.FINISHED, recoveredApp1.getState());
// check that attempt state is recovered correctly.
assertEquals(RMAppAttemptState.FINISHED, recoveredApp1.getCurrentAppAttempt().getState());
}
use of org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationStateData in project hadoop by apache.
the class TestRMRestart method testRMRestartWaitForPreviousAMToFinish.
@Test(timeout = 60000)
public void testRMRestartWaitForPreviousAMToFinish() throws Exception {
// testing 3 cases
// After RM restarts
// 1) New application attempt is not started until previous AM container
// finish event is reported back to RM as a part of nm registration.
// 2) If previous AM container finish event is never reported back (i.e.
// node manager on which this AM container was running also went down) in
// that case AMLivenessMonitor should time out previous attempt and start
// new attempt.
// 3) If all the stored attempts had finished then new attempt should
// be started immediately.
YarnConfiguration conf = new YarnConfiguration(this.conf);
conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, 40);
MemoryRMStateStore memStore = new MemoryRMStateStore();
memStore.init(conf);
RMState rmState = memStore.getState();
Map<ApplicationId, ApplicationStateData> rmAppState = rmState.getApplicationState();
// start RM
final MockRM rm1 = createMockRM(conf, memStore);
rm1.start();
AbstractYarnScheduler ys = (AbstractYarnScheduler) rm1.getResourceScheduler();
MockNM nm1 = new MockNM("127.0.0.1:1234", 16382, rm1.getResourceTrackerService());
nm1.registerNode();
// submitting app
RMApp app1 = rm1.submitApp(200);
rm1.waitForState(app1.getApplicationId(), RMAppState.ACCEPTED);
MockAM am1 = launchAM(app1, rm1, nm1);
nm1.nodeHeartbeat(am1.getApplicationAttemptId(), 1, ContainerState.COMPLETE);
// Fail first AM.
rm1.waitForState(am1.getApplicationAttemptId(), RMAppAttemptState.FAILED);
TestSchedulerUtils.waitSchedulerApplicationAttemptStopped(ys, am1.getApplicationAttemptId());
// launch another AM.
MockAM am2 = launchAM(app1, rm1, nm1);
Assert.assertEquals(1, rmAppState.size());
Assert.assertEquals(app1.getState(), RMAppState.RUNNING);
Assert.assertEquals(app1.getAppAttempts().get(app1.getCurrentAppAttempt().getAppAttemptId()).getAppAttemptState(), RMAppAttemptState.RUNNING);
// start new RM.
MockRM rm2 = createMockRM(conf, memStore);
rm2.start();
nm1.setResourceTrackerService(rm2.getResourceTrackerService());
NodeHeartbeatResponse res = nm1.nodeHeartbeat(true);
Assert.assertEquals(NodeAction.RESYNC, res.getNodeAction());
RMApp rmApp = rm2.getRMContext().getRMApps().get(app1.getApplicationId());
// application should be in ACCEPTED state
rm2.waitForState(app1.getApplicationId(), RMAppState.ACCEPTED);
Assert.assertEquals(RMAppState.ACCEPTED, rmApp.getState());
// new attempt should not be started
Assert.assertEquals(2, rmApp.getAppAttempts().size());
// am1 attempt should be in FAILED state where as am2 attempt should be in
// LAUNCHED state
rm2.waitForState(am1.getApplicationAttemptId(), RMAppAttemptState.FAILED);
rm2.waitForState(am2.getApplicationAttemptId(), RMAppAttemptState.LAUNCHED);
Assert.assertEquals(RMAppAttemptState.FAILED, rmApp.getAppAttempts().get(am1.getApplicationAttemptId()).getAppAttemptState());
Assert.assertEquals(RMAppAttemptState.LAUNCHED, rmApp.getAppAttempts().get(am2.getApplicationAttemptId()).getAppAttemptState());
NMContainerStatus status = TestRMRestart.createNMContainerStatus(am2.getApplicationAttemptId(), 1, ContainerState.COMPLETE);
nm1.registerNode(Arrays.asList(status), null);
rm2.waitForState(am2.getApplicationAttemptId(), RMAppAttemptState.FAILED);
ys = (AbstractYarnScheduler) rm2.getResourceScheduler();
TestSchedulerUtils.waitSchedulerApplicationAttemptStopped(ys, am2.getApplicationAttemptId());
launchAM(rmApp, rm2, nm1);
Assert.assertEquals(3, rmApp.getAppAttempts().size());
rm2.waitForState(rmApp.getCurrentAppAttempt().getAppAttemptId(), RMAppAttemptState.RUNNING);
// Now restart RM ...
// Setting AMLivelinessMonitor interval to be 10 Secs.
conf.setInt(YarnConfiguration.RM_AM_EXPIRY_INTERVAL_MS, 10000);
MockRM rm3 = createMockRM(conf, memStore);
rm3.start();
// Wait for RM to process all the events as a part of rm recovery.
nm1.setResourceTrackerService(rm3.getResourceTrackerService());
rmApp = rm3.getRMContext().getRMApps().get(app1.getApplicationId());
// application should be in ACCEPTED state
rm3.waitForState(app1.getApplicationId(), RMAppState.ACCEPTED);
Assert.assertEquals(rmApp.getState(), RMAppState.ACCEPTED);
// new attempt should not be started
Assert.assertEquals(3, rmApp.getAppAttempts().size());
// am1 and am2 attempts should be in FAILED state where as am3 should be
// in LAUNCHED state
rm3.waitForState(am1.getApplicationAttemptId(), RMAppAttemptState.FAILED);
rm3.waitForState(am2.getApplicationAttemptId(), RMAppAttemptState.FAILED);
ApplicationAttemptId latestAppAttemptId = rmApp.getCurrentAppAttempt().getAppAttemptId();
rm3.waitForState(latestAppAttemptId, RMAppAttemptState.LAUNCHED);
Assert.assertEquals(RMAppAttemptState.FAILED, rmApp.getAppAttempts().get(am1.getApplicationAttemptId()).getAppAttemptState());
Assert.assertEquals(RMAppAttemptState.FAILED, rmApp.getAppAttempts().get(am2.getApplicationAttemptId()).getAppAttemptState());
Assert.assertEquals(RMAppAttemptState.LAUNCHED, rmApp.getAppAttempts().get(latestAppAttemptId).getAppAttemptState());
rm3.waitForState(latestAppAttemptId, RMAppAttemptState.FAILED);
rm3.waitForState(rmApp.getApplicationId(), RMAppState.ACCEPTED);
final int maxRetry = 10;
final RMApp rmAppForCheck = rmApp;
GenericTestUtils.waitFor(new Supplier<Boolean>() {
@Override
public Boolean get() {
return new Boolean(rmAppForCheck.getAppAttempts().size() == 4);
}
}, 100, maxRetry);
Assert.assertEquals(RMAppAttemptState.FAILED, rmApp.getAppAttempts().get(latestAppAttemptId).getAppAttemptState());
latestAppAttemptId = rmApp.getCurrentAppAttempt().getAppAttemptId();
// The 4th attempt has started but is not yet saved into RMStateStore
// It will be saved only when we launch AM.
// submitting app but not starting AM for it.
RMApp app2 = rm3.submitApp(200);
rm3.waitForState(app2.getApplicationId(), RMAppState.ACCEPTED);
Assert.assertEquals(1, app2.getAppAttempts().size());
Assert.assertEquals(0, memStore.getState().getApplicationState().get(app2.getApplicationId()).getAttemptCount());
MockRM rm4 = createMockRM(conf, memStore);
rm4.start();
rmApp = rm4.getRMContext().getRMApps().get(app1.getApplicationId());
rm4.waitForState(rmApp.getApplicationId(), RMAppState.ACCEPTED);
// wait for the attempt to be created.
int timeoutSecs = 0;
while (rmApp.getAppAttempts().size() != 2 && timeoutSecs++ < 40) {
Thread.sleep(200);
}
Assert.assertEquals(4, rmApp.getAppAttempts().size());
Assert.assertEquals(RMAppState.ACCEPTED, rmApp.getState());
rm4.waitForState(latestAppAttemptId, RMAppAttemptState.SCHEDULED);
Assert.assertEquals(RMAppAttemptState.SCHEDULED, rmApp.getAppAttempts().get(latestAppAttemptId).getAppAttemptState());
// The initial application for which an AM was not started should be in
// ACCEPTED state with one application attempt started.
app2 = rm4.getRMContext().getRMApps().get(app2.getApplicationId());
rm4.waitForState(app2.getApplicationId(), RMAppState.ACCEPTED);
Assert.assertEquals(RMAppState.ACCEPTED, app2.getState());
Assert.assertEquals(1, app2.getAppAttempts().size());
rm4.waitForState(app2.getCurrentAppAttempt().getAppAttemptId(), RMAppAttemptState.SCHEDULED);
Assert.assertEquals(RMAppAttemptState.SCHEDULED, app2.getCurrentAppAttempt().getAppAttemptState());
}
Aggregations