use of org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationStateData in project hadoop by apache.
the class TestRMRestart method testAppRecoveredInOrderOnRMRestart.
@Test(timeout = 20000)
public void testAppRecoveredInOrderOnRMRestart() throws Exception {
MemoryRMStateStore memStore = new MemoryRMStateStore();
memStore.init(conf);
for (int i = 10; i > 0; i--) {
ApplicationStateData appState = mock(ApplicationStateData.class);
ApplicationSubmissionContext context = mock(ApplicationSubmissionContext.class);
when(appState.getApplicationSubmissionContext()).thenReturn(context);
when(context.getApplicationId()).thenReturn(ApplicationId.newInstance(1234, i));
memStore.getState().getApplicationState().put(appState.getApplicationSubmissionContext().getApplicationId(), appState);
}
MockRM rm1 = new MockRM(conf, memStore) {
@Override
protected RMAppManager createRMAppManager() {
return new TestRMAppManager(this.rmContext, this.scheduler, this.masterService, this.applicationACLsManager, conf);
}
class TestRMAppManager extends RMAppManager {
ApplicationId prevId = ApplicationId.newInstance(1234, 0);
public TestRMAppManager(RMContext context, YarnScheduler scheduler, ApplicationMasterService masterService, ApplicationACLsManager applicationACLsManager, Configuration conf) {
super(context, scheduler, masterService, applicationACLsManager, conf);
}
@Override
protected void recoverApplication(ApplicationStateData appState, RMState rmState) throws Exception {
// check application is recovered in order.
Assert.assertTrue(rmState.getApplicationState().size() > 0);
Assert.assertTrue(appState.getApplicationSubmissionContext().getApplicationId().compareTo(prevId) > 0);
prevId = appState.getApplicationSubmissionContext().getApplicationId();
}
}
};
try {
rm1.start();
} finally {
rm1.stop();
}
}
use of org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationStateData in project hadoop by apache.
the class TestRMRestart method testRMRestartFailedApp.
@Test(timeout = 60000)
public void testRMRestartFailedApp() throws Exception {
conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, 1);
MemoryRMStateStore memStore = new MemoryRMStateStore();
memStore.init(conf);
RMState rmState = memStore.getState();
Map<ApplicationId, ApplicationStateData> rmAppState = rmState.getApplicationState();
// start RM
MockRM rm1 = createMockRM(conf, memStore);
rm1.start();
MockNM nm1 = new MockNM("127.0.0.1:1234", 15120, rm1.getResourceTrackerService());
nm1.registerNode();
// create app and launch the AM
RMApp app0 = rm1.submitApp(200);
MockAM am0 = launchAM(app0, rm1, nm1);
// fail the AM by sending CONTAINER_FINISHED event without registering.
nm1.nodeHeartbeat(am0.getApplicationAttemptId(), 1, ContainerState.COMPLETE);
rm1.waitForState(am0.getApplicationAttemptId(), RMAppAttemptState.FAILED);
rm1.waitForState(app0.getApplicationId(), RMAppState.FAILED);
// assert the app/attempt failed state is saved.
ApplicationStateData appState = rmAppState.get(app0.getApplicationId());
Assert.assertEquals(RMAppState.FAILED, appState.getState());
Assert.assertEquals(RMAppAttemptState.FAILED, appState.getAttempt(am0.getApplicationAttemptId()).getState());
// start new RM
MockRM rm2 = createMockRM(conf, memStore);
rm2.start();
RMApp loadedApp0 = rm2.getRMContext().getRMApps().get(app0.getApplicationId());
rm2.waitForState(app0.getApplicationId(), RMAppState.FAILED);
rm2.waitForState(am0.getApplicationAttemptId(), RMAppAttemptState.FAILED);
// no new attempt is created.
Assert.assertEquals(1, loadedApp0.getAppAttempts().size());
verifyAppReportAfterRMRestart(app0, rm2);
Assert.assertTrue(app0.getDiagnostics().toString().contains("Failing the application."));
// failed diagnostics from attempt is lost because the diagnostics from
// attempt is not yet available by the time app is saving the app state.
}
use of org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationStateData in project hadoop by apache.
the class TestApplicationLifetimeMonitor method testUpdateApplicationTimeoutForStateStoreUpdateFail.
@Test(timeout = 60000)
public void testUpdateApplicationTimeoutForStateStoreUpdateFail() throws Exception {
MockRM rm1 = null;
try {
conf.set(YarnConfiguration.RM_STORE, MemoryRMStateStore.class.getName());
MemoryRMStateStore memStore = new MemoryRMStateStore() {
private int count = 0;
@Override
public synchronized void updateApplicationStateInternal(ApplicationId appId, ApplicationStateData appState) throws Exception {
// fail only 1 time.
if (count++ == 0) {
throw new Exception("State-store update failed");
}
super.updateApplicationStateInternal(appId, appState);
}
};
memStore.init(conf);
rm1 = new MockRM(conf, memStore);
rm1.start();
MockNM nm1 = new MockNM("127.0.0.1:1234", 8192, rm1.getResourceTrackerService());
nm1.registerNode();
nm1.nodeHeartbeat(true);
long appLifetime = 30L;
Map<ApplicationTimeoutType, Long> timeouts = new HashMap<ApplicationTimeoutType, Long>();
timeouts.put(ApplicationTimeoutType.LIFETIME, appLifetime);
RMApp app1 = rm1.submitApp(200, Priority.newInstance(0), timeouts);
Map<ApplicationTimeoutType, String> updateTimeout = new HashMap<ApplicationTimeoutType, String>();
long newLifetime = 10L;
// update 10L seconds more to timeout i.e 30L seconds overall
updateTimeout.put(ApplicationTimeoutType.LIFETIME, Times.formatISO8601(System.currentTimeMillis() + newLifetime * 1000));
UpdateApplicationTimeoutsRequest request = UpdateApplicationTimeoutsRequest.newInstance(app1.getApplicationId(), updateTimeout);
Map<ApplicationTimeoutType, Long> applicationTimeouts = app1.getApplicationTimeouts();
// has old timeout time
long beforeUpdate = applicationTimeouts.get(ApplicationTimeoutType.LIFETIME);
try {
// update app2 lifetime to new time i.e now + timeout
rm1.getRMContext().getClientRMService().updateApplicationTimeouts(request);
fail("Update application should fail.");
} catch (YarnException e) {
// expected
assertTrue("State-store exception does not containe appId", e.getMessage().contains(app1.getApplicationId().toString()));
}
applicationTimeouts = app1.getApplicationTimeouts();
// has old timeout time
long afterUpdate = applicationTimeouts.get(ApplicationTimeoutType.LIFETIME);
Assert.assertEquals("Application timeout is updated", beforeUpdate, afterUpdate);
rm1.waitForState(app1.getApplicationId(), RMAppState.KILLED);
// verify for app killed with updated lifetime
Assert.assertTrue("Application killed before lifetime value", app1.getFinishTime() > afterUpdate);
} finally {
stopRM(rm1);
}
}
use of org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationStateData in project hadoop by apache.
the class TestLeaderElectorService method testStateStoreFailureCauseFailover.
// 1. rm1 active
// 2. rm2 standby
// 3. submit a job to rm1 which triggers state-store failure.
// 4. rm2 become
@Test
public void testStateStoreFailureCauseFailover() throws Exception {
conf.set(YarnConfiguration.RM_HA_ID, "rm1");
MemoryRMStateStore memStore = new MemoryRMStateStore() {
@Override
public synchronized void storeApplicationStateInternal(ApplicationId appId, ApplicationStateData appState) throws Exception {
throw new Exception("store app failure.");
}
};
memStore.init(conf);
rm1 = new MockRM(conf, memStore, true);
rm1.init(conf);
rm1.start();
waitFor(rm1, HAServiceState.ACTIVE);
rm2 = startRM("rm2", HAServiceState.STANDBY);
// submit an app which will trigger state-store failure.
rm1.submitApp(200, "app1", "user1", null, "default", false);
waitFor(rm1, HAServiceState.STANDBY);
// rm2 should become active;
waitFor(rm2, HAServiceState.ACTIVE);
rm2.stop();
// rm1 will become active again
waitFor(rm1, HAServiceState.ACTIVE);
}
use of org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationStateData in project hadoop by apache.
the class TestRMHA method testTransitionedToStandbyShouldNotHang.
@Test
public void testTransitionedToStandbyShouldNotHang() throws Exception {
configuration.setBoolean(YarnConfiguration.AUTO_FAILOVER_ENABLED, false);
Configuration conf = new YarnConfiguration(configuration);
MemoryRMStateStore memStore = new MemoryRMStateStore() {
@Override
public void updateApplicationState(ApplicationStateData appState) {
notifyStoreOperationFailed(new StoreFencedException());
}
};
memStore.init(conf);
rm = new MockRM(conf, memStore) {
@Override
void stopActiveServices() {
try {
Thread.sleep(10000);
} catch (Exception e) {
throw new RuntimeException(e);
}
super.stopActiveServices();
}
};
rm.init(conf);
final StateChangeRequestInfo requestInfo = new StateChangeRequestInfo(HAServiceProtocol.RequestSource.REQUEST_BY_USER);
assertEquals(STATE_ERR, HAServiceState.INITIALIZING, rm.adminService.getServiceStatus().getState());
assertFalse("RM is ready to become active before being started", rm.adminService.getServiceStatus().isReadyToBecomeActive());
checkMonitorHealth();
rm.start();
checkMonitorHealth();
checkStandbyRMFunctionality();
// 2. Transition to Active.
rm.adminService.transitionToActive(requestInfo);
// 3. Try Transition to standby
Thread t = new Thread(new Runnable() {
@Override
public void run() {
try {
rm.transitionToStandby(true);
} catch (IOException e) {
e.printStackTrace();
} catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
});
t.start();
rm.getRMContext().getStateStore().updateApplicationState(null);
// wait for thread to finish
t.join();
rm.adminService.transitionToStandby(requestInfo);
checkStandbyRMFunctionality();
rm.stop();
}
Aggregations