use of org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore in project hadoop by apache.
the class TestAMRestart method testRMRestartOrFailoverNotCountedForAMFailures.
// Test regular RM restart/failover. The new RM should not count
// the AM failure towards the max-retry-count and should be able to
// re-launch the AM.
@Test(timeout = 50000)
public void testRMRestartOrFailoverNotCountedForAMFailures() throws Exception {
  YarnConfiguration conf = new YarnConfiguration();
  conf.setClass(YarnConfiguration.RM_SCHEDULER, CapacityScheduler.class,
      ResourceScheduler.class);
  conf.setBoolean(YarnConfiguration.RECOVERY_ENABLED, true);
  conf.setBoolean(YarnConfiguration.RM_WORK_PRESERVING_RECOVERY_ENABLED, false);
  conf.set(YarnConfiguration.RM_STORE, MemoryRMStateStore.class.getName());
  // explicitly set the max-am-retry count to 1.
  conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, 1);
  MemoryRMStateStore memStore = new MemoryRMStateStore();
  memStore.init(conf);
  MockRM rm1 = new MockRM(conf, memStore);
  rm1.start();
  MockNM nm1 = new MockNM("127.0.0.1:1234", 8000, rm1.getResourceTrackerService());
  nm1.registerNode();
  RMApp app1 = rm1.submitApp(200);
  // The AM should be restarted even though max-am-attempts is 1.
  MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
  RMAppAttempt attempt1 = app1.getCurrentAppAttempt();
  // Restart the RM.
  MockRM rm2 = new MockRM(conf, memStore);
  rm2.start();
  ApplicationStateData appState =
      memStore.getState().getApplicationState().get(app1.getApplicationId());
  // Re-register the NM with the new RM.
  nm1.setResourceTrackerService(rm2.getResourceTrackerService());
  NMContainerStatus status = Records.newRecord(NMContainerStatus.class);
  status.setContainerExitStatus(ContainerExitStatus.KILLED_BY_RESOURCEMANAGER);
  status.setContainerId(attempt1.getMasterContainer().getId());
  status.setContainerState(ContainerState.COMPLETE);
  status.setDiagnostics("");
  nm1.registerNode(Collections.singletonList(status), null);
  rm2.waitForState(attempt1.getAppAttemptId(), RMAppAttemptState.FAILED);
  Assert.assertEquals(ContainerExitStatus.KILLED_BY_RESOURCEMANAGER,
      appState.getAttempt(am1.getApplicationAttemptId()).getAMContainerExitStatus());
  // rm2 will automatically start a new app attempt.
  rm2.waitForState(app1.getApplicationId(), RMAppState.ACCEPTED);
  MockAM am2 = rm2.waitForNewAMToLaunchAndRegister(app1.getApplicationId(), 2, nm1);
  MockRM.finishAMAndVerifyAppState(app1, rm2, nm1, am2);
  RMAppAttempt attempt3 =
      rm2.getRMContext().getRMApps().get(app1.getApplicationId()).getCurrentAppAttempt();
  Assert.assertTrue(attempt3.shouldCountTowardsMaxAttemptRetry());
  Assert.assertEquals(ContainerExitStatus.INVALID,
      appState.getAttempt(am2.getApplicationAttemptId()).getAMContainerExitStatus());
  rm1.stop();
  rm2.stop();
}
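The skeleton above recurs in every example on this page: a single MemoryRMStateStore instance is initialized once and then handed to both the old and the new MockRM, so the second RM recovers exactly what the first one persisted. A minimal sketch of just that restart pattern, using only calls already shown above (the variable names are illustrative):

  // Share one in-memory store across two MockRM instances so that state
  // written by the first RM is recovered by the second.
  MemoryRMStateStore store = new MemoryRMStateStore();
  store.init(conf);                        // init the store before any RM uses it
  MockRM oldRM = new MockRM(conf, store);
  oldRM.start();                           // app/attempt state lands in 'store'
  // ... submit apps, launch and register AMs ...
  MockRM newRM = new MockRM(conf, store);  // same store => newRM recovers that state
  newRM.start();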
use of org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore in project hadoop by apache.
the class TestWorkPreservingRMRestart method testContainerCompleteMsgNotLostAfterAMFailedAndRMRestart.
@Test(timeout = 20000)
public void testContainerCompleteMsgNotLostAfterAMFailedAndRMRestart() throws Exception {
  MemoryRMStateStore memStore = new MemoryRMStateStore();
  memStore.init(conf);
  rm1 = new MockRM(conf, memStore);
  rm1.start();
  MockNM nm1 = new MockNM("127.0.0.1:1234", 8192, rm1.getResourceTrackerService());
  nm1.registerNode();
  // Submit an app with keepContainersAcrossApplicationAttempts set to true.
  Resource resource = Records.newRecord(Resource.class);
  resource.setMemorySize(200);
  RMApp app0 = rm1.submitApp(resource, "",
      UserGroupInformation.getCurrentUser().getShortUserName(), null, false, null,
      YarnConfiguration.DEFAULT_RM_AM_MAX_ATTEMPTS, null, null, true, true, false,
      null, 0, null, true, null);
  MockAM am0 = MockRM.launchAndRegisterAM(app0, rm1, nm1);
  am0.allocate("127.0.0.1", 1000, 2, new ArrayList<ContainerId>());
  nm1.nodeHeartbeat(true);
  List<Container> conts = am0.allocate(new ArrayList<ResourceRequest>(),
      new ArrayList<ContainerId>()).getAllocatedContainers();
  while (conts.size() == 0) {
    nm1.nodeHeartbeat(true);
    conts.addAll(am0.allocate(new ArrayList<ResourceRequest>(),
        new ArrayList<ContainerId>()).getAllocatedContainers());
    Thread.sleep(500);
  }
  // The AM failed; relaunch it.
  nm1.nodeHeartbeat(am0.getApplicationAttemptId(), 1, ContainerState.COMPLETE);
  rm1.waitForState(app0.getApplicationId(), RMAppState.ACCEPTED);
  MockAM am1 = MockRM.launchAndRegisterAM(app0, rm1, nm1);
  // RM failover.
  rm2 = new MockRM(conf, memStore);
  rm2.start();
  nm1.setResourceTrackerService(rm2.getResourceTrackerService());
  // A container launched by the first AM has completed.
  NMContainerStatus amContainer = TestRMRestart.createNMContainerStatus(
      am0.getApplicationAttemptId(), 1, ContainerState.RUNNING);
  NMContainerStatus completedContainer = TestRMRestart.createNMContainerStatus(
      am0.getApplicationAttemptId(), 2, ContainerState.COMPLETE);
  NMContainerStatus runningContainer = TestRMRestart.createNMContainerStatus(
      am0.getApplicationAttemptId(), 3, ContainerState.RUNNING);
  nm1.registerNode(Arrays.asList(amContainer, runningContainer, completedContainer), null);
  Thread.sleep(200);
  // Check whether the current AM receives the container-complete message.
  RMApp recoveredApp0 = rm2.getRMContext().getRMApps().get(app0.getApplicationId());
  RMAppAttempt loadedAttempt1 = recoveredApp0.getCurrentAppAttempt();
  assertEquals(1, loadedAttempt1.getJustFinishedContainers().size());
}
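Note that unlike the TestAMRestart example above, these TestWorkPreservingRMRestart methods reuse a conf field prepared in the test class's setup, which is not part of this excerpt. Based on the configuration keys used elsewhere on this page, it presumably enables recovery along these lines (a hedged reconstruction, not the actual setup code):

  // Presumed shape of the shared conf: recovery enabled, work-preserving
  // mode on, backed by the in-memory state store.
  conf.setBoolean(YarnConfiguration.RECOVERY_ENABLED, true);
  conf.setBoolean(YarnConfiguration.RM_WORK_PRESERVING_RECOVERY_ENABLED, true);
  conf.set(YarnConfiguration.RM_STORE, MemoryRMStateStore.class.getName());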
use of org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore in project hadoop by apache.
the class TestWorkPreservingRMRestart method testNewContainersNotAllocatedDuringSchedulerRecovery.
@Test(timeout = 20000)
public void testNewContainersNotAllocatedDuringSchedulerRecovery() throws Exception {
  conf.setLong(YarnConfiguration.RM_WORK_PRESERVING_RECOVERY_SCHEDULING_WAIT_MS, 4000);
  MemoryRMStateStore memStore = new MemoryRMStateStore();
  memStore.init(conf);
  rm1 = new MockRM(conf, memStore);
  rm1.start();
  MockNM nm1 = new MockNM("127.0.0.1:1234", 8192, rm1.getResourceTrackerService());
  nm1.registerNode();
  RMApp app1 = rm1.submitApp(200);
  MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
  // Restart the RM.
  rm2 = new MockRM(conf, memStore);
  rm2.start();
  nm1.setResourceTrackerService(rm2.getResourceTrackerService());
  nm1.registerNode();
  ControlledClock clock = new ControlledClock();
  long startTime = System.currentTimeMillis();
  ((RMContextImpl) rm2.getRMContext()).setSystemClock(clock);
  am1.setAMRMProtocol(rm2.getApplicationMasterService(), rm2.getRMContext());
  am1.registerAppAttempt(true);
  rm2.waitForState(app1.getApplicationId(), RMAppState.RUNNING);
  // The AM requests new containers.
  am1.allocate("127.0.0.1", 1000, 1, new ArrayList<ContainerId>());
  List<Container> containers = new ArrayList<Container>();
  clock.setTime(startTime + 2000);
  nm1.nodeHeartbeat(true);
  // Sleep some time, as allocation happens asynchronously.
  Thread.sleep(3000);
  containers.addAll(am1.allocate(new ArrayList<ResourceRequest>(),
      new ArrayList<ContainerId>()).getAllocatedContainers());
  // No container is allocated during scheduler recovery.
  Assert.assertTrue(containers.isEmpty());
  clock.setTime(startTime + 8000);
  nm1.nodeHeartbeat(true);
  // Containers are allocated after recovery is done.
  while (containers.isEmpty()) {
    containers.addAll(am1.allocate(new ArrayList<ResourceRequest>(),
        new ArrayList<ContainerId>()).getAllocatedContainers());
    Thread.sleep(500);
  }
}
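The heartbeat-and-poll loop at the end of this test also appears in the previous example; factored out, it would read as below (a hypothetical helper assembled only from calls already used in these tests, not an existing MockRM API):

  // Hypothetical helper: heartbeat the NM and poll allocate() until at
  // least 'count' containers have been handed to the AM.
  private List<Container> waitForAllocatedContainers(MockAM am, MockNM nm, int count)
      throws Exception {
    List<Container> allocated = new ArrayList<Container>();
    while (allocated.size() < count) {
      nm.nodeHeartbeat(true);
      allocated.addAll(am.allocate(new ArrayList<ResourceRequest>(),
          new ArrayList<ContainerId>()).getAllocatedContainers());
      Thread.sleep(500);
    }
    return allocated;
  }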
use of org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore in project hadoop by apache.
the class TestWorkPreservingRMRestart method testContainersNotRecoveredForCompletedApps.
// Apps already completed before RM restart. The restarted RM scheduler should
// not recover containers for completed apps.
@Test(timeout = 20000)
public void testContainersNotRecoveredForCompletedApps() throws Exception {
  MemoryRMStateStore memStore = new MemoryRMStateStore();
  memStore.init(conf);
  rm1 = new MockRM(conf, memStore);
  rm1.start();
  MockNM nm1 = new MockNM("127.0.0.1:1234", 8192, rm1.getResourceTrackerService());
  nm1.registerNode();
  RMApp app1 = rm1.submitApp(200);
  MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
  MockRM.finishAMAndVerifyAppState(app1, rm1, nm1, am1);
  rm2 = new MockRM(conf, memStore);
  rm2.start();
  nm1.setResourceTrackerService(rm2.getResourceTrackerService());
  NMContainerStatus runningContainer = TestRMRestart.createNMContainerStatus(
      am1.getApplicationAttemptId(), 2, ContainerState.RUNNING);
  NMContainerStatus completedContainer = TestRMRestart.createNMContainerStatus(
      am1.getApplicationAttemptId(), 3, ContainerState.COMPLETE);
  nm1.registerNode(Arrays.asList(runningContainer, completedContainer), null);
  RMApp recoveredApp1 = rm2.getRMContext().getRMApps().get(app1.getApplicationId());
  assertEquals(RMAppState.FINISHED, recoveredApp1.getState());
  // Wait for the RM to settle down on recovering containers.
  Thread.sleep(3000);
  YarnScheduler scheduler = rm2.getResourceScheduler();
  // The scheduler should not recover containers for finished apps.
  assertNull(scheduler.getRMContainer(runningContainer.getContainerId()));
  assertNull(scheduler.getRMContainer(completedContainer.getContainerId()));
}
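TestRMRestart.createNMContainerStatus, used here and in the previous example, is a convenience wrapper; the first test on this page builds the same kind of report by hand. A hand-rolled equivalent for the running container above might look like the following (the exit status and diagnostics values are assumptions):

  // Hand-rolled equivalent of TestRMRestart.createNMContainerStatus(...),
  // mirroring the Records.newRecord(...) construction in the first example.
  NMContainerStatus report = Records.newRecord(NMContainerStatus.class);
  report.setContainerId(
      ContainerId.newContainerId(am1.getApplicationAttemptId(), 2));
  report.setContainerState(ContainerState.RUNNING);
  report.setContainerExitStatus(0);  // assumed; COMPLETE reports carry a real exit code
  report.setDiagnostics("");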
use of org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore in project hadoop by apache.
the class TestRMDelegationTokens method testRMDTMasterKeyStateOnRollingMasterKey.
// Test the DT master key in the state-store while the master key is being rolled.
@Test(timeout = 15000)
public void testRMDTMasterKeyStateOnRollingMasterKey() throws Exception {
  Configuration conf = new Configuration(testConf);
  conf.set("hadoop.security.authentication", "kerberos");
  UserGroupInformation.setConfiguration(conf);
  MemoryRMStateStore memStore = new MemoryRMStateStore();
  memStore.init(conf);
  RMState rmState = memStore.getState();
  Map<RMDelegationTokenIdentifier, Long> rmDTState =
      rmState.getRMDTSecretManagerState().getTokenState();
  Set<DelegationKey> rmDTMasterKeyState =
      rmState.getRMDTSecretManagerState().getMasterKeyState();
  MockRM rm1 = new MyMockRM(conf, memStore);
  rm1.start();
  // On RM start, two master keys are created.
  // One is created at RMDTSecretMgr.startThreads.updateCurrentKey();
  // the other is created on the first run of
  // tokenRemoverThread.rollMasterKey().
  RMDelegationTokenSecretManager dtSecretManager =
      rm1.getRMContext().getRMDelegationTokenSecretManager();
  // Assert that all master keys are saved.
  Assert.assertEquals(dtSecretManager.getAllMasterKeys(), rmDTMasterKeyState);
  // Request an RMDelegationToken.
  GetDelegationTokenRequest request = mock(GetDelegationTokenRequest.class);
  when(request.getRenewer()).thenReturn("renewer1");
  GetDelegationTokenResponse response =
      rm1.getClientRMService().getDelegationToken(request);
  org.apache.hadoop.yarn.api.records.Token delegationToken =
      response.getRMDelegationToken();
  Token<RMDelegationTokenIdentifier> token1 =
      ConverterUtils.convertFromYarn(delegationToken, (Text) null);
  RMDelegationTokenIdentifier dtId1 = token1.decodeIdentifier();
  // As master keys roll, verify that each current key is saved
  // in the state-store also.
  while (((TestRMDelegationTokenSecretManager) dtSecretManager).numUpdatedKeys.get() < 3) {
    ((TestRMDelegationTokenSecretManager) dtSecretManager)
        .checkCurrentKeyInStateStore(rmDTMasterKeyState);
    Thread.sleep(100);
  }
  // Wait for the token to expire and be removed from the state-store;
  // rollMasterKey is called every 1 second.
  int count = 0;
  while (rmDTState.containsKey(dtId1) && count < 100) {
    Thread.sleep(100);
    count++;
  }
  rm1.stop();
}
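The invariant this test circles around is that the secret manager's in-memory master keys and the keys persisted by the state store stay in lockstep as keys roll. Reduced to its core, the check is (using only the accessors already shown above):

  // Core invariant: every master key held in memory by the secret manager
  // is also persisted in the MemoryRMStateStore.
  Set<DelegationKey> persisted =
      memStore.getState().getRMDTSecretManagerState().getMasterKeyState();
  Assert.assertEquals(dtSecretManager.getAllMasterKeys(), persisted);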