Use of org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore in project hadoop by apache.
The class TestWorkPreservingRMRestart, method testCapacitySchedulerQueueRemovedRecovery.
// Test behavior of an app whose queue is removed during recovery. The test
// does the following:
// 1. Add some apps to two queues, and attempt to add an app to a non-existent
//    queue to verify that the new logic is not in effect during normal app
//    submission.
// 2. Remove one of the queues and restart the RM, once with fail-fast set to
//    false and once with fail-fast set to true.
// 3. Verify that the app is killed if fail-fast is false.
// 4. Verify that a QueueException is thrown if fail-fast is true.
@Test(timeout = 30000)
public void testCapacitySchedulerQueueRemovedRecovery() throws Exception {
  if (getSchedulerType() != SchedulerType.CAPACITY) {
    return;
  }
  conf.setBoolean(CapacitySchedulerConfiguration.ENABLE_USER_METRICS, true);
  conf.set(CapacitySchedulerConfiguration.RESOURCE_CALCULATOR_CLASS,
      DominantResourceCalculator.class.getName());
  CapacitySchedulerConfiguration csConf =
      new CapacitySchedulerConfiguration(conf);
  setupQueueConfiguration(csConf);
  MemoryRMStateStore memStore = new MemoryRMStateStore();
  memStore.init(csConf);
  rm1 = new MockRM(csConf, memStore);
  rm1.start();
  MockNM nm1 =
      new MockNM("127.0.0.1:1234", 8192, rm1.getResourceTrackerService());
  MockNM nm2 =
      new MockNM("127.1.1.1:4321", 8192, rm1.getResourceTrackerService());
  nm1.registerNode();
  nm2.registerNode();
  RMApp app1_1 = rm1.submitApp(1024, "app1_1", USER_1, null, A);
  MockAM am1_1 = MockRM.launchAndRegisterAM(app1_1, rm1, nm1);
  RMApp app1_2 = rm1.submitApp(1024, "app1_2", USER_1, null, A);
  MockAM am1_2 = MockRM.launchAndRegisterAM(app1_2, rm1, nm2);
  RMApp app2 = rm1.submitApp(1024, "app2", USER_2, null, B);
  MockAM am2 = MockRM.launchAndRegisterAM(app2, rm1, nm2);
  assertEquals(
      rm1.getApplicationReport(app2.getApplicationId())
          .getYarnApplicationState(),
      YarnApplicationState.RUNNING);
  // Submit an app to a non-existent queue to make sure it does not cause a
  // fatal failure in the non-recovery case.
  RMApp appNA =
      rm1.submitApp(1024, "app1_2", USER_1, null, QUEUE_DOESNT_EXIST, false);
  // Clear queue metrics.
  rm1.clearQueueMetrics(app1_1);
  rm1.clearQueueMetrics(app1_2);
  rm1.clearQueueMetrics(app2);
  // Take a copy of the state store so that it can be reset to this state.
  RMState state = memStore.loadState();
  // Set a new configuration with QueueB removed.
  csConf = new CapacitySchedulerConfiguration(conf);
  setupQueueConfigurationOnlyA(csConf);
  String diags = "Application killed on recovery as it was submitted to "
      + "queue QueueB which no longer exists after restart.";
  verifyAppRecoveryWithWrongQueueConfig(csConf, app2, diags, memStore, state);
}
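The assertions live in the private helper verifyAppRecoveryWithWrongQueueConfig, whose body is not shown here. As a rough, hypothetical sketch of the fail-fast=false half of that check, inferred only from the helper's name and arguments (the exact calls are illustrative, not taken from the real helper):

// Hypothetical sketch, not the actual helper: restart the RM against the same
// in-memory store but with QueueB removed and fail-fast disabled, then expect
// the app that was submitted to QueueB to be recovered in the KILLED state.
csConf.setBoolean(YarnConfiguration.RM_FAIL_FAST, false);
rm2 = new MockRM(csConf, memStore);
rm2.start();
rm2.waitForState(app2.getApplicationId(), RMAppState.KILLED);
RMApp recovered =
    rm2.getRMContext().getRMApps().get(app2.getApplicationId());
assertTrue(recovered.getDiagnostics().toString().contains(diags));

The fail-fast=true pass would instead expect RM startup/recovery to fail with a queue-related exception; the captured RMState is what lets the helper replay recovery twice from the same point.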
Use of org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore in project hadoop by apache.
The class TestWorkPreservingRMRestart, method testAMfailedBetweenRMRestart.
// Test: the RM shuts down and, in the meanwhile, the AM fails. The restarted
// RM scheduler should not recover the containers that belong to the failed AM.
@Test(timeout = 20000)
public void testAMfailedBetweenRMRestart() throws Exception {
  MemoryRMStateStore memStore = new MemoryRMStateStore();
  conf.setLong(
      YarnConfiguration.RM_WORK_PRESERVING_RECOVERY_SCHEDULING_WAIT_MS, 0);
  memStore.init(conf);
  rm1 = new MockRM(conf, memStore);
  rm1.start();
  MockNM nm1 =
      new MockNM("127.0.0.1:1234", 8192, rm1.getResourceTrackerService());
  nm1.registerNode();
  RMApp app1 = rm1.submitApp(200);
  MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
  rm2 = new MockRM(conf, memStore);
  rm2.start();
  nm1.setResourceTrackerService(rm2.getResourceTrackerService());
  NMContainerStatus amContainer = TestRMRestart.createNMContainerStatus(
      am1.getApplicationAttemptId(), 1, ContainerState.COMPLETE);
  NMContainerStatus runningContainer = TestRMRestart.createNMContainerStatus(
      am1.getApplicationAttemptId(), 2, ContainerState.RUNNING);
  NMContainerStatus completedContainer = TestRMRestart.createNMContainerStatus(
      am1.getApplicationAttemptId(), 3, ContainerState.COMPLETE);
  nm1.registerNode(
      Arrays.asList(amContainer, runningContainer, completedContainer), null);
  rm2.waitForState(am1.getApplicationAttemptId(), RMAppAttemptState.FAILED);
  // Wait for the RM to settle down on recovering containers.
  Thread.sleep(3000);
  YarnScheduler scheduler = rm2.getResourceScheduler();
  // The previous AM failed, so the containers just recovered for that attempt
  // should be released again and no longer tracked by the scheduler.
  assertNull(scheduler.getRMContainer(runningContainer.getContainerId()));
  assertNull(scheduler.getRMContainer(completedContainer.getContainerId()));
  rm2.waitForNewAMToLaunchAndRegister(app1.getApplicationId(), 2, nm1);
  MockNM nm2 =
      new MockNM("127.1.1.1:4321", 8192, rm2.getResourceTrackerService());
  NMContainerStatus previousAttemptContainer =
      TestRMRestart.createNMContainerStatus(
          am1.getApplicationAttemptId(), 4, ContainerState.RUNNING);
  nm2.registerNode(Arrays.asList(previousAttemptContainer), null);
  // Wait for the RM to settle down on recovering containers.
  Thread.sleep(3000);
  // Check that containers from the previous failed attempt are not recovered.
  assertNull(
      scheduler.getRMContainer(previousAttemptContainer.getContainerId()));
}
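The fixed Thread.sleep(3000) calls above are the simplest way to wait out recovery, but a polling wait is usually less fragile. A minimal sketch of that alternative (not part of the original test) using GenericTestUtils.waitFor from hadoop-common's test utilities; the Supplier type it expects is the Guava one in older branches and java.util.function.Supplier in newer ones:

// Sketch only: poll until the scheduler has dropped the recovered container,
// instead of sleeping a fixed 3 seconds. Assumes
// org.apache.hadoop.test.GenericTestUtils is on the test classpath.
GenericTestUtils.waitFor(new Supplier<Boolean>() {
  @Override
  public Boolean get() {
    // Done once the scheduler no longer tracks the recovered container.
    return scheduler.getRMContainer(runningContainer.getContainerId()) == null;
  }
}, 100, 3000);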
Use of org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore in project hadoop by apache.
The class TestWorkPreservingRMRestart, method testRecoverSchedulerAppAndAttemptSynchronously.
@Test(timeout = 20000)
public void testRecoverSchedulerAppAndAttemptSynchronously() throws Exception {
  // Start the RM.
  MemoryRMStateStore memStore = new MemoryRMStateStore();
  memStore.init(conf);
  rm1 = new MockRM(conf, memStore);
  rm1.start();
  MockNM nm1 =
      new MockNM("127.0.0.1:1234", 15120, rm1.getResourceTrackerService());
  nm1.registerNode();
  // Create the app and launch the AM.
  RMApp app0 = rm1.submitApp(200);
  MockAM am0 = MockRM.launchAndRegisterAM(app0, rm1, nm1);
  rm2 = new MockRM(conf, memStore);
  rm2.start();
  nm1.setResourceTrackerService(rm2.getResourceTrackerService());
  // The scheduler app/attempt is immediately available after the RM restarts.
  Assert.assertNotNull(rm2.getResourceScheduler()
      .getSchedulerAppInfo(am0.getApplicationAttemptId()));
  // getTransferredContainers should not throw an NPE.
  rm2.getResourceScheduler()
      .getTransferredContainers(am0.getApplicationAttemptId());
  List<NMContainerStatus> containers = createNMContainerStatusForApp(am0);
  nm1.registerNode(containers, null);
  waitForNumContainersToRecover(2, rm2, am0.getApplicationAttemptId());
}
Use of org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore in project hadoop by apache.
The class TestAMLivelinessMonitor, method testResetTimer.
@Test(timeout = 10000)
public void testResetTimer() throws Exception {
  YarnConfiguration conf = new YarnConfiguration();
  UserGroupInformation.setConfiguration(conf);
  conf.set(YarnConfiguration.RECOVERY_ENABLED, "true");
  conf.set(YarnConfiguration.RM_STORE, MemoryRMStateStore.class.getName());
  conf.setBoolean(YarnConfiguration.RM_WORK_PRESERVING_RECOVERY_ENABLED, true);
  conf.setInt(YarnConfiguration.RM_AM_EXPIRY_INTERVAL_MS, 6000);
  final ControlledClock clock = new ControlledClock();
  clock.setTime(0);
  // Advance the clock past the 6000 ms AM expiry interval while the state
  // store is being loaded, to simulate a slow recovery.
  MemoryRMStateStore memStore = new MemoryRMStateStore() {

    @Override
    public synchronized RMState loadState() throws Exception {
      clock.setTime(8000);
      return super.loadState();
    }
  };
  memStore.init(conf);
  final ApplicationAttemptId attemptId = mock(ApplicationAttemptId.class);
  final Dispatcher dispatcher = mock(Dispatcher.class);
  final boolean[] expired = new boolean[] { false };
  final AMLivelinessMonitor monitor =
      new AMLivelinessMonitor(dispatcher, clock) {

        @Override
        protected void expire(ApplicationAttemptId id) {
          Assert.assertEquals(id, attemptId);
          expired[0] = true;
        }
      };
  monitor.register(attemptId);
  MockRM rm = new MockRM(conf, memStore) {

    @Override
    protected AMLivelinessMonitor createAMLivelinessMonitor() {
      return monitor;
    }
  };
  rm.start();
  // Make sure that the monitor has started.
  while (monitor.getServiceState() != Service.STATE.STARTED) {
    Thread.sleep(100);
  }
  // Without the resetTimer() call during recovery, expired[0] would have been
  // set to true, because the clock already jumped past the expiry interval.
  Assert.assertFalse(expired[0]);
  rm.stop();
}
Use of org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore in project hadoop by apache.
The class TestDelegationTokenRenewer, method testRMRestartWithExpiredToken.
// 1. The token expires before the app completes.
// 2. The RM shuts down.
// 3. When the RM recovers the app, token renewal fails because the token has
//    expired. The RM should request a new token and send it to the NM for
//    log aggregation.
@Test
public void testRMRestartWithExpiredToken() throws Exception {
  Configuration yarnConf = new YarnConfiguration();
  yarnConf.setBoolean(YarnConfiguration.RM_PROXY_USER_PRIVILEGES_ENABLED, true);
  yarnConf.set(
      CommonConfigurationKeysPublic.HADOOP_SECURITY_AUTHENTICATION, "kerberos");
  yarnConf.setBoolean(YarnConfiguration.RECOVERY_ENABLED, true);
  yarnConf.set(YarnConfiguration.RM_STORE, MemoryRMStateStore.class.getName());
  UserGroupInformation.setConfiguration(yarnConf);
  // Create token1.
  Text userText1 = new Text("user1");
  DelegationTokenIdentifier dtId1 =
      new DelegationTokenIdentifier(userText1, new Text("renewer1"), userText1);
  final Token<DelegationTokenIdentifier> originalToken = new Token<>(
      dtId1.getBytes(), "password1".getBytes(), dtId1.getKind(),
      new Text("service1"));
  Credentials credentials = new Credentials();
  credentials.addToken(userText1, originalToken);
  MemoryRMStateStore memStore = new MemoryRMStateStore();
  memStore.init(yarnConf);
  MockRM rm1 = new TestSecurityMockRM(yarnConf, memStore);
  rm1.start();
  RMApp app = rm1.submitApp(200, "name", "user",
      new HashMap<ApplicationAccessType, String>(), false, "default", 1,
      credentials);
  // Create token2.
  Text userText2 = new Text("user1");
  DelegationTokenIdentifier dtId2 =
      new DelegationTokenIdentifier(userText1, new Text("renewer2"), userText2);
  final Token<DelegationTokenIdentifier> updatedToken =
      new Token<DelegationTokenIdentifier>(dtId2.getBytes(),
          "password2".getBytes(), dtId2.getKind(), new Text("service2"));
  AtomicBoolean firstRenewInvoked = new AtomicBoolean(false);
  AtomicBoolean secondRenewInvoked = new AtomicBoolean(false);
  MockRM rm2 = new TestSecurityMockRM(yarnConf, memStore) {

    @Override
    protected DelegationTokenRenewer createDelegationTokenRenewer() {
      return new DelegationTokenRenewer() {

        @Override
        protected void renewToken(final DelegationTokenToRenew dttr)
            throws IOException {
          if (dttr.token.equals(updatedToken)) {
            secondRenewInvoked.set(true);
            super.renewToken(dttr);
          } else if (dttr.token.equals(originalToken)) {
            firstRenewInvoked.set(true);
            throw new InvalidToken("Failed to renew");
          } else {
            throw new IOException("Unexpected");
          }
        }

        @Override
        protected Token<?>[] obtainSystemTokensForUser(String user,
            final Credentials credentials) throws IOException {
          credentials.addToken(updatedToken.getService(), updatedToken);
          return new Token<?>[] { updatedToken };
        }
      };
    }
  };
  // Simulate restarting the RM.
  rm2.start();
  // Check that the NM can retrieve the new token.
  final MockNM nm1 =
      new MockNM("127.0.0.1:1234", 15120, rm2.getResourceTrackerService());
  nm1.registerNode();
  NodeHeartbeatResponse response = nm1.nodeHeartbeat(true);
  ByteBuffer tokenBuffer =
      response.getSystemCredentialsForApps().get(app.getApplicationId());
  Assert.assertNotNull(tokenBuffer);
  Credentials appCredentials = new Credentials();
  DataInputByteBuffer buf = new DataInputByteBuffer();
  tokenBuffer.rewind();
  buf.reset(tokenBuffer);
  appCredentials.readTokenStorageStream(buf);
  Assert.assertTrue(firstRenewInvoked.get() && secondRenewInvoked.get());
  Assert.assertTrue(appCredentials.getAllTokens().contains(updatedToken));
}
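The last few lines deserialize the system credentials that the RM ships to the NM in the heartbeat response. A minimal, self-contained sketch of that serialize/deserialize round trip (illustrative only, using standard org.apache.hadoop.io and org.apache.hadoop.security.Credentials APIs; someToken is a placeholder):

// Serialize Credentials to a ByteBuffer the way the RM ships system
// credentials, then read them back the way the test above does.
Credentials creds = new Credentials();
creds.addToken(new Text("service1"), someToken);
DataOutputBuffer dob = new DataOutputBuffer();
creds.writeTokenStorageToStream(dob);
ByteBuffer buffer = ByteBuffer.wrap(dob.getData(), 0, dob.getLength());

Credentials readBack = new Credentials();
DataInputByteBuffer dib = new DataInputByteBuffer();
buffer.rewind();
dib.reset(buffer);
readBack.readTokenStorageStream(dib);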