Use of org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore in the Apache Hadoop project.
From class TestWorkPreservingRMRestart, method testAMContainerStatusWithRMRestart.
/**
 * Checks that the attempt's master container is reported as an AM container
 * by the scheduler both before the RM restart and after the restarted RM has
 * recovered the containers re-reported by the node.
 */
@Test(timeout = 30000)
public void testAMContainerStatusWithRMRestart() throws Exception {
  MemoryRMStateStore stateStore = new MemoryRMStateStore();
  stateStore.init(conf);

  // First RM: one node, one application with a registered AM.
  rm1 = new MockRM(conf, stateStore);
  rm1.start();
  MockNM node =
      new MockNM("127.0.0.1:1234", 8192, rm1.getResourceTrackerService());
  node.registerNode();
  RMApp app = rm1.submitApp(1024);
  MockAM am = MockRM.launchAndRegisterAM(app, rm1, node);

  RMAppAttempt firstAttempt = app.getCurrentAppAttempt();
  YarnScheduler schedulerBeforeRestart = rm1.getResourceScheduler();
  boolean amFlagBeforeRestart = schedulerBeforeRestart
      .getRMContainer(firstAttempt.getMasterContainer().getId())
      .isAMContainer();
  Assert.assertTrue(amFlagBeforeRestart);

  // Second RM shares the state store; the node re-registers against it and
  // reports the application's containers.
  rm2 = new MockRM(conf, stateStore);
  rm2.start();
  node.setResourceTrackerService(rm2.getResourceTrackerService());
  List<NMContainerStatus> reportedContainers =
      createNMContainerStatusForApp(am);
  node.registerNode(reportedContainers, null);

  // Block until the restarted RM has recovered both containers.
  waitForNumContainersToRecover(2, rm2, am.getApplicationAttemptId());

  YarnScheduler schedulerAfterRestart = rm2.getResourceScheduler();
  boolean amFlagAfterRestart = schedulerAfterRestart
      .getRMContainer(firstAttempt.getMasterContainer().getId())
      .isAMContainer();
  Assert.assertTrue(amFlagAfterRestart);
}
Use of org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore in the Apache Hadoop project.
From class TestWorkPreservingRMRestart, method testAppStateSavedButAttemptStateNotSaved.
/**
 * Test that the RM starts correctly when the application state was saved
 * but the attempt state was not.
 */
@Test(timeout = 20000)
public void testAppStateSavedButAttemptStateNotSaved() throws Exception {
  // State store whose attempt-state update is a no-op, simulating the
  // failure where the attempt's final state never gets saved.
  MemoryRMStateStore stateStore = new MemoryRMStateStore() {
    @Override
    public synchronized void updateApplicationAttemptStateInternal(
        ApplicationAttemptId appAttemptId,
        ApplicationAttemptStateData attemptState) {
      // Intentionally empty.
    }
  };
  stateStore.init(conf);

  rm1 = new MockRM(conf, stateStore);
  rm1.start();
  MockNM node =
      new MockNM("127.0.0.1:1234", 15120, rm1.getResourceTrackerService());
  node.registerNode();
  RMApp submittedApp = rm1.submitApp(200);
  MockAM am = MockRM.launchAndRegisterAM(submittedApp, rm1, node);
  MockRM.finishAMAndVerifyAppState(submittedApp, rm1, node, am);

  ApplicationStateData storedApp = stateStore.getState()
      .getApplicationState()
      .get(submittedApp.getApplicationId());
  // The application's final state reached the store ...
  assertEquals(RMAppState.FINISHED, storedApp.getState());
  // ... but the attempt's state did not.
  ApplicationAttemptId attemptId = am.getApplicationAttemptId();
  assertNull(storedApp.getAttempt(attemptId).getState());

  // Restart on the same store and look at what was recovered.
  rm2 = new MockRM(conf, stateStore);
  rm2.start();
  RMApp recoveredApp =
      rm2.getRMContext().getRMApps().get(submittedApp.getApplicationId());
  assertEquals(RMAppState.FINISHED, recoveredApp.getState());
  // The attempt must still come back as FINISHED.
  assertEquals(RMAppAttemptState.FINISHED,
      recoveredApp.getCurrentAppAttempt().getState());
}
Use of org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore in the Apache Hadoop project.
From class TestWorkPreservingRMRestart, method testReleasedContainerNotRecovered.
/**
 * Test that a container is not recovered when the recovering RM receives
 * the AM's release request for it before it receives the container status
 * reported by the NM.
 */
@Test(timeout = 50000)
public void testReleasedContainerNotRecovered() throws Exception {
  MemoryRMStateStore stateStore = new MemoryRMStateStore();
  stateStore.init(conf);
  rm1 = new MockRM(conf, stateStore);
  MockNM node =
      new MockNM("h1:1234", 15120, rm1.getResourceTrackerService());
  // NOTE(review): the node registers before rm1.start() here, unlike the
  // sibling tests -- confirm this ordering is intentional.
  node.registerNode();
  rm1.start();
  RMApp app = rm1.submitApp(1024);
  final MockAM am1 = MockRM.launchAndRegisterAM(app, rm1, node);

  // Restart the RM on the same state store and re-attach the node and AM.
  conf.setInt(YarnConfiguration.RM_NM_EXPIRY_INTERVAL_MS, 8000);
  rm2 = new MockRM(conf, stateStore);
  rm2.start();
  node.setResourceTrackerService(rm2.getResourceTrackerService());
  rm2.waitForState(app.getApplicationId(), RMAppState.ACCEPTED);
  am1.setAMRMProtocol(rm2.getApplicationMasterService(), rm2.getRMContext());
  am1.registerAppAttempt(true);

  // Release container #2 before the NM has reported it for recovery.
  final ContainerId runningContainer =
      ContainerId.newContainerId(am1.getApplicationAttemptId(), 2);
  am1.allocate(null, Arrays.asList(runningContainer));

  // Now let the NM report its container statuses.
  List<NMContainerStatus> reportedStatuses =
      createNMContainerStatusForApp(am1);
  node.registerNode(reportedStatuses, null);

  // Only the AM container should end up recovered.
  waitForNumContainersToRecover(1, rm2, am1.getApplicationAttemptId());
  final AbstractYarnScheduler recoveredScheduler =
      (AbstractYarnScheduler) rm2.getResourceScheduler();

  // The AM must be told that the released container completed.
  AllocateResponse heartbeat = am1.allocate(null, null);
  boolean sawReleasedContainer = false;
  for (ContainerStatus completed : heartbeat.getCompletedContainersStatuses()) {
    if (completed.getContainerId().equals(runningContainer)) {
      sawReleasedContainer = true;
    }
  }
  assertTrue(sawReleasedContainer);

  // Poll until the cached release request is gone and the scheduler holds
  // no RMContainer for the released id.
  GenericTestUtils.waitFor(new Supplier<Boolean>() {
    @Override
    public Boolean get() {
      if (!recoveredScheduler
          .getApplicationAttempt(am1.getApplicationAttemptId())
          .getPendingRelease()
          .isEmpty()) {
        return false;
      }
      return recoveredScheduler.getRMContainer(runningContainer) == null;
    }
  }, 1000, 20000);
}
Use of org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore in the Apache Hadoop project.
From class TestWorkPreservingRMRestart, method testCapacitySchedulerRecovery.
/**
 * Test CapacityScheduler recovery with multi-level queues and multiple users.
 * <ol>
 *   <li>Set up 2 NMs, each with 8GB memory.</li>
 *   <li>Set up 2-level queues: Default -> (QueueA, QueueB).</li>
 *   <li>User1 submits 2 apps to QueueA.</li>
 *   <li>User2 submits 1 app to QueueB.</li>
 *   <li>The AM and each container use 1GB memory.</li>
 *   <li>Mark QueueB as STOPPED in the configuration and restart the RM.</li>
 *   <li>nm1 re-syncs the containers belonging to user1.</li>
 *   <li>nm2 re-syncs the containers belonging to user2.</li>
 *   <li>Assert the state and metrics of the parent queue and both leaf
 *       queues.</li>
 *   <li>Assert each user's consumption inside the queue.</li>
 * </ol>
 */
@Test(timeout = 30000)
public void testCapacitySchedulerRecovery() throws Exception {
  // Nothing to verify unless the scheduler under test is the CapacityScheduler.
  if (getSchedulerType() != SchedulerType.CAPACITY) {
    return;
  }

  conf.setBoolean(CapacitySchedulerConfiguration.ENABLE_USER_METRICS, true);
  conf.set(CapacitySchedulerConfiguration.RESOURCE_CALCULATOR_CLASS,
      DominantResourceCalculator.class.getName());
  CapacitySchedulerConfiguration schedulerConf =
      new CapacitySchedulerConfiguration(conf);
  setupQueueConfiguration(schedulerConf);

  MemoryRMStateStore stateStore = new MemoryRMStateStore();
  stateStore.init(schedulerConf);
  rm1 = new MockRM(schedulerConf, stateStore);
  rm1.start();

  MockNM nm1 =
      new MockNM("127.0.0.1:1234", 8192, rm1.getResourceTrackerService());
  MockNM nm2 =
      new MockNM("127.1.1.1:4321", 8192, rm1.getResourceTrackerService());
  nm1.registerNode();
  nm2.registerNode();

  // Two apps for USER_1 on queue A, one app for USER_2 on queue B.
  RMApp firstAppOnA = rm1.submitApp(1024, "app1_1", USER_1, null, A);
  MockAM firstAmOnA = MockRM.launchAndRegisterAM(firstAppOnA, rm1, nm1);
  RMApp secondAppOnA = rm1.submitApp(1024, "app1_2", USER_1, null, A);
  MockAM secondAmOnA = MockRM.launchAndRegisterAM(secondAppOnA, rm1, nm2);
  RMApp appOnB = rm1.submitApp(1024, "app2", USER_2, null, B);
  MockAM amOnB = MockRM.launchAndRegisterAM(appOnB, rm1, nm2);

  // Reset the queue metrics before the restart.
  rm1.clearQueueMetrics(firstAppOnA);
  rm1.clearQueueMetrics(secondAppOnA);
  rm1.clearQueueMetrics(appOnB);

  // The restarted RM will see QueueB configured as STOPPED.
  csConfSetQueueBStopped:
  {
    schedulerConf.set(PREFIX + "root.Default.QueueB.state", "STOPPED");
  }

  // Restart the RM and point both nodes at it.
  rm2 = new MockRM(schedulerConf, stateStore);
  rm2.start();
  nm1.setResourceTrackerService(rm2.getResourceTrackerService());
  nm2.setResourceTrackerService(rm2.getResourceTrackerService());

  // nm1 reports the containers of both queue-A apps; nm2 reports queue B's.
  List<NMContainerStatus> queueAStatuses =
      createNMContainerStatusForApp(firstAmOnA);
  List<NMContainerStatus> secondAppStatuses =
      createNMContainerStatusForApp(secondAmOnA);
  queueAStatuses.addAll(secondAppStatuses);
  nm1.registerNode(queueAStatuses, null);
  List<NMContainerStatus> queueBStatuses =
      createNMContainerStatusForApp(amOnB);
  nm2.registerNode(queueBStatuses, null);

  // Wait until every attempt has its two containers recovered.
  waitForNumContainersToRecover(2, rm2, firstAmOnA.getApplicationAttemptId());
  waitForNumContainersToRecover(2, rm2, secondAmOnA.getApplicationAttemptId());
  waitForNumContainersToRecover(2, rm2, amOnB.getApplicationAttemptId());

  // Expected capacity / usage / availability per queue and in total.
  Resource perContainer = Resource.newInstance(1024, 1);
  Resource perNode = Resource.newInstance(nm1.getMemory(), nm1.getvCores());
  Resource cluster = Resources.multiply(perNode, 2);
  Resource queueACapacity = Resources.multiply(cluster, 0.5);
  Resource queueBCapacity = Resources.multiply(cluster, 0.5);
  Resource queueAUsed = Resources.multiply(perContainer, 4);
  Resource queueBUsed = Resources.multiply(perContainer, 2);
  Resource totalUsed = Resources.add(queueAUsed, queueBUsed);
  Resource queueAAvailable = Resources.subtract(queueACapacity, queueAUsed);
  Resource queueBAvailable = Resources.subtract(queueBCapacity, queueBUsed);
  Resource totalAvailable = Resources.add(queueAAvailable, queueBAvailable);

  AbstractYarnScheduler recoveredScheduler =
      (AbstractYarnScheduler) rm2.getResourceScheduler();
  Map<ApplicationId, SchedulerApplication> recoveredApps =
      recoveredScheduler.getSchedulerApplications();

  // Queue A: state and metrics.
  SchedulerApplication recoveredAppOnA =
      recoveredApps.get(firstAppOnA.getApplicationId());
  checkCSLeafQueue(rm2, recoveredAppOnA, cluster, queueACapacity,
      queueAUsed, 4);
  QueueMetrics queueAMetrics = recoveredAppOnA.getQueue().getMetrics();
  assertMetrics(queueAMetrics, 2, 0, 2, 0, 4,
      queueAAvailable.getMemorySize(), queueAAvailable.getVirtualCores(),
      queueAUsed.getMemorySize(), queueAUsed.getVirtualCores());

  // Queue B: state and metrics.
  SchedulerApplication recoveredAppOnB =
      recoveredApps.get(appOnB.getApplicationId());
  checkCSLeafQueue(rm2, recoveredAppOnB, cluster, queueBCapacity,
      queueBUsed, 2);
  QueueMetrics queueBMetrics = recoveredAppOnB.getQueue().getMetrics();
  assertMetrics(queueBMetrics, 1, 0, 1, 0, 2,
      queueBAvailable.getMemorySize(), queueBAvailable.getVirtualCores(),
      queueBUsed.getMemorySize(), queueBUsed.getVirtualCores());

  // Parent queue (reached through queue B's leaf): state and metrics.
  LeafQueue leafOfB = (LeafQueue) recoveredAppOnB.getQueue();
  ParentQueue parent = (ParentQueue) leafOfB.getParent();
  float expectedUsedFraction = (float) 6 / 16;
  checkParentQueue(parent, 6, totalUsed, expectedUsedFraction,
      expectedUsedFraction);
  assertMetrics(parent.getMetrics(), 3, 0, 3, 0, 6,
      totalAvailable.getMemorySize(), totalAvailable.getVirtualCores(),
      totalUsed.getMemorySize(), totalUsed.getVirtualCores());
}
Use of org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore in the Apache Hadoop project.
From class TestRMRestart, method testAppFailedOnSubmissionSavedInStateStore.
/**
 * Test that an application which fails on submission is saved in the state
 * store as FAILED, and that a restarted RM sharing that store also reports
 * the application as FAILED.
 */
@Test(timeout = 20000)
public void testAppFailedOnSubmissionSavedInStateStore() throws Exception {
  conf.set(CommonConfigurationKeysPublic.HADOOP_SECURITY_AUTHENTICATION, "kerberos");
  UserGroupInformation.setConfiguration(conf);
  MemoryRMStateStore memStore = new MemoryRMStateStore();
  memStore.init(conf);

  // RM whose delegation token renewer throws for every application it is
  // asked to add; this is the injected submission failure.
  MockRM rm1 = new TestSecurityMockRM(conf, memStore) {
    class TestDelegationTokenRenewer extends DelegationTokenRenewer {
      // @Override makes the compiler reject this test if the parent's
      // signature changes, instead of silently disabling the injected fault.
      @Override
      public void addApplicationAsync(ApplicationId applicationId, Credentials ts, boolean shouldCancelAtEnd, String user, Configuration appConf) {
        throw new RuntimeException("failed to submit app");
      }
    }

    @Override
    protected DelegationTokenRenewer createDelegationTokenRenewer() {
      return new TestDelegationTokenRenewer();
    }
  };
  rm1.start();

  try {
    rm1.submitApp(200, "name", "user", new HashMap<ApplicationAccessType, String>(), false, "default", -1, null, "MAPREDUCE", false);
    Assert.fail("submitApp was expected to throw");
  } catch (Exception expected) {
    // Expected: submission must not succeed (presumably because of the
    // renewer above). Assert.fail raises an AssertionError, which is not an
    // Exception, so a missing failure is still reported.
  }

  // The failed application is looked up from the RM context, since
  // submitApp never returned it.
  Assert.assertFalse("RM context holds no application after the failed submission",
      rm1.getRMContext().getRMApps().isEmpty());
  RMApp app1 = rm1.getRMContext().getRMApps().values().iterator().next();
  rm1.waitForState(app1.getApplicationId(), RMAppState.FAILED);

  // Check app state is saved in state store.
  Assert.assertEquals(RMAppState.FAILED, memStore.getState().getApplicationState().get(app1.getApplicationId()).getState());

  MockRM rm2 = new TestSecurityMockRM(conf, memStore);
  rm2.start();
  // Restarted RM has the failed app info too.
  rm2.waitForState(app1.getApplicationId(), RMAppState.FAILED);
}
Aggregations