use of org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore in project hadoop by apache.
The class TestAMRMClientOnRMRestart, method testAMRMClientForUnregisterAMOnRMRestart.
// Test verifies:
// 1. AM tries to unregister without registering
// 2. AM registers with the RM, then tries to unregister immediately after RM restart
@Test(timeout = 60000)
public void testAMRMClientForUnregisterAMOnRMRestart() throws Exception {
  MemoryRMStateStore memStore = new MemoryRMStateStore();
  memStore.init(conf);
  // Phase-1: start the 1st RM
  MyResourceManager rm1 = new MyResourceManager(conf, memStore);
  rm1.start();
  DrainDispatcher dispatcher =
      (DrainDispatcher) rm1.getRMContext().getDispatcher();
  // Submit the application
  RMApp app = rm1.submitApp(1024);
  dispatcher.await();
  MockNM nm1 = new MockNM("h1:1234", 15120, rm1.getResourceTrackerService());
  nm1.registerNode();
  // Node heartbeat
  nm1.nodeHeartbeat(true);
  dispatcher.await();
  ApplicationAttemptId appAttemptId =
      app.getCurrentAppAttempt().getAppAttemptId();
  rm1.sendAMLaunched(appAttemptId);
  dispatcher.await();
  org.apache.hadoop.security.token.Token<AMRMTokenIdentifier> token =
      rm1.getRMContext().getRMApps().get(appAttemptId.getApplicationId())
          .getRMAppAttempt(appAttemptId).getAMRMToken();
  UserGroupInformation ugi = UserGroupInformation.getCurrentUser();
  ugi.addTokenIdentifier(token.decodeIdentifier());
  AMRMClient<ContainerRequest> amClient = new MyAMRMClientImpl(rm1);
  amClient.init(conf);
  amClient.start();
  amClient.registerApplicationMaster("h1", 10000, "");
  amClient.allocate(0.1f);
  // Phase-2: start the 2nd RM
  MyResourceManager rm2 = new MyResourceManager(conf, memStore);
  rm2.start();
  nm1.setResourceTrackerService(rm2.getResourceTrackerService());
  ((MyAMRMClientImpl) amClient).updateRMProxy(rm2);
  dispatcher = (DrainDispatcher) rm2.getRMContext().getDispatcher();
  // The NM should be told to resync on its first heartbeat to the new RM
  NodeHeartbeatResponse hbResponse = nm1.nodeHeartbeat(true);
  Assert.assertEquals(NodeAction.RESYNC, hbResponse.getNodeAction());
  // new MockNM instance to represent the NM re-registering
  nm1 = new MockNM("h1:1234", 10240, rm2.getResourceTrackerService());
  ContainerId containerId = ContainerId.newContainerId(appAttemptId, 1);
  NMContainerStatus containerReport =
      NMContainerStatus.newInstance(containerId, 0, ContainerState.RUNNING,
          Resource.newInstance(1024, 1), "recover container", 0,
          Priority.newInstance(0), 0);
  nm1.registerNode(Arrays.asList(containerReport), null);
  nm1.nodeHeartbeat(true);
  dispatcher.await();
  amClient.unregisterApplicationMaster(FinalApplicationStatus.SUCCEEDED, null, null);
  rm2.waitForState(appAttemptId, RMAppAttemptState.FINISHING);
  nm1.nodeHeartbeat(appAttemptId, 1, ContainerState.COMPLETE);
  rm2.waitForState(appAttemptId, RMAppAttemptState.FINISHED);
  rm2.waitForState(app.getApplicationId(), RMAppState.FINISHED);
  amClient.stop();
  rm1.stop();
  rm2.stop();
}
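Every example on this page relies on the same basic wiring: a single MemoryRMStateStore instance is initialized once and then handed to both ResourceManager instances, so the second RM recovers whatever the first one persisted. Below is a minimal sketch of that pattern, assuming a MockRM-style constructor that accepts the store and that recovery has already been enabled in the configuration (for example via YarnConfiguration.RECOVERY_ENABLED and YarnConfiguration.RM_STORE, which the surrounding test setup presumably sets); it assumes the same test harness and imports as the excerpts above.

// Minimal restart-wiring sketch (assumptions noted in the lead-in above).
YarnConfiguration conf = new YarnConfiguration();
conf.setBoolean(YarnConfiguration.RECOVERY_ENABLED, true);            // assumed to be done in test setup
conf.set(YarnConfiguration.RM_STORE, MemoryRMStateStore.class.getName());

MemoryRMStateStore memStore = new MemoryRMStateStore();
memStore.init(conf);                       // state lives only in memory, within this JVM

MockRM rm1 = new MockRM(conf, memStore);   // first RM writes app/attempt state into memStore
rm1.start();
// ... submit apps, launch AMs, drive them through their lifecycle ...
rm1.stop();

MockRM rm2 = new MockRM(conf, memStore);   // "restarted" RM reads the same in-memory state back
rm2.start();
// rm2.getRMContext().getRMApps() now contains the recovered applications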
use of org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore in project hadoop by apache.
The class TestRMRestart, method testRMRestartRecoveringNodeLabelManager.
// Test does the following verification:
// 1. Start RM1 with the node label store under the test's target directory
// 2. Add/remove/replace cluster and node labels and verify them
// 3. Start RM2 against the same store
// 4. Get the cluster and node labels; they should be present after recovery
@Test(timeout = 20000)
public void testRMRestartRecoveringNodeLabelManager() throws Exception {
  // Point the FS node label store root dir at a fresh directory under target/
  File nodeLabelFsStoreDir = new File("target",
      this.getClass().getSimpleName() + "-testRMRestartRecoveringNodeLabelManager");
  if (nodeLabelFsStoreDir.exists()) {
    FileUtils.deleteDirectory(nodeLabelFsStoreDir);
  }
  nodeLabelFsStoreDir.deleteOnExit();
  String nodeLabelFsStoreDirURI = nodeLabelFsStoreDir.toURI().toString();
  conf.set(YarnConfiguration.FS_NODE_LABELS_STORE_ROOT_DIR, nodeLabelFsStoreDirURI);
  MemoryRMStateStore memStore = new MemoryRMStateStore();
  memStore.init(conf);
  conf.setBoolean(YarnConfiguration.NODE_LABELS_ENABLED, true);
  MockRM rm1 = new MockRM(conf, memStore) {

    @Override
    protected RMNodeLabelsManager createNodeLabelManager() {
      RMNodeLabelsManager mgr = new RMNodeLabelsManager();
      mgr.init(getConfig());
      return mgr;
    }
  };
  rm1.init(conf);
  rm1.start();
  RMNodeLabelsManager nodeLabelManager = rm1.getRMContext().getNodeLabelManager();
  Set<String> clusterNodeLabels = new HashSet<String>();
  clusterNodeLabels.add("x");
  clusterNodeLabels.add("y");
  clusterNodeLabels.add("z");
  // Add cluster node labels x, y, z
  nodeLabelManager.addToCluserNodeLabelsWithDefaultExclusivity(clusterNodeLabels);
  // Add a node label to node: h1 -> x
  NodeId n1 = NodeId.newInstance("h1", 0);
  nodeLabelManager.addLabelsToNode(ImmutableMap.of(n1, toSet("x")));
  clusterNodeLabels.remove("z");
  // Remove cluster label z
  nodeLabelManager.removeFromClusterNodeLabels(toSet("z"));
  // Replace node labels: h1 -> y
  nodeLabelManager.replaceLabelsOnNode(ImmutableMap.of(n1, toSet("y")));
  // Wait for the store to be updated. The node label store update is expected
  // to happen very fast since it has a separate dispatcher, so wait at most
  // 5 seconds, which should be sufficient.
  int count = 10;
  while (count-- > 0) {
    if (nodeLabelManager.getNodeLabels().size() > 0) {
      break;
    }
    Thread.sleep(500);
  }
  Assert.assertEquals(clusterNodeLabels.size(),
      nodeLabelManager.getClusterNodeLabelNames().size());
  Map<NodeId, Set<String>> nodeLabels = nodeLabelManager.getNodeLabels();
  Assert.assertEquals(1, nodeLabelManager.getNodeLabels().size());
  Assert.assertTrue(nodeLabels.get(n1).equals(toSet("y")));
  MockRM rm2 = new MockRM(conf, memStore) {

    @Override
    protected RMNodeLabelsManager createNodeLabelManager() {
      RMNodeLabelsManager mgr = new RMNodeLabelsManager();
      mgr.init(getConfig());
      return mgr;
    }
  };
  rm2.init(conf);
  rm2.start();
  nodeLabelManager = rm2.getRMContext().getNodeLabelManager();
  Assert.assertEquals(clusterNodeLabels.size(),
      nodeLabelManager.getClusterNodeLabelNames().size());
  nodeLabels = nodeLabelManager.getNodeLabels();
  Assert.assertEquals(1, nodeLabelManager.getNodeLabels().size());
  Assert.assertTrue(nodeLabels.get(n1).equals(toSet("y")));
  rm1.stop();
  rm2.stop();
}
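The label assertions above use a small toSet helper defined elsewhere in TestRMRestart and not shown on this page. A plausible, hypothetical reconstruction is given below; the real helper may be generic or built on Guava's Sets instead.

// Hypothetical reconstruction of the toSet(...) helper referenced in the assertions above.
private Set<String> toSet(String... elements) {
  return new HashSet<String>(Arrays.asList(elements));
}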
use of org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore in project hadoop by apache.
The class TestRMRestart, method testRMRestartAppRunningAMFailed.
@Test(timeout = 60000)
public void testRMRestartAppRunningAMFailed() throws Exception {
  conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS,
      YarnConfiguration.DEFAULT_RM_AM_MAX_ATTEMPTS);
  MemoryRMStateStore memStore = new MemoryRMStateStore();
  memStore.init(conf);
  RMState rmState = memStore.getState();
  Map<ApplicationId, ApplicationStateData> rmAppState = rmState.getApplicationState();
  // start RM
  MockRM rm1 = createMockRM(conf, memStore);
  rm1.start();
  MockNM nm1 = new MockNM("127.0.0.1:1234", 15120, rm1.getResourceTrackerService());
  nm1.registerNode();
  // create app and launch the AM
  RMApp app0 = rm1.submitApp(200, "name", "user",
      new HashMap<ApplicationAccessType, String>(), false, "default", -1, null,
      "MAPREDUCE", true, true);
  MockAM am0 = launchAM(app0, rm1, nm1);
  // fail the AM by sending a CONTAINER_FINISHED event without registering.
  nm1.nodeHeartbeat(am0.getApplicationAttemptId(), 1, ContainerState.COMPLETE);
  rm1.waitForState(am0.getApplicationAttemptId(), RMAppAttemptState.FAILED);
  ApplicationStateData appState = rmAppState.get(app0.getApplicationId());
  // assert the AM failed state is saved.
  Assert.assertEquals(RMAppAttemptState.FAILED,
      appState.getAttempt(am0.getApplicationAttemptId()).getState());
  // assert the app state has not been saved.
  Assert.assertNull(rmAppState.get(app0.getApplicationId()).getState());
  // a new AM is started but not registered, so the app stays in the ACCEPTED state.
  rm1.waitForState(app0.getApplicationId(), RMAppState.ACCEPTED);
  // start the new RM
  MockRM rm2 = createMockRM(conf, memStore);
  rm2.start();
  // assert the previous AM state is loaded back on RM recovery.
  rm2.waitForState(am0.getApplicationAttemptId(), RMAppAttemptState.FAILED);
}
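launchAM is another TestRMRestart helper that is not shown on this page. Based on how MockRM, MockNM and MockAM are used in these excerpts, it plausibly looks like the sketch below; this is a hypothetical reconstruction, and the real helper may differ in details such as which state it waits for.

// Hypothetical sketch of the launchAM(...) helper: heartbeat so the scheduler can
// allocate the AM container, tell the RM the AM was launched, register it, and
// wait for the app to be running.
private MockAM launchAM(RMApp app, MockRM rm, MockNM nm) throws Exception {
  RMAppAttempt attempt = app.getCurrentAppAttempt();
  nm.nodeHeartbeat(true);
  MockAM am = rm.sendAMLaunched(attempt.getAppAttemptId());
  am.registerAppAttempt();
  rm.waitForState(app.getApplicationId(), RMAppState.RUNNING);
  return am;
}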
use of org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore in project hadoop by apache.
The class TestRMRestart, method testQueueMetricsOnRMRestart.
@SuppressWarnings("resource")
@Test(timeout = 60000)
public void testQueueMetricsOnRMRestart() throws Exception {
  conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS,
      YarnConfiguration.DEFAULT_RM_AM_MAX_ATTEMPTS);
  MemoryRMStateStore memStore = new MemoryRMStateStore();
  memStore.init(conf);
  // PHASE 1: create state in an RM
  // start RM
  MockRM rm1 = createMockRM(conf, memStore);
  rm1.start();
  MockNM nm1 = new MockNM("127.0.0.1:1234", 15120, rm1.getResourceTrackerService());
  nm1.registerNode();
  QueueMetrics qm1 = rm1.getResourceScheduler().getRootQueueMetrics();
  resetQueueMetrics(qm1);
  assertQueueMetrics(qm1, 0, 0, 0, 0);
  // create an app that gets launched and allocates containers before the RM restart
  RMApp app1 = rm1.submitApp(200);
  // Need to wait first for the AppAttempt to be started (RMAppState.ACCEPTED)
  // and then for it to reach RMAppAttemptState.SCHEDULED,
  // in order to ensure the appsPending metric is incremented
  rm1.waitForState(app1.getApplicationId(), RMAppState.ACCEPTED);
  RMAppAttempt attempt1 = app1.getCurrentAppAttempt();
  ApplicationAttemptId attemptId1 = attempt1.getAppAttemptId();
  rm1.waitForState(attemptId1, RMAppAttemptState.SCHEDULED);
  assertQueueMetrics(qm1, 1, 1, 0, 0);
  nm1.nodeHeartbeat(true);
  rm1.waitForState(attemptId1, RMAppAttemptState.ALLOCATED);
  MockAM am1 = rm1.sendAMLaunched(attempt1.getAppAttemptId());
  am1.registerAppAttempt();
  am1.allocate("127.0.0.1", 1000, 1, new ArrayList<ContainerId>());
  nm1.nodeHeartbeat(true);
  List<Container> conts = am1.allocate(new ArrayList<ResourceRequest>(),
      new ArrayList<ContainerId>()).getAllocatedContainers();
  while (conts.size() == 0) {
    nm1.nodeHeartbeat(true);
    conts.addAll(am1.allocate(new ArrayList<ResourceRequest>(),
        new ArrayList<ContainerId>()).getAllocatedContainers());
    Thread.sleep(500);
  }
  assertQueueMetrics(qm1, 1, 0, 1, 0);
  // PHASE 2: create a new RM and start from the old state
  // create a new RM to represent the restart and recover state
  MockRM rm2 = createMockRM(conf, memStore);
  QueueMetrics qm2 = rm2.getResourceScheduler().getRootQueueMetrics();
  resetQueueMetrics(qm2);
  assertQueueMetrics(qm2, 0, 0, 0, 0);
  rm2.start();
  nm1.setResourceTrackerService(rm2.getResourceTrackerService());
  // recover app
  RMApp loadedApp1 = rm2.getRMContext().getRMApps().get(app1.getApplicationId());
  nm1.nodeHeartbeat(true);
  nm1 = new MockNM("127.0.0.1:1234", 15120, rm2.getResourceTrackerService());
  NMContainerStatus status = TestRMRestart.createNMContainerStatus(
      loadedApp1.getCurrentAppAttempt().getAppAttemptId(), 1, ContainerState.COMPLETE);
  nm1.registerNode(Arrays.asList(status), null);
  while (loadedApp1.getAppAttempts().size() != 2) {
    Thread.sleep(200);
  }
  attempt1 = loadedApp1.getCurrentAppAttempt();
  attemptId1 = attempt1.getAppAttemptId();
  rm2.waitForState(attemptId1, RMAppAttemptState.SCHEDULED);
  assertQueueMetrics(qm2, 1, 1, 0, 0);
  nm1.nodeHeartbeat(true);
  rm2.waitForState(attemptId1, RMAppAttemptState.ALLOCATED);
  assertQueueMetrics(qm2, 1, 0, 1, 0);
  am1 = rm2.sendAMLaunched(attempt1.getAppAttemptId());
  am1.registerAppAttempt();
  am1.allocate("127.0.0.1", 1000, 3, new ArrayList<ContainerId>());
  nm1.nodeHeartbeat(true);
  conts = am1.allocate(new ArrayList<ResourceRequest>(),
      new ArrayList<ContainerId>()).getAllocatedContainers();
  while (conts.size() == 0) {
    nm1.nodeHeartbeat(true);
    conts.addAll(am1.allocate(new ArrayList<ResourceRequest>(),
        new ArrayList<ContainerId>()).getAllocatedContainers());
    Thread.sleep(500);
  }
  // finish the AM
  finishApplicationMaster(loadedApp1, rm2, nm1, am1);
  // now the AppAttempt and App become FINISHED; also wait until the scheduler
  // has processed the APP_ATTEMPT_REMOVED/APP_REMOVED events
  rm2.waitForAppRemovedFromScheduler(loadedApp1.getApplicationId());
  assertQueueMetrics(qm2, 1, 0, 0, 1);
}
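resetQueueMetrics and assertQueueMetrics are TestRMRestart helpers that compare the root queue's metrics against a baseline captured before each phase, so counters carried over from earlier tests or from recovery do not skew the checks. A hypothetical reconstruction of that idea follows; the field and method bodies here are assumptions, not the real implementation, though the QueueMetrics getters are standard.

// Hypothetical reconstruction: capture a baseline, then assert deltas against it.
private int baseSubmitted, basePending, baseRunning, baseCompleted;

private void resetQueueMetrics(QueueMetrics qm) {
  baseSubmitted = qm.getAppsSubmitted();
  basePending = qm.getAppsPending();
  baseRunning = qm.getAppsRunning();
  baseCompleted = qm.getAppsCompleted();
}

private void assertQueueMetrics(QueueMetrics qm, int submitted, int pending,
    int running, int completed) {
  Assert.assertEquals(baseSubmitted + submitted, qm.getAppsSubmitted());
  Assert.assertEquals(basePending + pending, qm.getAppsPending());
  Assert.assertEquals(baseRunning + running, qm.getAppsRunning());
  Assert.assertEquals(baseCompleted + completed, qm.getAppsCompleted());
}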
use of org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore in project hadoop by apache.
The class TestRMRestart, method testClientRetryOnKillingApplication.
// Test that killing an application waits until the app reaches the KILLED
// state, and also check that the attempt state is saved before the app state
// is saved.
@Test(timeout = 60000)
public void testClientRetryOnKillingApplication() throws Exception {
  MemoryRMStateStore memStore = new TestMemoryRMStateStore();
  memStore.init(conf);
  // start RM
  MockRM rm1 = createMockRM(conf, memStore);
  rm1.start();
  MockNM nm1 = new MockNM("127.0.0.1:1234", 15120, rm1.getResourceTrackerService());
  nm1.registerNode();
  RMApp app1 = rm1.submitApp(200, "name", "user", null, false, "default", 1, null, "myType");
  MockAM am1 = launchAM(app1, rm1, nm1);
  KillApplicationResponse response;
  int count = 0;
  while (true) {
    response = rm1.killApp(app1.getApplicationId());
    if (response.getIsKillCompleted()) {
      break;
    }
    Thread.sleep(100);
    count++;
  }
  // we expect at least two killApp calls, since the first killApp always
  // returns isKillCompleted == false.
  Assert.assertTrue(count >= 1);
  rm1.waitForState(am1.getApplicationAttemptId(), RMAppAttemptState.KILLED);
  rm1.waitForState(app1.getApplicationId(), RMAppState.KILLED);
  Assert.assertEquals(1, ((TestMemoryRMStateStore) memStore).updateAttempt);
  Assert.assertEquals(2, ((TestMemoryRMStateStore) memStore).updateApp);
}
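TestMemoryRMStateStore is a test-only subclass of MemoryRMStateStore, defined in TestRMRestart, that records the order in which attempt and application updates reach the store; that ordering is what the final two assertions check (the attempt update is written first, order 1, and the app update second, order 2). Below is a sketch of what it plausibly looks like; the exact override names and signatures are assumptions and depend on the Hadoop version.

// Hypothetical sketch: number store updates so the test can assert that the
// attempt state is persisted before the application state.
public class TestMemoryRMStateStore extends MemoryRMStateStore {

  int count = 0;
  public int updateApp = 0;
  public int updateAttempt = 0;

  @Override
  public void updateApplicationStateInternal(ApplicationId appId,
      ApplicationStateData appStateData) throws Exception {
    updateApp = ++count;
    super.updateApplicationStateInternal(appId, appStateData);
  }

  @Override
  public void updateApplicationAttemptStateInternal(ApplicationAttemptId attemptId,
      ApplicationAttemptStateData attemptStateData) throws Exception {
    updateAttempt = ++count;
    super.updateApplicationAttemptStateInternal(attemptId, attemptStateData);
  }
}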