use of org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore in project hadoop by apache.
the class TestRMRestart method testRMRestartWaitForPreviousAMToFinish.
@Test(timeout = 60000)
public void testRMRestartWaitForPreviousAMToFinish() throws Exception {
// testing 3 cases
// After RM restarts
// 1) New application attempt is not started until previous AM container
// finish event is reported back to RM as a part of nm registration.
// 2) If previous AM container finish event is never reported back (i.e.
// node manager on which this AM container was running also went down) in
// that case AMLivenessMonitor should time out previous attempt and start
// new attempt.
// 3) If all the stored attempts had finished then new attempt should
// be started immediately.
YarnConfiguration conf = new YarnConfiguration(this.conf);
conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, 40);
MemoryRMStateStore memStore = new MemoryRMStateStore();
memStore.init(conf);
RMState rmState = memStore.getState();
Map<ApplicationId, ApplicationStateData> rmAppState = rmState.getApplicationState();
// start RM
final MockRM rm1 = createMockRM(conf, memStore);
rm1.start();
AbstractYarnScheduler ys = (AbstractYarnScheduler) rm1.getResourceScheduler();
MockNM nm1 = new MockNM("127.0.0.1:1234", 16382, rm1.getResourceTrackerService());
nm1.registerNode();
// submitting app
RMApp app1 = rm1.submitApp(200);
rm1.waitForState(app1.getApplicationId(), RMAppState.ACCEPTED);
MockAM am1 = launchAM(app1, rm1, nm1);
nm1.nodeHeartbeat(am1.getApplicationAttemptId(), 1, ContainerState.COMPLETE);
// Fail first AM.
rm1.waitForState(am1.getApplicationAttemptId(), RMAppAttemptState.FAILED);
TestSchedulerUtils.waitSchedulerApplicationAttemptStopped(ys, am1.getApplicationAttemptId());
// launch another AM.
MockAM am2 = launchAM(app1, rm1, nm1);
Assert.assertEquals(1, rmAppState.size());
Assert.assertEquals(app1.getState(), RMAppState.RUNNING);
Assert.assertEquals(app1.getAppAttempts().get(app1.getCurrentAppAttempt().getAppAttemptId()).getAppAttemptState(), RMAppAttemptState.RUNNING);
// start new RM.
MockRM rm2 = createMockRM(conf, memStore);
rm2.start();
nm1.setResourceTrackerService(rm2.getResourceTrackerService());
NodeHeartbeatResponse res = nm1.nodeHeartbeat(true);
Assert.assertEquals(NodeAction.RESYNC, res.getNodeAction());
RMApp rmApp = rm2.getRMContext().getRMApps().get(app1.getApplicationId());
// application should be in ACCEPTED state
rm2.waitForState(app1.getApplicationId(), RMAppState.ACCEPTED);
Assert.assertEquals(RMAppState.ACCEPTED, rmApp.getState());
// new attempt should not be started
Assert.assertEquals(2, rmApp.getAppAttempts().size());
// am1 attempt should be in FAILED state where as am2 attempt should be in
// LAUNCHED state
rm2.waitForState(am1.getApplicationAttemptId(), RMAppAttemptState.FAILED);
rm2.waitForState(am2.getApplicationAttemptId(), RMAppAttemptState.LAUNCHED);
Assert.assertEquals(RMAppAttemptState.FAILED, rmApp.getAppAttempts().get(am1.getApplicationAttemptId()).getAppAttemptState());
Assert.assertEquals(RMAppAttemptState.LAUNCHED, rmApp.getAppAttempts().get(am2.getApplicationAttemptId()).getAppAttemptState());
NMContainerStatus status = TestRMRestart.createNMContainerStatus(am2.getApplicationAttemptId(), 1, ContainerState.COMPLETE);
nm1.registerNode(Arrays.asList(status), null);
rm2.waitForState(am2.getApplicationAttemptId(), RMAppAttemptState.FAILED);
ys = (AbstractYarnScheduler) rm2.getResourceScheduler();
TestSchedulerUtils.waitSchedulerApplicationAttemptStopped(ys, am2.getApplicationAttemptId());
launchAM(rmApp, rm2, nm1);
Assert.assertEquals(3, rmApp.getAppAttempts().size());
rm2.waitForState(rmApp.getCurrentAppAttempt().getAppAttemptId(), RMAppAttemptState.RUNNING);
// Now restart RM ...
// Setting AMLivelinessMonitor interval to be 10 Secs.
conf.setInt(YarnConfiguration.RM_AM_EXPIRY_INTERVAL_MS, 10000);
MockRM rm3 = createMockRM(conf, memStore);
rm3.start();
// Wait for RM to process all the events as a part of rm recovery.
nm1.setResourceTrackerService(rm3.getResourceTrackerService());
rmApp = rm3.getRMContext().getRMApps().get(app1.getApplicationId());
// application should be in ACCEPTED state
rm3.waitForState(app1.getApplicationId(), RMAppState.ACCEPTED);
Assert.assertEquals(rmApp.getState(), RMAppState.ACCEPTED);
// new attempt should not be started
Assert.assertEquals(3, rmApp.getAppAttempts().size());
// am1 and am2 attempts should be in FAILED state where as am3 should be
// in LAUNCHED state
rm3.waitForState(am1.getApplicationAttemptId(), RMAppAttemptState.FAILED);
rm3.waitForState(am2.getApplicationAttemptId(), RMAppAttemptState.FAILED);
ApplicationAttemptId latestAppAttemptId = rmApp.getCurrentAppAttempt().getAppAttemptId();
rm3.waitForState(latestAppAttemptId, RMAppAttemptState.LAUNCHED);
Assert.assertEquals(RMAppAttemptState.FAILED, rmApp.getAppAttempts().get(am1.getApplicationAttemptId()).getAppAttemptState());
Assert.assertEquals(RMAppAttemptState.FAILED, rmApp.getAppAttempts().get(am2.getApplicationAttemptId()).getAppAttemptState());
Assert.assertEquals(RMAppAttemptState.LAUNCHED, rmApp.getAppAttempts().get(latestAppAttemptId).getAppAttemptState());
rm3.waitForState(latestAppAttemptId, RMAppAttemptState.FAILED);
rm3.waitForState(rmApp.getApplicationId(), RMAppState.ACCEPTED);
final int maxRetry = 10;
final RMApp rmAppForCheck = rmApp;
GenericTestUtils.waitFor(new Supplier<Boolean>() {
@Override
public Boolean get() {
return new Boolean(rmAppForCheck.getAppAttempts().size() == 4);
}
}, 100, maxRetry);
Assert.assertEquals(RMAppAttemptState.FAILED, rmApp.getAppAttempts().get(latestAppAttemptId).getAppAttemptState());
latestAppAttemptId = rmApp.getCurrentAppAttempt().getAppAttemptId();
// The 4th attempt has started but is not yet saved into RMStateStore
// It will be saved only when we launch AM.
// submitting app but not starting AM for it.
RMApp app2 = rm3.submitApp(200);
rm3.waitForState(app2.getApplicationId(), RMAppState.ACCEPTED);
Assert.assertEquals(1, app2.getAppAttempts().size());
Assert.assertEquals(0, memStore.getState().getApplicationState().get(app2.getApplicationId()).getAttemptCount());
MockRM rm4 = createMockRM(conf, memStore);
rm4.start();
rmApp = rm4.getRMContext().getRMApps().get(app1.getApplicationId());
rm4.waitForState(rmApp.getApplicationId(), RMAppState.ACCEPTED);
// wait for the attempt to be created.
int timeoutSecs = 0;
while (rmApp.getAppAttempts().size() != 2 && timeoutSecs++ < 40) {
Thread.sleep(200);
}
Assert.assertEquals(4, rmApp.getAppAttempts().size());
Assert.assertEquals(RMAppState.ACCEPTED, rmApp.getState());
rm4.waitForState(latestAppAttemptId, RMAppAttemptState.SCHEDULED);
Assert.assertEquals(RMAppAttemptState.SCHEDULED, rmApp.getAppAttempts().get(latestAppAttemptId).getAppAttemptState());
// The initial application for which an AM was not started should be in
// ACCEPTED state with one application attempt started.
app2 = rm4.getRMContext().getRMApps().get(app2.getApplicationId());
rm4.waitForState(app2.getApplicationId(), RMAppState.ACCEPTED);
Assert.assertEquals(RMAppState.ACCEPTED, app2.getState());
Assert.assertEquals(1, app2.getAppAttempts().size());
rm4.waitForState(app2.getCurrentAppAttempt().getAppAttemptId(), RMAppAttemptState.SCHEDULED);
Assert.assertEquals(RMAppAttemptState.SCHEDULED, app2.getCurrentAppAttempt().getAppAttemptState());
}
use of org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore in project hadoop by apache.
the class TestRMRestart method testRMRestartKilledAppWithNoAttempts.
@Test(timeout = 60000)
public void testRMRestartKilledAppWithNoAttempts() throws Exception {
MemoryRMStateStore memStore = new MemoryRMStateStore() {
@Override
public synchronized void storeApplicationAttemptStateInternal(ApplicationAttemptId attemptId, ApplicationAttemptStateData attemptStateData) throws Exception {
// ignore attempt saving request.
}
@Override
public synchronized void updateApplicationAttemptStateInternal(ApplicationAttemptId attemptId, ApplicationAttemptStateData attemptStateData) throws Exception {
// ignore attempt saving request.
}
};
memStore.init(conf);
// start RM
MockRM rm1 = createMockRM(conf, memStore);
rm1.start();
// create app
RMApp app0 = rm1.submitApp(200, "name", "user", new HashMap<ApplicationAccessType, String>(), false, "default", -1, null, "MAPREDUCE", false);
// kill the app.
rm1.killApp(app0.getApplicationId());
rm1.waitForState(app0.getApplicationId(), RMAppState.KILLED);
// restart rm
MockRM rm2 = createMockRM(conf, memStore);
rm2.start();
RMApp loadedApp0 = rm2.getRMContext().getRMApps().get(app0.getApplicationId());
rm2.waitForState(loadedApp0.getApplicationId(), RMAppState.KILLED);
Assert.assertTrue(loadedApp0.getAppAttempts().size() == 0);
}
use of org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore in project hadoop by apache.
the class TestRMRestart method testRMRestartGetApplicationList.
@Test(timeout = 60000)
public void testRMRestartGetApplicationList() throws Exception {
conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, 1);
MemoryRMStateStore memStore = new MemoryRMStateStore();
memStore.init(conf);
// start RM
MockRM rm1 = new MockRM(conf, memStore) {
@Override
protected SystemMetricsPublisher createSystemMetricsPublisher() {
return spy(super.createSystemMetricsPublisher());
}
};
rms.add(rm1);
rm1.start();
MockNM nm1 = new MockNM("127.0.0.1:1234", 15120, rm1.getResourceTrackerService());
nm1.registerNode();
// a succeeded app.
RMApp app0 = rm1.submitApp(200, "name", "user", null, false, "default", 1, null, "myType");
MockAM am0 = launchAM(app0, rm1, nm1);
finishApplicationMaster(app0, rm1, nm1, am0);
// a failed app.
RMApp app1 = rm1.submitApp(200, "name", "user", null, false, "default", 1, null, "myType");
MockAM am1 = launchAM(app1, rm1, nm1);
// fail the AM by sending CONTAINER_FINISHED event without registering.
nm1.nodeHeartbeat(am1.getApplicationAttemptId(), 1, ContainerState.COMPLETE);
rm1.waitForState(am1.getApplicationAttemptId(), RMAppAttemptState.FAILED);
rm1.waitForState(app1.getApplicationId(), RMAppState.FAILED);
// a killed app.
RMApp app2 = rm1.submitApp(200, "name", "user", null, false, "default", 1, null, "myType");
MockAM am2 = launchAM(app2, rm1, nm1);
rm1.killApp(app2.getApplicationId());
rm1.waitForState(app2.getApplicationId(), RMAppState.KILLED);
rm1.waitForState(am2.getApplicationAttemptId(), RMAppAttemptState.KILLED);
verify(rm1.getRMContext().getSystemMetricsPublisher(), Mockito.times(3)).appCreated(any(RMApp.class), anyLong());
// restart rm
MockRM rm2 = new MockRM(conf, memStore) {
@Override
protected RMAppManager createRMAppManager() {
return spy(super.createRMAppManager());
}
@Override
protected SystemMetricsPublisher createSystemMetricsPublisher() {
return spy(super.createSystemMetricsPublisher());
}
};
rms.add(rm2);
rm2.start();
verify(rm2.getRMContext().getSystemMetricsPublisher(), Mockito.times(3)).appCreated(any(RMApp.class), anyLong());
GetApplicationsRequest request1 = GetApplicationsRequest.newInstance(EnumSet.of(YarnApplicationState.FINISHED, YarnApplicationState.KILLED, YarnApplicationState.FAILED));
GetApplicationsResponse response1 = rm2.getClientRMService().getApplications(request1);
List<ApplicationReport> appList1 = response1.getApplicationList();
// assert all applications exist according to application state after RM
// restarts.
boolean forApp0 = false, forApp1 = false, forApp2 = false;
for (ApplicationReport report : appList1) {
if (report.getApplicationId().equals(app0.getApplicationId())) {
Assert.assertEquals(YarnApplicationState.FINISHED, report.getYarnApplicationState());
forApp0 = true;
}
if (report.getApplicationId().equals(app1.getApplicationId())) {
Assert.assertEquals(YarnApplicationState.FAILED, report.getYarnApplicationState());
forApp1 = true;
}
if (report.getApplicationId().equals(app2.getApplicationId())) {
Assert.assertEquals(YarnApplicationState.KILLED, report.getYarnApplicationState());
forApp2 = true;
}
}
Assert.assertTrue(forApp0 && forApp1 && forApp2);
// assert all applications exist according to application type after RM
// restarts.
Set<String> appTypes = new HashSet<String>();
appTypes.add("myType");
GetApplicationsRequest request2 = GetApplicationsRequest.newInstance(appTypes);
GetApplicationsResponse response2 = rm2.getClientRMService().getApplications(request2);
List<ApplicationReport> appList2 = response2.getApplicationList();
Assert.assertTrue(3 == appList2.size());
// check application summary is logged for the completed apps with timeout
// to make sure APP_COMPLETED events are processed, after RM restart.
verify(rm2.getRMAppManager(), timeout(1000).times(3)).logApplicationSummary(isA(ApplicationId.class));
}
use of org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore in project hadoop by apache.
the class TestRMRestart method testRMRestartAfterNodeLabelDisabled.
@Test(timeout = 60000)
public void testRMRestartAfterNodeLabelDisabled() throws Exception {
MemoryRMStateStore memStore = new MemoryRMStateStore();
memStore.init(conf);
conf.setBoolean(YarnConfiguration.NODE_LABELS_ENABLED, true);
MockRM rm1 = new MockRM(TestUtils.getConfigurationWithDefaultQueueLabels(conf), memStore) {
@Override
protected RMNodeLabelsManager createNodeLabelManager() {
RMNodeLabelsManager mgr = new RMNodeLabelsManager();
mgr.init(getConfig());
return mgr;
}
};
rm1.start();
// add node label "x" and set node to label mapping
Set<String> clusterNodeLabels = new HashSet<String>();
clusterNodeLabels.add("x");
RMNodeLabelsManager nodeLabelManager = rm1.getRMContext().getNodeLabelManager();
nodeLabelManager.addToCluserNodeLabelsWithDefaultExclusivity(clusterNodeLabels);
nodeLabelManager.addLabelsToNode(ImmutableMap.of(NodeId.newInstance("h1", 0), toSet("x")));
// label = x
MockNM nm1 = rm1.registerNode("h1:1234", 8000);
// submit an application with specifying am node label expression as "x"
RMApp app1 = rm1.submitApp(200, "someApp", "someUser", null, "a1", "x");
// check am container allocated with correct node label expression
MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
ContainerId amContainerId1 = ContainerId.newContainerId(am1.getApplicationAttemptId(), 1);
Assert.assertEquals("x", rm1.getRMContext().getScheduler().getRMContainer(amContainerId1).getNodeLabelExpression());
finishApplicationMaster(app1, rm1, nm1, am1);
// restart rm with node label disabled
conf.setBoolean(YarnConfiguration.NODE_LABELS_ENABLED, false);
MockRM rm2 = new MockRM(TestUtils.getConfigurationWithDefaultQueueLabels(conf), memStore) {
@Override
protected RMNodeLabelsManager createNodeLabelManager() {
RMNodeLabelsManager mgr = new RMNodeLabelsManager();
mgr.init(getConfig());
return mgr;
}
};
// node label expression.
try {
rm2.start();
Assert.assertTrue("RM start successfully", true);
Assert.assertEquals(1, rm2.getRMContext().getRMApps().size());
rm2.waitForState(app1.getApplicationId(), RMAppState.FAILED);
} catch (Exception e) {
LOG.debug("Exception on start", e);
Assert.fail("RM should start without any issue");
} finally {
rm1.stop();
rm2.stop();
}
}
use of org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore in project hadoop by apache.
the class TestRMRestart method testRMRestartNodeMapping.
@Test(timeout = 20000)
public void testRMRestartNodeMapping() throws Exception {
// Initial FS node label store root dir to a random tmp dir
File nodeLabelFsStoreDir = new File("target", this.getClass().getSimpleName() + "-testRMRestartNodeMapping");
if (nodeLabelFsStoreDir.exists()) {
FileUtils.deleteDirectory(nodeLabelFsStoreDir);
}
nodeLabelFsStoreDir.deleteOnExit();
String nodeLabelFsStoreDirURI = nodeLabelFsStoreDir.toURI().toString();
conf.set(YarnConfiguration.FS_NODE_LABELS_STORE_ROOT_DIR, nodeLabelFsStoreDirURI);
MemoryRMStateStore memStore = new MemoryRMStateStore();
memStore.init(conf);
conf.setBoolean(YarnConfiguration.NODE_LABELS_ENABLED, true);
MockRM rm1 = new MockRM(conf, memStore) {
@Override
protected RMNodeLabelsManager createNodeLabelManager() {
RMNodeLabelsManager mgr = new RMNodeLabelsManager();
mgr.init(getConfig());
return mgr;
}
};
rm1.init(conf);
rm1.start();
RMNodeLabelsManager nodeLabelManager = rm1.getRMContext().getNodeLabelManager();
Set<String> clusterNodeLabels = new HashSet<String>();
clusterNodeLabels.add("x");
clusterNodeLabels.add("y");
nodeLabelManager.addToCluserNodeLabelsWithDefaultExclusivity(clusterNodeLabels);
// Add node Label to Node h1->x
NodeId n1 = NodeId.newInstance("h1", 1234);
NodeId n2 = NodeId.newInstance("h1", 1235);
NodeId nihost = NodeId.newInstance("h1", 0);
nodeLabelManager.replaceLabelsOnNode(ImmutableMap.of(n1, toSet("x")));
nodeLabelManager.replaceLabelsOnNode(ImmutableMap.of(n2, toSet("x")));
nodeLabelManager.replaceLabelsOnNode(ImmutableMap.of(nihost, toSet("y")));
nodeLabelManager.replaceLabelsOnNode(ImmutableMap.of(n1, toSet("x")));
MockRM rm2 = null;
for (int i = 0; i < 2; i++) {
rm2 = new MockRM(conf, memStore) {
@Override
protected RMNodeLabelsManager createNodeLabelManager() {
RMNodeLabelsManager mgr = new RMNodeLabelsManager();
mgr.init(getConfig());
return mgr;
}
};
rm2.init(conf);
rm2.start();
nodeLabelManager = rm2.getRMContext().getNodeLabelManager();
Map<String, Set<NodeId>> labelsToNodes = nodeLabelManager.getLabelsToNodes(toSet("x"));
Assert.assertEquals(1, null == labelsToNodes.get("x") ? 0 : labelsToNodes.get("x").size());
}
rm1.stop();
rm2.stop();
}
Aggregations