use of org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt in project hadoop by apache.
the class RMStateStoreTestBase method testRemoveAttempt.
public void testRemoveAttempt(RMStateStoreHelper stateStoreHelper) throws Exception {
RMStateStore store = stateStoreHelper.getRMStateStore();
TestDispatcher dispatcher = new TestDispatcher();
store.setRMDispatcher(dispatcher);
ApplicationId appId = ApplicationId.newInstance(1383183339, 6);
storeApp(store, appId, 123456, 564321);
ApplicationAttemptId attemptId1 = ApplicationAttemptId.newInstance(appId, 1);
RMAppAttempt attempt1 = storeAttempt(store, attemptId1, ContainerId.newContainerId(attemptId1, 1).toString(), null, null, dispatcher);
ApplicationAttemptId attemptId2 = ApplicationAttemptId.newInstance(appId, 2);
RMAppAttempt attempt2 = storeAttempt(store, attemptId2, ContainerId.newContainerId(attemptId2, 1).toString(), null, null, dispatcher);
store.removeApplicationAttemptInternal(attemptId1);
Assert.assertFalse(stateStoreHelper.attemptExists(attempt1));
Assert.assertTrue(stateStoreHelper.attemptExists(attempt2));
// let things settle down
Thread.sleep(1000);
store.close();
// load state
store = stateStoreHelper.getRMStateStore();
RMState state = store.loadState();
Map<ApplicationId, ApplicationStateData> rmAppState = state.getApplicationState();
ApplicationStateData appState = rmAppState.get(appId);
// app is loaded
assertNotNull(appState);
assertEquals(2, appState.getFirstAttemptId());
assertNull(appState.getAttempt(attemptId1));
assertNotNull(appState.getAttempt(attemptId2));
}
use of org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt in project hadoop by apache.
the class TestAMRestart method testRMRestartOrFailoverNotCountedForAMFailures.
// Test regular RM restart/failover, new RM should not count
// AM failure towards the max-retry-account and should be able to
// re-launch the AM.
@Test(timeout = 50000)
public void testRMRestartOrFailoverNotCountedForAMFailures() throws Exception {
YarnConfiguration conf = new YarnConfiguration();
conf.setClass(YarnConfiguration.RM_SCHEDULER, CapacityScheduler.class, ResourceScheduler.class);
conf.setBoolean(YarnConfiguration.RECOVERY_ENABLED, true);
conf.setBoolean(YarnConfiguration.RM_WORK_PRESERVING_RECOVERY_ENABLED, false);
conf.set(YarnConfiguration.RM_STORE, MemoryRMStateStore.class.getName());
// explicitly set max-am-retry count as 1.
conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, 1);
MemoryRMStateStore memStore = new MemoryRMStateStore();
memStore.init(conf);
MockRM rm1 = new MockRM(conf, memStore);
rm1.start();
MockNM nm1 = new MockNM("127.0.0.1:1234", 8000, rm1.getResourceTrackerService());
nm1.registerNode();
RMApp app1 = rm1.submitApp(200);
// AM should be restarted even though max-am-attempt is 1.
MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
RMAppAttempt attempt1 = app1.getCurrentAppAttempt();
// Restart rm.
MockRM rm2 = new MockRM(conf, memStore);
rm2.start();
ApplicationStateData appState = memStore.getState().getApplicationState().get(app1.getApplicationId());
// re-register the NM
nm1.setResourceTrackerService(rm2.getResourceTrackerService());
NMContainerStatus status = Records.newRecord(NMContainerStatus.class);
status.setContainerExitStatus(ContainerExitStatus.KILLED_BY_RESOURCEMANAGER);
status.setContainerId(attempt1.getMasterContainer().getId());
status.setContainerState(ContainerState.COMPLETE);
status.setDiagnostics("");
nm1.registerNode(Collections.singletonList(status), null);
rm2.waitForState(attempt1.getAppAttemptId(), RMAppAttemptState.FAILED);
Assert.assertEquals(ContainerExitStatus.KILLED_BY_RESOURCEMANAGER, appState.getAttempt(am1.getApplicationAttemptId()).getAMContainerExitStatus());
// Will automatically start a new AppAttempt in rm2
rm2.waitForState(app1.getApplicationId(), RMAppState.ACCEPTED);
MockAM am2 = rm2.waitForNewAMToLaunchAndRegister(app1.getApplicationId(), 2, nm1);
MockRM.finishAMAndVerifyAppState(app1, rm2, nm1, am2);
RMAppAttempt attempt3 = rm2.getRMContext().getRMApps().get(app1.getApplicationId()).getCurrentAppAttempt();
Assert.assertTrue(attempt3.shouldCountTowardsMaxAttemptRetry());
Assert.assertEquals(ContainerExitStatus.INVALID, appState.getAttempt(am2.getApplicationAttemptId()).getAMContainerExitStatus());
rm1.stop();
rm2.stop();
}
use of org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt in project hadoop by apache.
the class TestWorkPreservingRMRestart method testContainerCompleteMsgNotLostAfterAMFailedAndRMRestart.
@Test(timeout = 20000)
public void testContainerCompleteMsgNotLostAfterAMFailedAndRMRestart() throws Exception {
MemoryRMStateStore memStore = new MemoryRMStateStore();
memStore.init(conf);
rm1 = new MockRM(conf, memStore);
rm1.start();
MockNM nm1 = new MockNM("127.0.0.1:1234", 8192, rm1.getResourceTrackerService());
nm1.registerNode();
// submit app with keepContainersAcrossApplicationAttempts true
Resource resource = Records.newRecord(Resource.class);
resource.setMemorySize(200);
RMApp app0 = rm1.submitApp(resource, "", UserGroupInformation.getCurrentUser().getShortUserName(), null, false, null, YarnConfiguration.DEFAULT_RM_AM_MAX_ATTEMPTS, null, null, true, true, false, null, 0, null, true, null);
MockAM am0 = MockRM.launchAndRegisterAM(app0, rm1, nm1);
am0.allocate("127.0.0.1", 1000, 2, new ArrayList<ContainerId>());
nm1.nodeHeartbeat(true);
List<Container> conts = am0.allocate(new ArrayList<ResourceRequest>(), new ArrayList<ContainerId>()).getAllocatedContainers();
while (conts.size() == 0) {
nm1.nodeHeartbeat(true);
conts.addAll(am0.allocate(new ArrayList<ResourceRequest>(), new ArrayList<ContainerId>()).getAllocatedContainers());
Thread.sleep(500);
}
// am failed,and relaunch it
nm1.nodeHeartbeat(am0.getApplicationAttemptId(), 1, ContainerState.COMPLETE);
rm1.waitForState(app0.getApplicationId(), RMAppState.ACCEPTED);
MockAM am1 = MockRM.launchAndRegisterAM(app0, rm1, nm1);
// rm failover
rm2 = new MockRM(conf, memStore);
rm2.start();
nm1.setResourceTrackerService(rm2.getResourceTrackerService());
// container launched by first am completed
NMContainerStatus amContainer = TestRMRestart.createNMContainerStatus(am0.getApplicationAttemptId(), 1, ContainerState.RUNNING);
NMContainerStatus completedContainer = TestRMRestart.createNMContainerStatus(am0.getApplicationAttemptId(), 2, ContainerState.COMPLETE);
NMContainerStatus runningContainer = TestRMRestart.createNMContainerStatus(am0.getApplicationAttemptId(), 3, ContainerState.RUNNING);
nm1.registerNode(Arrays.asList(amContainer, runningContainer, completedContainer), null);
Thread.sleep(200);
// check whether current am could get containerCompleteMsg
RMApp recoveredApp0 = rm2.getRMContext().getRMApps().get(app0.getApplicationId());
RMAppAttempt loadedAttempt1 = recoveredApp0.getCurrentAppAttempt();
assertEquals(1, loadedAttempt1.getJustFinishedContainers().size());
}
use of org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt in project hadoop by apache.
the class TestRMApplicationHistoryWriter method testRMWritingMassiveHistory.
private void testRMWritingMassiveHistory(MockRM rm) throws Exception {
rm.start();
MockNM nm = rm.registerNode("127.0.0.1:1234", 1024 * 10100);
RMApp app = rm.submitApp(1024);
//Wait to make sure the attempt has the right state
//TODO explore a better way than sleeping for a while (YARN-4929)
Thread.sleep(1000);
nm.nodeHeartbeat(true);
RMAppAttempt attempt = app.getCurrentAppAttempt();
MockAM am = rm.sendAMLaunched(attempt.getAppAttemptId());
am.registerAppAttempt();
int request = 10000;
am.allocate("127.0.0.1", 1024, request, new ArrayList<ContainerId>());
nm.nodeHeartbeat(true);
List<Container> allocated = am.allocate(new ArrayList<ResourceRequest>(), new ArrayList<ContainerId>()).getAllocatedContainers();
int waitCount = 0;
int allocatedSize = allocated.size();
while (allocatedSize < request && waitCount++ < 200) {
Thread.sleep(300);
allocated = am.allocate(new ArrayList<ResourceRequest>(), new ArrayList<ContainerId>()).getAllocatedContainers();
allocatedSize += allocated.size();
nm.nodeHeartbeat(true);
}
Assert.assertEquals(request, allocatedSize);
am.unregisterAppAttempt();
rm.waitForState(am.getApplicationAttemptId(), RMAppAttemptState.FINISHING);
nm.nodeHeartbeat(am.getApplicationAttemptId(), 1, ContainerState.COMPLETE);
rm.waitForState(am.getApplicationAttemptId(), RMAppAttemptState.FINISHED);
NodeHeartbeatResponse resp = nm.nodeHeartbeat(true);
List<ContainerId> cleaned = resp.getContainersToCleanup();
int cleanedSize = cleaned.size();
waitCount = 0;
while (cleanedSize < allocatedSize && waitCount++ < 200) {
Thread.sleep(300);
resp = nm.nodeHeartbeat(true);
cleaned = resp.getContainersToCleanup();
cleanedSize += cleaned.size();
}
Assert.assertEquals(allocatedSize, cleanedSize);
rm.waitForState(app.getApplicationId(), RMAppState.FINISHED);
rm.stop();
}
use of org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt in project hadoop by apache.
the class TestRMApplicationHistoryWriter method testWriteApplicationAttempt.
@Test
public void testWriteApplicationAttempt() throws Exception {
RMAppAttempt appAttempt = createRMAppAttempt(ApplicationAttemptId.newInstance(ApplicationId.newInstance(0, 1), 1));
writer.applicationAttemptStarted(appAttempt);
ApplicationAttemptHistoryData appAttemptHD = null;
for (int i = 0; i < MAX_RETRIES; ++i) {
appAttemptHD = store.getApplicationAttempt(ApplicationAttemptId.newInstance(ApplicationId.newInstance(0, 1), 1));
if (appAttemptHD != null) {
break;
} else {
Thread.sleep(100);
}
}
Assert.assertNotNull(appAttemptHD);
Assert.assertEquals("test host", appAttemptHD.getHost());
Assert.assertEquals(-100, appAttemptHD.getRPCPort());
Assert.assertEquals(ContainerId.newContainerId(ApplicationAttemptId.newInstance(ApplicationId.newInstance(0, 1), 1), 1), appAttemptHD.getMasterContainerId());
writer.applicationAttemptFinished(appAttempt, RMAppAttemptState.FINISHED);
for (int i = 0; i < MAX_RETRIES; ++i) {
appAttemptHD = store.getApplicationAttempt(ApplicationAttemptId.newInstance(ApplicationId.newInstance(0, 1), 1));
if (appAttemptHD.getYarnApplicationAttemptState() != null) {
break;
} else {
Thread.sleep(100);
}
}
Assert.assertEquals("test diagnostics info", appAttemptHD.getDiagnosticsInfo());
Assert.assertEquals("test url", appAttemptHD.getTrackingURL());
Assert.assertEquals(FinalApplicationStatus.UNDEFINED, appAttemptHD.getFinalApplicationStatus());
Assert.assertEquals(YarnApplicationAttemptState.FINISHED, appAttemptHD.getYarnApplicationAttemptState());
}
Aggregations