use of org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt in project hadoop by apache.
the class TestWorkPreservingRMRestart method testDynamicQueueRecovery.
// Test work preserving recovery of apps running under reservation.
// This involves:
// 1. Setting up a dynamic reservable queue,
// 2. Submitting an app to it,
// 3. Failing over RM,
// 4. Validating that the app is recovered post failover,
// 5. Check if all running containers are recovered,
// 6. Verify the scheduler state like attempt info,
// 7. Verify the queue/user metrics for the dynamic reservable queue.
@Test(timeout = 30000)
public void testDynamicQueueRecovery() throws Exception {
conf.setBoolean(CapacitySchedulerConfiguration.ENABLE_USER_METRICS, true);
conf.set(CapacitySchedulerConfiguration.RESOURCE_CALCULATOR_CLASS, DominantResourceCalculator.class.getName());
// 1. Set up dynamic reservable queue.
Configuration schedulerConf = getSchedulerDynamicConfiguration();
int containerMemory = 1024;
Resource containerResource = Resource.newInstance(containerMemory, 1);
MemoryRMStateStore memStore = new MemoryRMStateStore();
memStore.init(schedulerConf);
rm1 = new MockRM(schedulerConf, memStore);
rm1.start();
MockNM nm1 = new MockNM("127.0.0.1:1234", 8192, rm1.getResourceTrackerService());
nm1.registerNode();
// 2. Run plan follower to update the added node & then submit app to
// dynamic queue.
rm1.getRMContext().getReservationSystem().synchronizePlan(ReservationSystemTestUtil.reservationQ, true);
RMApp app1 = rm1.submitApp(200, "dynamicQApp", UserGroupInformation.getCurrentUser().getShortUserName(), null, ReservationSystemTestUtil.getReservationQueueName());
MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
// clear queue metrics
rm1.clearQueueMetrics(app1);
// 3. Fail over (restart) RM.
rm2 = new MockRM(schedulerConf, memStore);
rm2.start();
nm1.setResourceTrackerService(rm2.getResourceTrackerService());
// 4. Validate app is recovered post failover.
RMApp recoveredApp1 = rm2.getRMContext().getRMApps().get(app1.getApplicationId());
RMAppAttempt loadedAttempt1 = recoveredApp1.getCurrentAppAttempt();
NMContainerStatus amContainer = TestRMRestart.createNMContainerStatus(am1.getApplicationAttemptId(), 1, ContainerState.RUNNING);
NMContainerStatus runningContainer = TestRMRestart.createNMContainerStatus(am1.getApplicationAttemptId(), 2, ContainerState.RUNNING);
NMContainerStatus completedContainer = TestRMRestart.createNMContainerStatus(am1.getApplicationAttemptId(), 3, ContainerState.COMPLETE);
nm1.registerNode(Arrays.asList(amContainer, runningContainer, completedContainer), null);
// Wait for RM to settle down on recovering containers.
waitForNumContainersToRecover(2, rm2, am1.getApplicationAttemptId());
Set<ContainerId> launchedContainers = ((RMNodeImpl) rm2.getRMContext().getRMNodes().get(nm1.getNodeId())).getLaunchedContainers();
assertTrue(launchedContainers.contains(amContainer.getContainerId()));
assertTrue(launchedContainers.contains(runningContainer.getContainerId()));
// 5. Check RMContainers are re-recreated and the container state is
// correct.
rm2.waitForState(nm1, amContainer.getContainerId(), RMContainerState.RUNNING);
rm2.waitForState(nm1, runningContainer.getContainerId(), RMContainerState.RUNNING);
rm2.waitForContainerToComplete(loadedAttempt1, completedContainer);
AbstractYarnScheduler scheduler = (AbstractYarnScheduler) rm2.getResourceScheduler();
SchedulerNode schedulerNode1 = scheduler.getSchedulerNode(nm1.getNodeId());
// ********* check scheduler node state.*******
// 2 running containers.
Resource usedResources = Resources.multiply(containerResource, 2);
Resource nmResource = Resource.newInstance(nm1.getMemory(), nm1.getvCores());
assertTrue(schedulerNode1.isValidContainer(amContainer.getContainerId()));
assertTrue(schedulerNode1.isValidContainer(runningContainer.getContainerId()));
assertFalse(schedulerNode1.isValidContainer(completedContainer.getContainerId()));
// 2 launched containers, 1 completed container
assertEquals(2, schedulerNode1.getNumContainers());
assertEquals(Resources.subtract(nmResource, usedResources), schedulerNode1.getUnallocatedResource());
assertEquals(usedResources, schedulerNode1.getAllocatedResource());
Resource availableResources = Resources.subtract(nmResource, usedResources);
// 6. Verify the scheduler state like attempt info.
Map<ApplicationId, SchedulerApplication<SchedulerApplicationAttempt>> sa = ((AbstractYarnScheduler) rm2.getResourceScheduler()).getSchedulerApplications();
SchedulerApplication<SchedulerApplicationAttempt> schedulerApp = sa.get(recoveredApp1.getApplicationId());
// 7. Verify the queue/user metrics for the dynamic reservable queue.
if (getSchedulerType() == SchedulerType.CAPACITY) {
checkCSQueue(rm2, schedulerApp, nmResource, nmResource, usedResources, 2);
} else {
checkFSQueue(rm2, schedulerApp, usedResources, availableResources);
}
// *********** check scheduler attempt state.********
SchedulerApplicationAttempt schedulerAttempt = schedulerApp.getCurrentAppAttempt();
assertTrue(schedulerAttempt.getLiveContainers().contains(scheduler.getRMContainer(amContainer.getContainerId())));
assertTrue(schedulerAttempt.getLiveContainers().contains(scheduler.getRMContainer(runningContainer.getContainerId())));
assertEquals(schedulerAttempt.getCurrentConsumption(), usedResources);
// *********** check appSchedulingInfo state ***********
assertEquals((1L << 40) + 1L, schedulerAttempt.getNewContainerId());
}
use of org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt in project hadoop by apache.
the class TestZKRMStateStore method testFencedState.
@Test
public void testFencedState() throws Exception {
TestZKRMStateStoreTester zkTester = new TestZKRMStateStoreTester();
RMStateStore store = zkTester.getRMStateStore();
// Move state to FENCED from ACTIVE
store.updateFencedState();
assertEquals("RMStateStore should have been in fenced state", true, store.isFencedState());
long submitTime = System.currentTimeMillis();
long startTime = submitTime + 1000;
// Add a new app
RMApp mockApp = mock(RMApp.class);
ApplicationSubmissionContext context = new ApplicationSubmissionContextPBImpl();
when(mockApp.getSubmitTime()).thenReturn(submitTime);
when(mockApp.getStartTime()).thenReturn(startTime);
when(mockApp.getApplicationSubmissionContext()).thenReturn(context);
when(mockApp.getUser()).thenReturn("test");
store.storeNewApplication(mockApp);
assertEquals("RMStateStore should have been in fenced state", true, store.isFencedState());
// Add a new attempt
ClientToAMTokenSecretManagerInRM clientToAMTokenMgr = new ClientToAMTokenSecretManagerInRM();
ApplicationAttemptId attemptId = ApplicationAttemptId.fromString("appattempt_1234567894321_0001_000001");
SecretKey clientTokenMasterKey = clientToAMTokenMgr.createMasterKey(attemptId);
RMAppAttemptMetrics mockRmAppAttemptMetrics = mock(RMAppAttemptMetrics.class);
Container container = new ContainerPBImpl();
container.setId(ContainerId.fromString("container_1234567891234_0001_01_000001"));
RMAppAttempt mockAttempt = mock(RMAppAttempt.class);
when(mockAttempt.getAppAttemptId()).thenReturn(attemptId);
when(mockAttempt.getMasterContainer()).thenReturn(container);
when(mockAttempt.getClientTokenMasterKey()).thenReturn(clientTokenMasterKey);
when(mockAttempt.getRMAppAttemptMetrics()).thenReturn(mockRmAppAttemptMetrics);
when(mockRmAppAttemptMetrics.getAggregateAppResourceUsage()).thenReturn(new AggregateAppResourceUsage(0, 0));
store.storeNewApplicationAttempt(mockAttempt);
assertEquals("RMStateStore should have been in fenced state", true, store.isFencedState());
long finishTime = submitTime + 1000;
// Update attempt
ApplicationAttemptStateData newAttemptState = ApplicationAttemptStateData.newInstance(attemptId, container, store.getCredentialsFromAppAttempt(mockAttempt), startTime, RMAppAttemptState.FINISHED, "testUrl", "test", FinalApplicationStatus.SUCCEEDED, 100, finishTime, 0, 0, 0, 0);
store.updateApplicationAttemptState(newAttemptState);
assertEquals("RMStateStore should have been in fenced state", true, store.isFencedState());
// Update app
ApplicationStateData appState = ApplicationStateData.newInstance(submitTime, startTime, context, "test");
store.updateApplicationState(appState);
assertEquals("RMStateStore should have been in fenced state", true, store.isFencedState());
// Remove app
store.removeApplication(mockApp);
assertEquals("RMStateStore should have been in fenced state", true, store.isFencedState());
// store RM delegation token;
RMDelegationTokenIdentifier dtId1 = new RMDelegationTokenIdentifier(new Text("owner1"), new Text("renewer1"), new Text("realuser1"));
Long renewDate1 = new Long(System.currentTimeMillis());
dtId1.setSequenceNumber(1111);
store.storeRMDelegationToken(dtId1, renewDate1);
assertEquals("RMStateStore should have been in fenced state", true, store.isFencedState());
store.updateRMDelegationToken(dtId1, renewDate1);
assertEquals("RMStateStore should have been in fenced state", true, store.isFencedState());
// remove delegation key;
store.removeRMDelegationToken(dtId1);
assertEquals("RMStateStore should have been in fenced state", true, store.isFencedState());
// store delegation master key;
DelegationKey key = new DelegationKey(1234, 4321, "keyBytes".getBytes());
store.storeRMDTMasterKey(key);
assertEquals("RMStateStore should have been in fenced state", true, store.isFencedState());
// remove delegation master key;
store.removeRMDTMasterKey(key);
assertEquals("RMStateStore should have been in fenced state", true, store.isFencedState());
// store or update AMRMToken;
store.storeOrUpdateAMRMTokenSecretManager(null, false);
assertEquals("RMStateStore should have been in fenced state", true, store.isFencedState());
store.close();
}
use of org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt in project hadoop by apache.
the class RMStateStoreTestBase method storeAttempt.
protected RMAppAttempt storeAttempt(RMStateStore store, ApplicationAttemptId attemptId, String containerIdStr, Token<AMRMTokenIdentifier> appToken, SecretKey clientTokenMasterKey, TestDispatcher dispatcher) throws Exception {
RMAppAttemptMetrics mockRmAppAttemptMetrics = mock(RMAppAttemptMetrics.class);
Container container = new ContainerPBImpl();
container.setId(ContainerId.fromString(containerIdStr));
RMAppAttempt mockAttempt = mock(RMAppAttempt.class);
when(mockAttempt.getAppAttemptId()).thenReturn(attemptId);
when(mockAttempt.getMasterContainer()).thenReturn(container);
when(mockAttempt.getAMRMToken()).thenReturn(appToken);
when(mockAttempt.getClientTokenMasterKey()).thenReturn(clientTokenMasterKey);
when(mockAttempt.getRMAppAttemptMetrics()).thenReturn(mockRmAppAttemptMetrics);
when(mockRmAppAttemptMetrics.getAggregateAppResourceUsage()).thenReturn(new AggregateAppResourceUsage(0, 0));
dispatcher.attemptId = attemptId;
store.storeNewApplicationAttempt(mockAttempt);
waitNotify(dispatcher);
return mockAttempt;
}
use of org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt in project hadoop by apache.
the class TestContainerAllocation method testAMContainerAllocationWhenDNSUnavailable.
// This is to test fetching AM container will be retried, if AM container is
// not fetchable since DNS is unavailable causing container token/NMtoken
// creation failure.
@Test(timeout = 30000)
public void testAMContainerAllocationWhenDNSUnavailable() throws Exception {
MockRM rm1 = new MockRM(conf) {
@Override
protected RMSecretManagerService createRMSecretManagerService() {
return new TestRMSecretManagerService(conf, rmContext);
}
};
rm1.start();
MockNM nm1 = rm1.registerNode("unknownhost:1234", 8000);
RMApp app1;
try {
SecurityUtilTestHelper.setTokenServiceUseIp(true);
app1 = rm1.submitApp(200);
RMAppAttempt attempt = app1.getCurrentAppAttempt();
nm1.nodeHeartbeat(true);
// fetching am container will fail, keep retrying 5 times.
while (numRetries <= 5) {
nm1.nodeHeartbeat(true);
Thread.sleep(1000);
Assert.assertEquals(RMAppAttemptState.SCHEDULED, attempt.getAppAttemptState());
System.out.println("Waiting for am container to be allocated.");
}
} finally {
SecurityUtilTestHelper.setTokenServiceUseIp(false);
}
MockRM.launchAndRegisterAM(app1, rm1, nm1);
}
use of org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt in project hadoop by apache.
the class TestCapacitySchedulerNodeLabelUpdate method testBlacklistAMDisableLabel.
@Test(timeout = 30000)
public void testBlacklistAMDisableLabel() throws Exception {
conf.setBoolean(YarnConfiguration.AM_SCHEDULING_NODE_BLACKLISTING_ENABLED, true);
conf.setFloat(YarnConfiguration.AM_SCHEDULING_NODE_BLACKLISTING_DISABLE_THRESHOLD, 0.5f);
mgr.addToCluserNodeLabelsWithDefaultExclusivity(ImmutableSet.of("x", "y"));
mgr.addLabelsToNode(ImmutableMap.of(NodeId.newInstance("h2", 0), toSet("x"), NodeId.newInstance("h3", 0), toSet("x"), NodeId.newInstance("h6", 0), toSet("x")));
mgr.addLabelsToNode(ImmutableMap.of(NodeId.newInstance("h4", 0), toSet("y"), NodeId.newInstance("h5", 0), toSet("y"), NodeId.newInstance("h7", 0), toSet("y")));
MockRM rm = new MockRM(getConfigurationWithQueueLabels(conf)) {
@Override
public RMNodeLabelsManager createNodeLabelManager() {
return mgr;
}
};
rm.getRMContext().setNodeLabelManager(mgr);
rm.start();
// Nodes in label default h1,h8,h9
// Nodes in label x h2,h3,h6
// Nodes in label y h4,h5,h7
MockNM nm1 = rm.registerNode("h1:1234", 2048);
MockNM nm2 = rm.registerNode("h2:1234", 2048);
rm.registerNode("h3:1234", 2048);
rm.registerNode("h4:1234", 2048);
rm.registerNode("h5:1234", 2048);
rm.registerNode("h6:1234", 2048);
rm.registerNode("h7:1234", 2048);
rm.registerNode("h8:1234", 2048);
rm.registerNode("h9:1234", 2048);
// Submit app with AM container launched on default partition i.e. h1.
RMApp app = rm.submitApp(GB, "app", "user", null, "a");
MockRM.launchAndRegisterAM(app, rm, nm1);
RMAppAttempt appAttempt = app.getCurrentAppAttempt();
// Add default node blacklist from default
appAttempt.getAMBlacklistManager().addNode("h1");
ResourceBlacklistRequest blacklistUpdates = appAttempt.getAMBlacklistManager().getBlacklistUpdates();
Assert.assertEquals(1, blacklistUpdates.getBlacklistAdditions().size());
Assert.assertEquals(0, blacklistUpdates.getBlacklistRemovals().size());
// Adding second node from default parition
appAttempt.getAMBlacklistManager().addNode("h8");
blacklistUpdates = appAttempt.getAMBlacklistManager().getBlacklistUpdates();
Assert.assertEquals(0, blacklistUpdates.getBlacklistAdditions().size());
Assert.assertEquals(2, blacklistUpdates.getBlacklistRemovals().size());
// Submission in label x
RMApp applabel = rm.submitApp(GB, "app", "user", null, "a", "x");
MockRM.launchAndRegisterAM(applabel, rm, nm2);
RMAppAttempt appAttemptlabelx = applabel.getCurrentAppAttempt();
appAttemptlabelx.getAMBlacklistManager().addNode("h2");
ResourceBlacklistRequest blacklistUpdatesOnx = appAttemptlabelx.getAMBlacklistManager().getBlacklistUpdates();
Assert.assertEquals(1, blacklistUpdatesOnx.getBlacklistAdditions().size());
Assert.assertEquals(0, blacklistUpdatesOnx.getBlacklistRemovals().size());
// Adding second node from default parition
appAttemptlabelx.getAMBlacklistManager().addNode("h3");
blacklistUpdatesOnx = appAttempt.getAMBlacklistManager().getBlacklistUpdates();
Assert.assertEquals(0, blacklistUpdatesOnx.getBlacklistAdditions().size());
Assert.assertEquals(2, blacklistUpdatesOnx.getBlacklistRemovals().size());
rm.close();
}
Aggregations