use of org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics in project hadoop by apache.
the class TestRMRestart method testQueueMetricsOnRMRestart.
@SuppressWarnings("resource")
@Test(timeout = 60000)
public void testQueueMetricsOnRMRestart() throws Exception {
conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, YarnConfiguration.DEFAULT_RM_AM_MAX_ATTEMPTS);
MemoryRMStateStore memStore = new MemoryRMStateStore();
memStore.init(conf);
// PHASE 1: create state in an RM
// start RM
MockRM rm1 = createMockRM(conf, memStore);
rm1.start();
MockNM nm1 = new MockNM("127.0.0.1:1234", 15120, rm1.getResourceTrackerService());
nm1.registerNode();
QueueMetrics qm1 = rm1.getResourceScheduler().getRootQueueMetrics();
resetQueueMetrics(qm1);
assertQueueMetrics(qm1, 0, 0, 0, 0);
// create app that gets launched and does allocate before RM restart
RMApp app1 = rm1.submitApp(200);
// Need to wait first for AppAttempt to be started (RMAppState.ACCEPTED)
// and then for it to reach RMAppAttemptState.SCHEDULED
// inorder to ensure appsPending metric is incremented
rm1.waitForState(app1.getApplicationId(), RMAppState.ACCEPTED);
RMAppAttempt attempt1 = app1.getCurrentAppAttempt();
ApplicationAttemptId attemptId1 = attempt1.getAppAttemptId();
rm1.waitForState(attemptId1, RMAppAttemptState.SCHEDULED);
assertQueueMetrics(qm1, 1, 1, 0, 0);
nm1.nodeHeartbeat(true);
rm1.waitForState(attemptId1, RMAppAttemptState.ALLOCATED);
MockAM am1 = rm1.sendAMLaunched(attempt1.getAppAttemptId());
am1.registerAppAttempt();
am1.allocate("127.0.0.1", 1000, 1, new ArrayList<ContainerId>());
nm1.nodeHeartbeat(true);
List<Container> conts = am1.allocate(new ArrayList<ResourceRequest>(), new ArrayList<ContainerId>()).getAllocatedContainers();
while (conts.size() == 0) {
nm1.nodeHeartbeat(true);
conts.addAll(am1.allocate(new ArrayList<ResourceRequest>(), new ArrayList<ContainerId>()).getAllocatedContainers());
Thread.sleep(500);
}
assertQueueMetrics(qm1, 1, 0, 1, 0);
// PHASE 2: create new RM and start from old state
// create new RM to represent restart and recover state
MockRM rm2 = createMockRM(conf, memStore);
QueueMetrics qm2 = rm2.getResourceScheduler().getRootQueueMetrics();
resetQueueMetrics(qm2);
assertQueueMetrics(qm2, 0, 0, 0, 0);
rm2.start();
nm1.setResourceTrackerService(rm2.getResourceTrackerService());
// recover app
RMApp loadedApp1 = rm2.getRMContext().getRMApps().get(app1.getApplicationId());
nm1.nodeHeartbeat(true);
nm1 = new MockNM("127.0.0.1:1234", 15120, rm2.getResourceTrackerService());
NMContainerStatus status = TestRMRestart.createNMContainerStatus(loadedApp1.getCurrentAppAttempt().getAppAttemptId(), 1, ContainerState.COMPLETE);
nm1.registerNode(Arrays.asList(status), null);
while (loadedApp1.getAppAttempts().size() != 2) {
Thread.sleep(200);
}
attempt1 = loadedApp1.getCurrentAppAttempt();
attemptId1 = attempt1.getAppAttemptId();
rm2.waitForState(attemptId1, RMAppAttemptState.SCHEDULED);
assertQueueMetrics(qm2, 1, 1, 0, 0);
nm1.nodeHeartbeat(true);
rm2.waitForState(attemptId1, RMAppAttemptState.ALLOCATED);
assertQueueMetrics(qm2, 1, 0, 1, 0);
am1 = rm2.sendAMLaunched(attempt1.getAppAttemptId());
am1.registerAppAttempt();
am1.allocate("127.0.0.1", 1000, 3, new ArrayList<ContainerId>());
nm1.nodeHeartbeat(true);
conts = am1.allocate(new ArrayList<ResourceRequest>(), new ArrayList<ContainerId>()).getAllocatedContainers();
while (conts.size() == 0) {
nm1.nodeHeartbeat(true);
conts.addAll(am1.allocate(new ArrayList<ResourceRequest>(), new ArrayList<ContainerId>()).getAllocatedContainers());
Thread.sleep(500);
}
// finish the AMs
finishApplicationMaster(loadedApp1, rm2, nm1, am1);
// now AppAttempt and App becomes FINISHED,
// we should also grant APP_ATTEMPT_REMOVE/APP_REMOVE event
// had processed by scheduler
rm2.waitForAppRemovedFromScheduler(loadedApp1.getApplicationId());
assertQueueMetrics(qm2, 1, 0, 0, 1);
}
use of org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics in project hadoop by apache.
the class TestRMHA method verifyClusterMetrics.
private void verifyClusterMetrics(int activeNodes, int appsSubmitted, int appsPending, int containersPending, long availableMB, int activeApplications) throws Exception {
int timeoutSecs = 0;
QueueMetrics metrics = rm.getResourceScheduler().getRootQueueMetrics();
ClusterMetrics clusterMetrics = ClusterMetrics.getMetrics();
boolean isAllMetricAssertionDone = false;
String message = null;
while (timeoutSecs++ < 5) {
try {
// verify queue metrics
assertMetric("appsSubmitted", appsSubmitted, metrics.getAppsSubmitted());
assertMetric("appsPending", appsPending, metrics.getAppsPending());
assertMetric("containersPending", containersPending, metrics.getPendingContainers());
assertMetric("availableMB", availableMB, metrics.getAvailableMB());
assertMetric("activeApplications", activeApplications, metrics.getActiveApps());
// verify node metric
assertMetric("activeNodes", activeNodes, clusterMetrics.getNumActiveNMs());
isAllMetricAssertionDone = true;
break;
} catch (AssertionError e) {
message = e.getMessage();
System.out.println("Waiting for metrics assertion to complete");
Thread.sleep(1000);
}
}
assertTrue(message, isAllMetricAssertionDone);
}
use of org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics in project hadoop by apache.
the class TestRM method testApplicationKillAtAcceptedState.
/**
* Validate killing an application when it is at accepted state.
* @throws Exception exception
*/
@Test(timeout = 60000)
public void testApplicationKillAtAcceptedState() throws Exception {
final Dispatcher dispatcher = new DrainDispatcher() {
@Override
public EventHandler<Event> getEventHandler() {
class EventArgMatcher extends ArgumentMatcher<AbstractEvent> {
@Override
public boolean matches(Object argument) {
if (argument instanceof RMAppAttemptEvent) {
if (((RMAppAttemptEvent) argument).getType().equals(RMAppAttemptEventType.KILL)) {
return true;
}
}
return false;
}
}
EventHandler handler = spy(super.getEventHandler());
doNothing().when(handler).handle(argThat(new EventArgMatcher()));
return handler;
}
};
MockRM rm = new MockRM(conf) {
@Override
protected Dispatcher createDispatcher() {
return dispatcher;
}
};
// test metrics
QueueMetrics metrics = rm.getResourceScheduler().getRootQueueMetrics();
int appsKilled = metrics.getAppsKilled();
int appsSubmitted = metrics.getAppsSubmitted();
rm.start();
MockNM nm1 = new MockNM("127.0.0.1:1234", 15120, rm.getResourceTrackerService());
nm1.registerNode();
// a failed app
RMApp application = rm.submitApp(200);
MockAM am = MockRM.launchAM(application, rm, nm1);
rm.waitForState(am.getApplicationAttemptId(), RMAppAttemptState.LAUNCHED);
nm1.nodeHeartbeat(am.getApplicationAttemptId(), 1, ContainerState.RUNNING);
rm.waitForState(application.getApplicationId(), RMAppState.ACCEPTED);
// Now kill the application before new attempt is launched, the app report
// returns the invalid AM host and port.
KillApplicationRequest request = KillApplicationRequest.newInstance(application.getApplicationId());
rm.getClientRMService().forceKillApplication(request);
// Specific test for YARN-1689 follows
// Now let's say a race causes AM to register now. This should not crash RM.
am.registerAppAttempt(false);
// We explicitly intercepted the kill-event to RMAppAttempt, so app should
// still be in KILLING state.
rm.waitForState(application.getApplicationId(), RMAppState.KILLING);
// AM should now be in running
rm.waitForState(am.getApplicationAttemptId(), RMAppAttemptState.RUNNING);
// Simulate that appAttempt is killed.
rm.getRMContext().getDispatcher().getEventHandler().handle(new RMAppEvent(application.getApplicationId(), RMAppEventType.ATTEMPT_KILLED));
rm.waitForState(application.getApplicationId(), RMAppState.KILLED);
// test metrics
metrics = rm.getResourceScheduler().getRootQueueMetrics();
Assert.assertEquals(appsKilled + 1, metrics.getAppsKilled());
Assert.assertEquals(appsSubmitted + 1, metrics.getAppsSubmitted());
}
use of org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics in project hadoop by apache.
the class TestResourceTrackerService method testReconnectNode.
@Test
public void testReconnectNode() throws Exception {
rm = new MockRM() {
@Override
protected EventHandler<SchedulerEvent> createSchedulerEventDispatcher() {
return new EventDispatcher<SchedulerEvent>(this.scheduler, this.scheduler.getClass().getName()) {
@Override
public void handle(SchedulerEvent event) {
scheduler.handle(event);
}
};
}
};
rm.start();
MockNM nm1 = rm.registerNode("host1:1234", 5120);
MockNM nm2 = rm.registerNode("host2:5678", 5120);
nm1.nodeHeartbeat(true);
nm2.nodeHeartbeat(false);
rm.drainEvents();
checkUnhealthyNMCount(rm, nm2, true, 1);
final int expectedNMs = ClusterMetrics.getMetrics().getNumActiveNMs();
QueueMetrics metrics = rm.getResourceScheduler().getRootQueueMetrics();
// TODO Metrics incorrect in case of the FifoScheduler
Assert.assertEquals(5120, metrics.getAvailableMB());
// reconnect of healthy node
nm1 = rm.registerNode("host1:1234", 5120);
NodeHeartbeatResponse response = nm1.nodeHeartbeat(true);
Assert.assertTrue(NodeAction.NORMAL.equals(response.getNodeAction()));
rm.drainEvents();
Assert.assertEquals(expectedNMs, ClusterMetrics.getMetrics().getNumActiveNMs());
checkUnhealthyNMCount(rm, nm2, true, 1);
// reconnect of unhealthy node
nm2 = rm.registerNode("host2:5678", 5120);
response = nm2.nodeHeartbeat(false);
Assert.assertTrue(NodeAction.NORMAL.equals(response.getNodeAction()));
rm.drainEvents();
Assert.assertEquals(expectedNMs, ClusterMetrics.getMetrics().getNumActiveNMs());
checkUnhealthyNMCount(rm, nm2, true, 1);
// unhealthy node changed back to healthy
nm2 = rm.registerNode("host2:5678", 5120);
response = nm2.nodeHeartbeat(true);
response = nm2.nodeHeartbeat(true);
rm.drainEvents();
Assert.assertEquals(5120 + 5120, metrics.getAvailableMB());
// reconnect of node with changed capability
nm1 = rm.registerNode("host2:5678", 10240);
response = nm1.nodeHeartbeat(true);
rm.drainEvents();
Assert.assertTrue(NodeAction.NORMAL.equals(response.getNodeAction()));
Assert.assertEquals(5120 + 10240, metrics.getAvailableMB());
// reconnect of node with changed capability and running applications
List<ApplicationId> runningApps = new ArrayList<ApplicationId>();
runningApps.add(ApplicationId.newInstance(1, 0));
nm1 = rm.registerNode("host2:5678", 15360, 2, runningApps);
response = nm1.nodeHeartbeat(true);
rm.drainEvents();
Assert.assertTrue(NodeAction.NORMAL.equals(response.getNodeAction()));
Assert.assertEquals(5120 + 15360, metrics.getAvailableMB());
// reconnect healthy node changing http port
nm1 = new MockNM("host1:1234", 5120, rm.getResourceTrackerService());
nm1.setHttpPort(3);
nm1.registerNode();
response = nm1.nodeHeartbeat(true);
response = nm1.nodeHeartbeat(true);
rm.drainEvents();
RMNode rmNode = rm.getRMContext().getRMNodes().get(nm1.getNodeId());
Assert.assertEquals(3, rmNode.getHttpPort());
Assert.assertEquals(5120, rmNode.getTotalCapability().getMemorySize());
Assert.assertEquals(5120 + 15360, metrics.getAvailableMB());
}
use of org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics in project hadoop by apache.
the class TestCapacityScheduler method testRemoveAttemptMoveAdded.
@Test
public void testRemoveAttemptMoveAdded() throws Exception {
YarnConfiguration conf = new YarnConfiguration();
conf.setClass(YarnConfiguration.RM_SCHEDULER, CapacityScheduler.class, CapacityScheduler.class);
conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, 2);
// Create Mock RM
MockRM rm = new MockRM(getCapacityConfiguration(conf));
CapacityScheduler sch = (CapacityScheduler) rm.getResourceScheduler();
// add node
Resource newResource = Resource.newInstance(4 * GB, 1);
RMNode node = MockNodes.newNodeInfo(0, newResource, 1, "127.0.0.1");
SchedulerEvent addNode = new NodeAddedSchedulerEvent(node);
sch.handle(addNode);
// create appid
ApplicationId appId = BuilderUtils.newApplicationId(100, 1);
ApplicationAttemptId appAttemptId = BuilderUtils.newApplicationAttemptId(appId, 1);
RMAppAttemptMetrics attemptMetric = new RMAppAttemptMetrics(appAttemptId, rm.getRMContext());
RMAppImpl app = mock(RMAppImpl.class);
when(app.getApplicationId()).thenReturn(appId);
RMAppAttemptImpl attempt = mock(RMAppAttemptImpl.class);
Container container = mock(Container.class);
when(attempt.getMasterContainer()).thenReturn(container);
ApplicationSubmissionContext submissionContext = mock(ApplicationSubmissionContext.class);
when(attempt.getSubmissionContext()).thenReturn(submissionContext);
when(attempt.getAppAttemptId()).thenReturn(appAttemptId);
when(attempt.getRMAppAttemptMetrics()).thenReturn(attemptMetric);
when(app.getCurrentAppAttempt()).thenReturn(attempt);
rm.getRMContext().getRMApps().put(appId, app);
// Add application
SchedulerEvent addAppEvent = new AppAddedSchedulerEvent(appId, "a1", "user");
sch.handle(addAppEvent);
// Add application attempt
SchedulerEvent addAttemptEvent = new AppAttemptAddedSchedulerEvent(appAttemptId, false);
sch.handle(addAttemptEvent);
// get Queues
CSQueue queueA1 = sch.getQueue("a1");
CSQueue queueB = sch.getQueue("b");
CSQueue queueB1 = sch.getQueue("b1");
// add Running rm container and simulate live containers to a1
ContainerId newContainerId = ContainerId.newContainerId(appAttemptId, 2);
RMContainerImpl rmContainer = mock(RMContainerImpl.class);
when(rmContainer.getState()).thenReturn(RMContainerState.RUNNING);
Container container2 = mock(Container.class);
when(rmContainer.getContainer()).thenReturn(container2);
Resource resource = Resource.newInstance(1024, 1);
when(container2.getResource()).thenReturn(resource);
when(rmContainer.getExecutionType()).thenReturn(ExecutionType.GUARANTEED);
when(container2.getNodeId()).thenReturn(node.getNodeID());
when(container2.getId()).thenReturn(newContainerId);
when(rmContainer.getNodeLabelExpression()).thenReturn(RMNodeLabelsManager.NO_LABEL);
when(rmContainer.getContainerId()).thenReturn(newContainerId);
sch.getApplicationAttempt(appAttemptId).getLiveContainersMap().put(newContainerId, rmContainer);
QueueMetrics queueA1M = queueA1.getMetrics();
queueA1M.incrPendingResources("user1", 1, resource);
queueA1M.allocateResources("user1", resource);
// remove attempt
sch.handle(new AppAttemptRemovedSchedulerEvent(appAttemptId, RMAppAttemptState.KILLED, true));
// Move application to queue b1
sch.moveApplication(appId, "b1");
// Check queue metrics after move
Assert.assertEquals(0, queueA1.getNumApplications());
Assert.assertEquals(1, queueB.getNumApplications());
Assert.assertEquals(0, queueB1.getNumApplications());
// Release attempt add event
ApplicationAttemptId appAttemptId2 = BuilderUtils.newApplicationAttemptId(appId, 2);
SchedulerEvent addAttemptEvent2 = new AppAttemptAddedSchedulerEvent(appAttemptId2, true);
sch.handle(addAttemptEvent2);
// Check metrics after attempt added
Assert.assertEquals(0, queueA1.getNumApplications());
Assert.assertEquals(1, queueB.getNumApplications());
Assert.assertEquals(1, queueB1.getNumApplications());
QueueMetrics queueB1M = queueB1.getMetrics();
QueueMetrics queueBM = queueB.getMetrics();
// Verify allocation MB of current state
Assert.assertEquals(0, queueA1M.getAllocatedMB());
Assert.assertEquals(0, queueA1M.getAllocatedVirtualCores());
Assert.assertEquals(1024, queueB1M.getAllocatedMB());
Assert.assertEquals(1, queueB1M.getAllocatedVirtualCores());
// remove attempt
sch.handle(new AppAttemptRemovedSchedulerEvent(appAttemptId2, RMAppAttemptState.FINISHED, false));
Assert.assertEquals(0, queueA1M.getAllocatedMB());
Assert.assertEquals(0, queueA1M.getAllocatedVirtualCores());
Assert.assertEquals(0, queueB1M.getAllocatedMB());
Assert.assertEquals(0, queueB1M.getAllocatedVirtualCores());
verifyQueueMetrics(queueB1M);
verifyQueueMetrics(queueBM);
// Verify queue A1 metrics
verifyQueueMetrics(queueA1M);
rm.close();
}
Aggregations