Search in sources :

Example 6 with QueueMetrics

use of org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics in project hadoop by apache.

the class TestRM method testApplicationKillAtAcceptedState.

/**
   * Validate killing an application when it is at accepted state.
   * @throws Exception exception
   */
@Test(timeout = 60000)
public void testApplicationKillAtAcceptedState() throws Exception {
    final Dispatcher dispatcher = new DrainDispatcher() {

        @Override
        public EventHandler<Event> getEventHandler() {
            class EventArgMatcher extends ArgumentMatcher<AbstractEvent> {

                @Override
                public boolean matches(Object argument) {
                    if (argument instanceof RMAppAttemptEvent) {
                        if (((RMAppAttemptEvent) argument).getType().equals(RMAppAttemptEventType.KILL)) {
                            return true;
                        }
                    }
                    return false;
                }
            }
            EventHandler handler = spy(super.getEventHandler());
            doNothing().when(handler).handle(argThat(new EventArgMatcher()));
            return handler;
        }
    };
    MockRM rm = new MockRM(conf) {

        @Override
        protected Dispatcher createDispatcher() {
            return dispatcher;
        }
    };
    // test metrics
    QueueMetrics metrics = rm.getResourceScheduler().getRootQueueMetrics();
    int appsKilled = metrics.getAppsKilled();
    int appsSubmitted = metrics.getAppsSubmitted();
    rm.start();
    MockNM nm1 = new MockNM("127.0.0.1:1234", 15120, rm.getResourceTrackerService());
    nm1.registerNode();
    // a failed app
    RMApp application = rm.submitApp(200);
    MockAM am = MockRM.launchAM(application, rm, nm1);
    rm.waitForState(am.getApplicationAttemptId(), RMAppAttemptState.LAUNCHED);
    nm1.nodeHeartbeat(am.getApplicationAttemptId(), 1, ContainerState.RUNNING);
    rm.waitForState(application.getApplicationId(), RMAppState.ACCEPTED);
    // Now kill the application before new attempt is launched, the app report
    // returns the invalid AM host and port.
    KillApplicationRequest request = KillApplicationRequest.newInstance(application.getApplicationId());
    rm.getClientRMService().forceKillApplication(request);
    // Specific test for YARN-1689 follows
    // Now let's say a race causes AM to register now. This should not crash RM.
    am.registerAppAttempt(false);
    // We explicitly intercepted the kill-event to RMAppAttempt, so app should
    // still be in KILLING state.
    rm.waitForState(application.getApplicationId(), RMAppState.KILLING);
    // AM should now be in running
    rm.waitForState(am.getApplicationAttemptId(), RMAppAttemptState.RUNNING);
    // Simulate that appAttempt is killed.
    rm.getRMContext().getDispatcher().getEventHandler().handle(new RMAppEvent(application.getApplicationId(), RMAppEventType.ATTEMPT_KILLED));
    rm.waitForState(application.getApplicationId(), RMAppState.KILLED);
    // test metrics
    metrics = rm.getResourceScheduler().getRootQueueMetrics();
    Assert.assertEquals(appsKilled + 1, metrics.getAppsKilled());
    Assert.assertEquals(appsSubmitted + 1, metrics.getAppsSubmitted());
}
Also used : DrainDispatcher(org.apache.hadoop.yarn.event.DrainDispatcher) RMApp(org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp) EventHandler(org.apache.hadoop.yarn.event.EventHandler) KillApplicationRequest(org.apache.hadoop.yarn.api.protocolrecords.KillApplicationRequest) Dispatcher(org.apache.hadoop.yarn.event.Dispatcher) DrainDispatcher(org.apache.hadoop.yarn.event.DrainDispatcher) RMAppEvent(org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppEvent) QueueMetrics(org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics) ArgumentMatcher(org.mockito.ArgumentMatcher) AbstractEvent(org.apache.hadoop.yarn.event.AbstractEvent) Event(org.apache.hadoop.yarn.event.Event) RMAppAttemptEvent(org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptEvent) RMAppEvent(org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppEvent) RMAppAttemptEvent(org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptEvent) Test(org.junit.Test)

Example 7 with QueueMetrics

use of org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics in project hadoop by apache.

the class TestResourceTrackerService method testReconnectNode.

@Test
public void testReconnectNode() throws Exception {
    rm = new MockRM() {

        @Override
        protected EventHandler<SchedulerEvent> createSchedulerEventDispatcher() {
            return new EventDispatcher<SchedulerEvent>(this.scheduler, this.scheduler.getClass().getName()) {

                @Override
                public void handle(SchedulerEvent event) {
                    scheduler.handle(event);
                }
            };
        }
    };
    rm.start();
    MockNM nm1 = rm.registerNode("host1:1234", 5120);
    MockNM nm2 = rm.registerNode("host2:5678", 5120);
    nm1.nodeHeartbeat(true);
    nm2.nodeHeartbeat(false);
    rm.drainEvents();
    checkUnhealthyNMCount(rm, nm2, true, 1);
    final int expectedNMs = ClusterMetrics.getMetrics().getNumActiveNMs();
    QueueMetrics metrics = rm.getResourceScheduler().getRootQueueMetrics();
    // TODO Metrics incorrect in case of the FifoScheduler
    Assert.assertEquals(5120, metrics.getAvailableMB());
    // reconnect of healthy node
    nm1 = rm.registerNode("host1:1234", 5120);
    NodeHeartbeatResponse response = nm1.nodeHeartbeat(true);
    Assert.assertTrue(NodeAction.NORMAL.equals(response.getNodeAction()));
    rm.drainEvents();
    Assert.assertEquals(expectedNMs, ClusterMetrics.getMetrics().getNumActiveNMs());
    checkUnhealthyNMCount(rm, nm2, true, 1);
    // reconnect of unhealthy node
    nm2 = rm.registerNode("host2:5678", 5120);
    response = nm2.nodeHeartbeat(false);
    Assert.assertTrue(NodeAction.NORMAL.equals(response.getNodeAction()));
    rm.drainEvents();
    Assert.assertEquals(expectedNMs, ClusterMetrics.getMetrics().getNumActiveNMs());
    checkUnhealthyNMCount(rm, nm2, true, 1);
    // unhealthy node changed back to healthy
    nm2 = rm.registerNode("host2:5678", 5120);
    response = nm2.nodeHeartbeat(true);
    response = nm2.nodeHeartbeat(true);
    rm.drainEvents();
    Assert.assertEquals(5120 + 5120, metrics.getAvailableMB());
    // reconnect of node with changed capability
    nm1 = rm.registerNode("host2:5678", 10240);
    response = nm1.nodeHeartbeat(true);
    rm.drainEvents();
    Assert.assertTrue(NodeAction.NORMAL.equals(response.getNodeAction()));
    Assert.assertEquals(5120 + 10240, metrics.getAvailableMB());
    // reconnect of node with changed capability and running applications
    List<ApplicationId> runningApps = new ArrayList<ApplicationId>();
    runningApps.add(ApplicationId.newInstance(1, 0));
    nm1 = rm.registerNode("host2:5678", 15360, 2, runningApps);
    response = nm1.nodeHeartbeat(true);
    rm.drainEvents();
    Assert.assertTrue(NodeAction.NORMAL.equals(response.getNodeAction()));
    Assert.assertEquals(5120 + 15360, metrics.getAvailableMB());
    // reconnect healthy node changing http port
    nm1 = new MockNM("host1:1234", 5120, rm.getResourceTrackerService());
    nm1.setHttpPort(3);
    nm1.registerNode();
    response = nm1.nodeHeartbeat(true);
    response = nm1.nodeHeartbeat(true);
    rm.drainEvents();
    RMNode rmNode = rm.getRMContext().getRMNodes().get(nm1.getNodeId());
    Assert.assertEquals(3, rmNode.getHttpPort());
    Assert.assertEquals(5120, rmNode.getTotalCapability().getMemorySize());
    Assert.assertEquals(5120 + 15360, metrics.getAvailableMB());
}
Also used : NodeHeartbeatResponse(org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse) ArrayList(java.util.ArrayList) EventHandler(org.apache.hadoop.yarn.event.EventHandler) SchedulerEvent(org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.SchedulerEvent) QueueMetrics(org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics) RMNode(org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode) ApplicationId(org.apache.hadoop.yarn.api.records.ApplicationId) Test(org.junit.Test)

Example 8 with QueueMetrics

use of org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics in project hadoop by apache.

the class TestFairScheduler method testSimpleContainerAllocation.

@Test(timeout = 5000)
public void testSimpleContainerAllocation() throws IOException {
    scheduler.init(conf);
    scheduler.start();
    scheduler.reinitialize(conf, resourceManager.getRMContext());
    // Add a node
    RMNode node1 = MockNodes.newNodeInfo(1, Resources.createResource(1024, 4), 1, "127.0.0.1");
    NodeAddedSchedulerEvent nodeEvent1 = new NodeAddedSchedulerEvent(node1);
    scheduler.handle(nodeEvent1);
    // Add another node
    RMNode node2 = MockNodes.newNodeInfo(1, Resources.createResource(512, 2), 2, "127.0.0.2");
    NodeAddedSchedulerEvent nodeEvent2 = new NodeAddedSchedulerEvent(node2);
    scheduler.handle(nodeEvent2);
    createSchedulingRequest(512, 2, "queue1", "user1", 2);
    scheduler.update();
    NodeUpdateSchedulerEvent updateEvent = new NodeUpdateSchedulerEvent(node1);
    scheduler.handle(updateEvent);
    // Asked for less than increment allocation.
    assertEquals(FairSchedulerConfiguration.DEFAULT_RM_SCHEDULER_INCREMENT_ALLOCATION_MB, scheduler.getQueueManager().getQueue("queue1").getResourceUsage().getMemorySize());
    NodeUpdateSchedulerEvent updateEvent2 = new NodeUpdateSchedulerEvent(node2);
    scheduler.handle(updateEvent2);
    assertEquals(1024, scheduler.getQueueManager().getQueue("queue1").getResourceUsage().getMemorySize());
    assertEquals(2, scheduler.getQueueManager().getQueue("queue1").getResourceUsage().getVirtualCores());
    // verify metrics
    QueueMetrics queue1Metrics = scheduler.getQueueManager().getQueue("queue1").getMetrics();
    assertEquals(1024, queue1Metrics.getAllocatedMB());
    assertEquals(2, queue1Metrics.getAllocatedVirtualCores());
    assertEquals(1024, scheduler.getRootQueueMetrics().getAllocatedMB());
    assertEquals(2, scheduler.getRootQueueMetrics().getAllocatedVirtualCores());
    assertEquals(512, scheduler.getRootQueueMetrics().getAvailableMB());
    assertEquals(4, scheduler.getRootQueueMetrics().getAvailableVirtualCores());
}
Also used : NodeUpdateSchedulerEvent(org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeUpdateSchedulerEvent) QueueMetrics(org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics) RMNode(org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode) NodeAddedSchedulerEvent(org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeAddedSchedulerEvent) Test(org.junit.Test)

Example 9 with QueueMetrics

use of org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics in project hadoop by apache.

the class TestFairScheduler method testReservationMetrics.

@Test
public void testReservationMetrics() throws IOException {
    scheduler.init(conf);
    scheduler.start();
    scheduler.reinitialize(conf, resourceManager.getRMContext());
    QueueMetrics metrics = scheduler.getRootQueueMetrics();
    RMNode node1 = MockNodes.newNodeInfo(1, Resources.createResource(4096, 4), 1, "127.0.0.1");
    NodeAddedSchedulerEvent nodeEvent = new NodeAddedSchedulerEvent(node1);
    scheduler.handle(nodeEvent);
    ApplicationAttemptId appAttemptId = createAppAttemptId(1, 1);
    createApplicationWithAMResource(appAttemptId, "default", "user1", null);
    NodeUpdateSchedulerEvent updateEvent = new NodeUpdateSchedulerEvent(node1);
    scheduler.update();
    scheduler.handle(updateEvent);
    createSchedulingRequestExistingApplication(1024, 1, 1, appAttemptId);
    scheduler.update();
    scheduler.handle(updateEvent);
    // no reservation yet
    assertEquals(0, metrics.getReservedContainers());
    assertEquals(0, metrics.getReservedMB());
    assertEquals(0, metrics.getReservedVirtualCores());
    // create reservation of {4096, 4}
    createSchedulingRequestExistingApplication(4096, 4, 1, appAttemptId);
    scheduler.update();
    scheduler.handle(updateEvent);
    // reservation created
    assertEquals(1, metrics.getReservedContainers());
    assertEquals(4096, metrics.getReservedMB());
    assertEquals(4, metrics.getReservedVirtualCores());
    // remove AppAttempt
    AppAttemptRemovedSchedulerEvent attRemoveEvent = new AppAttemptRemovedSchedulerEvent(appAttemptId, RMAppAttemptState.KILLED, false);
    scheduler.handle(attRemoveEvent);
    // The reservation metrics should be subtracted
    assertEquals(0, metrics.getReservedContainers());
    assertEquals(0, metrics.getReservedMB());
    assertEquals(0, metrics.getReservedVirtualCores());
}
Also used : QueueMetrics(org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics) NodeUpdateSchedulerEvent(org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeUpdateSchedulerEvent) RMNode(org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode) NodeAddedSchedulerEvent(org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeAddedSchedulerEvent) AppAttemptRemovedSchedulerEvent(org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppAttemptRemovedSchedulerEvent) ApplicationAttemptId(org.apache.hadoop.yarn.api.records.ApplicationAttemptId) Test(org.junit.Test)

Example 10 with QueueMetrics

use of org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics in project hadoop by apache.

the class TestWorkPreservingRMRestart method checkFSQueue.

private void checkFSQueue(ResourceManager rm, SchedulerApplication schedulerApp, Resource usedResources, Resource availableResources) throws Exception {
    // waiting for RM's scheduling apps
    int retry = 0;
    Resource assumedFairShare = Resource.newInstance(8192, 8);
    while (true) {
        Thread.sleep(100);
        if (assumedFairShare.equals(((FairScheduler) rm.getResourceScheduler()).getQueueManager().getRootQueue().getFairShare())) {
            break;
        }
        retry++;
        if (retry > 30) {
            Assert.fail("Apps are not scheduled within assumed timeout");
        }
    }
    FairScheduler scheduler = (FairScheduler) rm.getResourceScheduler();
    FSParentQueue root = scheduler.getQueueManager().getRootQueue();
    // ************ check cluster used Resources ********
    assertTrue(root.getPolicy() instanceof DominantResourceFairnessPolicy);
    assertEquals(usedResources, root.getResourceUsage());
    // ************ check app headroom ****************
    FSAppAttempt schedulerAttempt = (FSAppAttempt) schedulerApp.getCurrentAppAttempt();
    assertEquals(availableResources, schedulerAttempt.getHeadroom());
    // ************ check queue metrics ****************
    QueueMetrics queueMetrics = scheduler.getRootQueueMetrics();
    assertMetrics(queueMetrics, 1, 0, 1, 0, 2, availableResources.getMemorySize(), availableResources.getVirtualCores(), usedResources.getMemorySize(), usedResources.getVirtualCores());
}
Also used : QueueMetrics(org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics) FairScheduler(org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler) FSAppAttempt(org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FSAppAttempt) Resource(org.apache.hadoop.yarn.api.records.Resource) DominantResourceFairnessPolicy(org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.policies.DominantResourceFairnessPolicy) FSParentQueue(org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FSParentQueue)

Aggregations

QueueMetrics (org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics)25 Test (org.junit.Test)14 Resource (org.apache.hadoop.yarn.api.records.Resource)10 ApplicationAttemptId (org.apache.hadoop.yarn.api.records.ApplicationAttemptId)9 RMContext (org.apache.hadoop.yarn.server.resourcemanager.RMContext)7 RMNode (org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode)7 NodeAddedSchedulerEvent (org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeAddedSchedulerEvent)7 NodeUpdateSchedulerEvent (org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeUpdateSchedulerEvent)7 RMApp (org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp)6 Container (org.apache.hadoop.yarn.api.records.Container)5 Before (org.junit.Before)5 ArrayList (java.util.ArrayList)4 ApplicationId (org.apache.hadoop.yarn.api.records.ApplicationId)4 RMContainer (org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer)4 HashMap (java.util.HashMap)3 AllocateResponse (org.apache.hadoop.yarn.api.protocolrecords.AllocateResponse)3 NodeId (org.apache.hadoop.yarn.api.records.NodeId)3 DistributedSchedulingAllocateResponse (org.apache.hadoop.yarn.server.api.protocolrecords.DistributedSchedulingAllocateResponse)3 InMemoryPlan (org.apache.hadoop.yarn.server.resourcemanager.reservation.InMemoryPlan)3 ReservationSchedulerConfiguration (org.apache.hadoop.yarn.server.resourcemanager.reservation.ReservationSchedulerConfiguration)3