Examples with DrainDispatcher - org.apache.hadoop.yarn.event.DrainDispatcher

Example 1 with DrainDispatcher

Use of org.apache.hadoop.yarn.event.DrainDispatcher in the Apache Hadoop project.

In class TestChildQueueOrder, method testSortedQueues.

/**
 * Checks the order in which the root queue offers a node to its child queues.
 *
 * <p>The test drives queues A, B, C and D through a series of stubbed
 * allocations and container releases, checking the per-queue figures via
 * {@code verifyQueueMetrics} after every step. It ends in a state where A and C
 * hold 3GB, B holds 2GB and D holds 1GB. In the final round both B and D are
 * stubbed to accept a 1GB container, and Mockito's {@code InOrder} verification
 * asserts that {@code assignContainers} was invoked on D before B.
 *
 * <p>The {@code DrainDispatcher} created here is only returned from the mocked
 * {@code RMContext}; this method never initialises or starts it.
 *
 * <p>NOTE(review): {@code csConf}, {@code csContext}, {@code stubQueueAllocation},
 * {@code verifyQueueMetrics} and {@code getMockApplication} are defined outside
 * this snippet; their exact semantics should be confirmed against the test class.
 */
@Test
@SuppressWarnings("unchecked")
public void testSortedQueues() throws Exception {
    // Setup queue configs
    setupSortedQueues(csConf);
    Map<String, CSQueue> queues = new HashMap<String, CSQueue>();
    CSQueue root = CapacitySchedulerQueueManager.parseQueue(csContext, csConf, null, CapacitySchedulerConfiguration.ROOT, queues, queues, TestUtils.spyHook);
    // Setup some nodes
    final int memoryPerNode = 10;
    final int coresPerNode = 16;
    final int numNodes = 1;
    FiCaSchedulerNode node_0 = TestUtils.getMockNode("host_0", DEFAULT_RACK, 0, memoryPerNode * GB);
    // Releasing a container on the mock node is stubbed out as a no-op.
    doNothing().when(node_0).releaseContainer(any(ContainerId.class), anyBoolean());
    final Resource clusterResource = Resources.createResource(numNodes * (memoryPerNode * GB), numNodes * coresPerNode);
    when(csContext.getNumClusterNodes()).thenReturn(numNodes);
    // Start testing
    CSQueue a = queues.get(A);
    CSQueue b = queues.get(B);
    CSQueue c = queues.get(C);
    CSQueue d = queues.get(D);
    // Give root and a/b/c/d a pending resource > 0, so that allocation will continue.
    queues.get(CapacitySchedulerConfiguration.ROOT).getQueueResourceUsage().incPending(Resources.createResource(1 * GB));
    a.getQueueResourceUsage().incPending(Resources.createResource(1 * GB));
    b.getQueueResourceUsage().incPending(Resources.createResource(1 * GB));
    c.getQueueResourceUsage().incPending(Resources.createResource(1 * GB));
    d.getQueueResourceUsage().incPending(Resources.createResource(1 * GB));
    final String user_0 = "user_0";
    // Stub an App and its containerCompleted
    FiCaSchedulerApp app_0 = getMockApplication(0, user_0);
    doReturn(true).when(app_0).containerCompleted(any(RMContainer.class), any(ContainerStatus.class), any(RMContainerEventType.class), any(String.class));
    Priority priority = TestUtils.createMockPriority(1);
    // Build a mocked RMContext that supplies the collaborators passed to RMContainerImpl below.
    ContainerAllocationExpirer expirer = mock(ContainerAllocationExpirer.class);
    DrainDispatcher drainDispatcher = new DrainDispatcher();
    RMApplicationHistoryWriter writer = mock(RMApplicationHistoryWriter.class);
    SystemMetricsPublisher publisher = mock(SystemMetricsPublisher.class);
    RMContext rmContext = mock(RMContext.class);
    when(rmContext.getContainerAllocationExpirer()).thenReturn(expirer);
    when(rmContext.getDispatcher()).thenReturn(drainDispatcher);
    when(rmContext.getRMApplicationHistoryWriter()).thenReturn(writer);
    when(rmContext.getSystemMetricsPublisher()).thenReturn(publisher);
    when(rmContext.getYarnConfiguration()).thenReturn(new YarnConfiguration());
    // A single 1GB RMContainer is created once and reused for every completedContainer() call below.
    ApplicationAttemptId appAttemptId = BuilderUtils.newApplicationAttemptId(app_0.getApplicationId(), 1);
    ContainerId containerId = BuilderUtils.newContainerId(appAttemptId, 1);
    Container container = TestUtils.getMockContainer(containerId, node_0.getNodeID(), Resources.createResource(1 * GB), priority);
    RMContainer rmContainer = new RMContainerImpl(container, SchedulerRequestKey.extractFrom(container), appAttemptId, node_0.getNodeID(), "user", rmContext);
    // Assign {1,2,3,4} 1GB containers respectively to queues A, B, C, D:
    // each round stubs exactly one queue to allocate 1GB and the others 0GB.
    stubQueueAllocation(a, clusterResource, node_0, 1 * GB);
    stubQueueAllocation(b, clusterResource, node_0, 0 * GB);
    stubQueueAllocation(c, clusterResource, node_0, 0 * GB);
    stubQueueAllocation(d, clusterResource, node_0, 0 * GB);
    root.assignContainers(clusterResource, node_0, new ResourceLimits(clusterResource), SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY);
    // Two rounds in which only B allocates
    for (int i = 0; i < 2; i++) {
        stubQueueAllocation(a, clusterResource, node_0, 0 * GB);
        stubQueueAllocation(b, clusterResource, node_0, 1 * GB);
        stubQueueAllocation(c, clusterResource, node_0, 0 * GB);
        stubQueueAllocation(d, clusterResource, node_0, 0 * GB);
        root.assignContainers(clusterResource, node_0, new ResourceLimits(clusterResource), SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY);
    }
    // Three rounds in which only C allocates
    for (int i = 0; i < 3; i++) {
        stubQueueAllocation(a, clusterResource, node_0, 0 * GB);
        stubQueueAllocation(b, clusterResource, node_0, 0 * GB);
        stubQueueAllocation(c, clusterResource, node_0, 1 * GB);
        stubQueueAllocation(d, clusterResource, node_0, 0 * GB);
        root.assignContainers(clusterResource, node_0, new ResourceLimits(clusterResource), SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY);
    }
    // Four rounds in which only D allocates
    for (int i = 0; i < 4; i++) {
        stubQueueAllocation(a, clusterResource, node_0, 0 * GB);
        stubQueueAllocation(b, clusterResource, node_0, 0 * GB);
        stubQueueAllocation(c, clusterResource, node_0, 0 * GB);
        stubQueueAllocation(d, clusterResource, node_0, 1 * GB);
        root.assignContainers(clusterResource, node_0, new ResourceLimits(clusterResource), SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY);
    }
    verifyQueueMetrics(a, 1 * GB, clusterResource);
    verifyQueueMetrics(b, 2 * GB, clusterResource);
    verifyQueueMetrics(c, 3 * GB, clusterResource);
    verifyQueueMetrics(d, 4 * GB, clusterResource);
    LOG.info("status child-queues: " + ((ParentQueue) root).getChildQueuesToPrint());
    // Release 3 x 1GB containers from D
    for (int i = 0; i < 3; i++) {
        d.completedContainer(clusterResource, app_0, node_0, rmContainer, null, RMContainerEventType.KILL, null, true);
    }
    verifyQueueMetrics(a, 1 * GB, clusterResource);
    verifyQueueMetrics(b, 2 * GB, clusterResource);
    verifyQueueMetrics(c, 3 * GB, clusterResource);
    verifyQueueMetrics(d, 1 * GB, clusterResource);
    // Manually reset the node's resources: a fresh mock node is created whose
    // memory is the total minus the 1+2+3+1 GB just verified for A/B/C/D.
    node_0 = TestUtils.getMockNode("host_0", DEFAULT_RACK, 0, (memoryPerNode - 1 - 2 - 3 - 1) * GB);
    LOG.info("status child-queues: " + ((ParentQueue) root).getChildQueuesToPrint());
    // Assign 2 x 1GB containers to A
    for (int i = 0; i < 2; i++) {
        stubQueueAllocation(a, clusterResource, node_0, 1 * GB);
        stubQueueAllocation(b, clusterResource, node_0, 0 * GB);
        stubQueueAllocation(c, clusterResource, node_0, 0 * GB);
        stubQueueAllocation(d, clusterResource, node_0, 0 * GB);
        root.assignContainers(clusterResource, node_0, new ResourceLimits(clusterResource), SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY);
    }
    verifyQueueMetrics(a, 3 * GB, clusterResource);
    verifyQueueMetrics(b, 2 * GB, clusterResource);
    verifyQueueMetrics(c, 3 * GB, clusterResource);
    verifyQueueMetrics(d, 1 * GB, clusterResource);
    LOG.info("status child-queues: " + ((ParentQueue) root).getChildQueuesToPrint());
    // Release 1GB container from A
    a.completedContainer(clusterResource, app_0, node_0, rmContainer, null, RMContainerEventType.KILL, null, true);
    verifyQueueMetrics(a, 2 * GB, clusterResource);
    verifyQueueMetrics(b, 2 * GB, clusterResource);
    verifyQueueMetrics(c, 3 * GB, clusterResource);
    verifyQueueMetrics(d, 1 * GB, clusterResource);
    // Manually reset the node's resources (total minus 2+2+3+1 GB)
    node_0 = TestUtils.getMockNode("host_0", DEFAULT_RACK, 0, (memoryPerNode - 2 - 2 - 3 - 1) * GB);
    LOG.info("status child-queues: " + ((ParentQueue) root).getChildQueuesToPrint());
    // Assign 1GB container to B
    stubQueueAllocation(a, clusterResource, node_0, 0 * GB);
    stubQueueAllocation(b, clusterResource, node_0, 1 * GB);
    stubQueueAllocation(c, clusterResource, node_0, 0 * GB);
    stubQueueAllocation(d, clusterResource, node_0, 0 * GB);
    root.assignContainers(clusterResource, node_0, new ResourceLimits(clusterResource), SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY);
    verifyQueueMetrics(a, 2 * GB, clusterResource);
    verifyQueueMetrics(b, 3 * GB, clusterResource);
    verifyQueueMetrics(c, 3 * GB, clusterResource);
    verifyQueueMetrics(d, 1 * GB, clusterResource);
    LOG.info("status child-queues: " + ((ParentQueue) root).getChildQueuesToPrint());
    // Release 1GB container resources from B
    b.completedContainer(clusterResource, app_0, node_0, rmContainer, null, RMContainerEventType.KILL, null, true);
    verifyQueueMetrics(a, 2 * GB, clusterResource);
    verifyQueueMetrics(b, 2 * GB, clusterResource);
    verifyQueueMetrics(c, 3 * GB, clusterResource);
    verifyQueueMetrics(d, 1 * GB, clusterResource);
    // Manually reset the node's resources (total minus 2+2+3+1 GB)
    node_0 = TestUtils.getMockNode("host_0", DEFAULT_RACK, 0, (memoryPerNode - 2 - 2 - 3 - 1) * GB);
    LOG.info("status child-queues: " + ((ParentQueue) root).getChildQueuesToPrint());
    // Assign 1GB container to A
    stubQueueAllocation(a, clusterResource, node_0, 1 * GB);
    stubQueueAllocation(b, clusterResource, node_0, 0 * GB);
    stubQueueAllocation(c, clusterResource, node_0, 0 * GB);
    stubQueueAllocation(d, clusterResource, node_0, 0 * GB);
    root.assignContainers(clusterResource, node_0, new ResourceLimits(clusterResource), SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY);
    verifyQueueMetrics(a, 3 * GB, clusterResource);
    verifyQueueMetrics(b, 2 * GB, clusterResource);
    verifyQueueMetrics(c, 3 * GB, clusterResource);
    verifyQueueMetrics(d, 1 * GB, clusterResource);
    LOG.info("status child-queues: " + ((ParentQueue) root).getChildQueuesToPrint());
    // Now do the real test, where B and D request a 1GB container.
    // D should get the next container if the order is correct.
    stubQueueAllocation(a, clusterResource, node_0, 0 * GB);
    stubQueueAllocation(b, clusterResource, node_0, 1 * GB);
    stubQueueAllocation(c, clusterResource, node_0, 0 * GB);
    stubQueueAllocation(d, clusterResource, node_0, 1 * GB);
    root.assignContainers(clusterResource, node_0, new ResourceLimits(clusterResource), SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY);
    // InOrder verification: assignContainers must have been called on D before B.
    InOrder allocationOrder = inOrder(d, b);
    allocationOrder.verify(d).assignContainers(eq(clusterResource), any(PlacementSet.class), any(ResourceLimits.class), any(SchedulingMode.class));
    allocationOrder.verify(b).assignContainers(eq(clusterResource), any(PlacementSet.class), any(ResourceLimits.class), any(SchedulingMode.class));
    verifyQueueMetrics(a, 3 * GB, clusterResource);
    verifyQueueMetrics(b, 2 * GB, clusterResource);
    verifyQueueMetrics(c, 3 * GB, clusterResource);
    // D got the container
    verifyQueueMetrics(d, 2 * GB, clusterResource);
    LOG.info("status child-queues: " + ((ParentQueue) root).getChildQueuesToPrint());
}

Also used: DrainDispatcher(org.apache.hadoop.yarn.event.DrainDispatcher) HashMap(java.util.HashMap) ContainerAllocationExpirer(org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.ContainerAllocationExpirer) RMContainer(org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer) ContainerStatus(org.apache.hadoop.yarn.api.records.ContainerStatus) Container(org.apache.hadoop.yarn.api.records.Container) RMContainerImpl(org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerImpl) ContainerId(org.apache.hadoop.yarn.api.records.ContainerId) SystemMetricsPublisher(org.apache.hadoop.yarn.server.resourcemanager.metrics.SystemMetricsPublisher) YarnConfiguration(org.apache.hadoop.yarn.conf.YarnConfiguration) FiCaSchedulerApp(org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp) PlacementSet(org.apache.hadoop.yarn.server.resourcemanager.scheduler.placement.PlacementSet) RMContext(org.apache.hadoop.yarn.server.resourcemanager.RMContext) InOrder(org.mockito.InOrder) FiCaSchedulerNode(org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerNode) Priority(org.apache.hadoop.yarn.api.records.Priority) Resource(org.apache.hadoop.yarn.api.records.Resource) RMApplicationHistoryWriter(org.apache.hadoop.yarn.server.resourcemanager.ahs.RMApplicationHistoryWriter) ApplicationAttemptId(org.apache.hadoop.yarn.api.records.ApplicationAttemptId) RMContainerEventType(org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerEventType) ResourceLimits(org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceLimits) Test(org.junit.Test)

Example 2 with DrainDispatcher

Use of org.apache.hadoop.yarn.event.DrainDispatcher in the Apache Hadoop project.

In class TestNodeBlacklistingOnAMFailures, method testNodeBlacklistingOnAMFailure.

/**
 * Checks that a node is blacklisted for AM placement after an AM container
 * running on it exits with {@code ContainerExitStatus.INVALID}.
 *
 * <p>Flow: start an RM with AM node blacklisting enabled, register two nodes,
 * launch an AM, register three more nodes, then make the AM container exit.
 * The new attempt must stay {@code SCHEDULED} while only the original node
 * heartbeats, and must be allocated once the other node heartbeats. Finally a
 * non-AM container is requested on the original node to show that only AM
 * placement is restricted there.
 *
 * <p>The RM is stopped in a {@code finally} block so that it is shut down even
 * when an assertion or wait fails.
 *
 * <p>NOTE(review): {@code startRM} and {@code makeAMContainerExit} are helpers
 * defined outside this snippet; {@code startRM} is assumed to return a started
 * {@code MockRM} wired to the given dispatcher.
 */
@Test(timeout = 100000)
public void testNodeBlacklistingOnAMFailure() throws Exception {
    YarnConfiguration conf = new YarnConfiguration();
    conf.setClass(YarnConfiguration.RM_SCHEDULER, CapacityScheduler.class, ResourceScheduler.class);
    conf.setBoolean(YarnConfiguration.AM_SCHEDULING_NODE_BLACKLISTING_ENABLED, true);
    DrainDispatcher dispatcher = new DrainDispatcher();
    MockRM rm = startRM(conf, dispatcher);
    try {
        CapacityScheduler scheduler = (CapacityScheduler) rm.getResourceScheduler();
        // Register 5 nodes, so that we can blacklist at least one if the AM
        // container fails. As per calculation: 5 nodes * 0.2 (default) = 1.
        // First register 2 nodes, and after the AM is launched register 3 more.
        MockNM nm1 = new MockNM("127.0.0.1:1234", 8000, rm.getResourceTrackerService());
        nm1.registerNode();
        MockNM nm2 = new MockNM("127.0.0.2:2345", 8000, rm.getResourceTrackerService());
        nm2.registerNode();
        RMApp app = rm.submitApp(200);
        MockAM am1 = MockRM.launchAndRegisterAM(app, rm, nm1);
        ContainerId amContainerId = ContainerId.newContainerId(am1.getApplicationAttemptId(), 1);
        RMContainer rmContainer = scheduler.getRMContainer(amContainerId);
        NodeId nodeWhereAMRan = rmContainer.getAllocatedNode();
        // Work out which of the two initial nodes actually hosted the AM.
        MockNM currentNode, otherNode;
        if (nodeWhereAMRan.equals(nm1.getNodeId())) {
            currentNode = nm1;
            otherNode = nm2;
        } else {
            currentNode = nm2;
            otherNode = nm1;
        }
        // register 3 nodes now
        MockNM nm3 = new MockNM("127.0.0.3:2345", 8000, rm.getResourceTrackerService());
        nm3.registerNode();
        MockNM nm4 = new MockNM("127.0.0.4:2345", 8000, rm.getResourceTrackerService());
        nm4.registerNode();
        MockNM nm5 = new MockNM("127.0.0.5:2345", 8000, rm.getResourceTrackerService());
        nm5.registerNode();
        // Set the exit status to INVALID so that we can verify that the system
        // automatically blacklists the node
        makeAMContainerExit(rm, amContainerId, currentNode, ContainerExitStatus.INVALID);
        // restart the am
        RMAppAttempt attempt = MockRM.waitForAttemptScheduled(app, rm);
        System.out.println("New AppAttempt launched " + attempt.getAppAttemptId());
        // Try the current node a few times
        for (int i = 0; i <= 2; i++) {
            currentNode.nodeHeartbeat(true);
            // Wait on the dispatcher before inspecting the attempt state.
            dispatcher.await();
            Assert.assertEquals("AppAttemptState should still be SCHEDULED if currentNode is " + "blacklisted correctly", RMAppAttemptState.SCHEDULED, attempt.getAppAttemptState());
        }
        // Now try the other node
        otherNode.nodeHeartbeat(true);
        dispatcher.await();
        // Now the AM container should be allocated
        MockRM.waitForState(attempt, RMAppAttemptState.ALLOCATED, 20000);
        MockAM am2 = rm.sendAMLaunched(attempt.getAppAttemptId());
        rm.waitForState(attempt.getAppAttemptId(), RMAppAttemptState.LAUNCHED);
        amContainerId = ContainerId.newContainerId(am2.getApplicationAttemptId(), 1);
        rmContainer = scheduler.getRMContainer(amContainerId);
        nodeWhereAMRan = rmContainer.getAllocatedNode();
        // The other node should now receive the assignment
        Assert.assertEquals("After blacklisting, AM should have run on the other node", otherNode.getNodeId(), nodeWhereAMRan);
        am2.registerAppAttempt();
        rm.waitForState(app.getApplicationId(), RMAppState.RUNNING);
        List<Container> allocatedContainers = TestAMRestart.allocateContainers(currentNode, am2, 1);
        Assert.assertEquals("Even though AM is blacklisted from the node, application can " + "still allocate non-AM containers there", currentNode.getNodeId(), allocatedContainers.get(0).getNodeId());
    } finally {
        // Always shut the RM down so it does not outlive this test.
        rm.stop();
    }
}

Also used: DrainDispatcher(org.apache.hadoop.yarn.event.DrainDispatcher) RMApp(org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp) RMAppAttempt(org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt) RMContainer(org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer) Container(org.apache.hadoop.yarn.api.records.Container) YarnConfiguration(org.apache.hadoop.yarn.conf.YarnConfiguration) ContainerId(org.apache.hadoop.yarn.api.records.ContainerId) NodeId(org.apache.hadoop.yarn.api.records.NodeId) CapacityScheduler(org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler) Test(org.junit.Test)

Example 3 with DrainDispatcher

Use of org.apache.hadoop.yarn.event.DrainDispatcher in the Apache Hadoop project.

In class TestApplicationCleanup, method testContainerCleanupWhenRMRestartedAppNotRegistered.

/**
 * Exercises container cleanup when a second RM starts from the same state
 * store and the application's attempt is not registered with its scheduler.
 *
 * <p>Flow: start {@code rm1} with an in-memory state store, run an app to
 * {@code RUNNING}, then start {@code rm2} on the same store. The node
 * re-registers with {@code rm2} and reports a running container for the old
 * attempt; the test then waits for that container to be cleaned up.
 *
 * <p>{@code rm1} is not stopped before {@code rm2} starts; both are stopped
 * in a {@code finally} block so they are shut down even if a wait or
 * assertion fails part-way through.
 *
 * <p>NOTE(review): {@code conf}, {@code launchAM} and
 * {@code waitForContainerCleanup} are defined outside this snippet.
 */
@SuppressWarnings("resource")
@Test(timeout = 60000)
public void testContainerCleanupWhenRMRestartedAppNotRegistered() throws Exception {
    conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, 1);
    MemoryRMStateStore memStore = new MemoryRMStateStore();
    memStore.init(conf);
    // start RM
    final DrainDispatcher dispatcher = new DrainDispatcher();
    MockRM rm1 = new MockRM(conf, memStore) {

        @Override
        protected Dispatcher createDispatcher() {
            return dispatcher;
        }
    };
    // Assigned once the second RM has been constructed; stays null if the test fails earlier.
    MockRM rm2 = null;
    try {
        rm1.start();
        MockNM nm1 = new MockNM("127.0.0.1:1234", 15120, rm1.getResourceTrackerService());
        nm1.registerNode();
        // create app and launch the AM
        RMApp app0 = rm1.submitApp(200);
        MockAM am0 = launchAM(app0, rm1, nm1);
        nm1.nodeHeartbeat(am0.getApplicationAttemptId(), 1, ContainerState.RUNNING);
        rm1.waitForState(app0.getApplicationId(), RMAppState.RUNNING);
        // start new RM
        final DrainDispatcher dispatcher2 = new DrainDispatcher();
        rm2 = new MockRM(conf, memStore) {

            @Override
            protected Dispatcher createDispatcher() {
                return dispatcher2;
            }
        };
        rm2.start();
        // nm1 register to rm2, and do a heartbeat
        nm1.setResourceTrackerService(rm2.getResourceTrackerService());
        nm1.registerNode(Arrays.asList(app0.getApplicationId()));
        rm2.waitForState(app0.getApplicationId(), RMAppState.ACCEPTED);
        // Add unknown container for application unknown to scheduler
        NodeHeartbeatResponse response = nm1.nodeHeartbeat(am0.getApplicationAttemptId(), 2, ContainerState.RUNNING);
        waitForContainerCleanup(dispatcher2, nm1, response);
    } finally {
        // Same stop order as before (rm1, then rm2); the nested finally makes
        // sure rm2 is still stopped if stopping rm1 throws.
        try {
            rm1.stop();
        } finally {
            if (rm2 != null) {
                rm2.stop();
            }
        }
    }
}

Also used : DrainDispatcher(org.apache.hadoop.yarn.event.DrainDispatcher) RMApp(org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp) MemoryRMStateStore(org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore) NodeHeartbeatResponse(org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse) Test(org.junit.Test)

Example 4 with DrainDispatcher

Use of org.apache.hadoop.yarn.event.DrainDispatcher in the Apache Hadoop project.

In class TestLogAggregationService, method createDispatcher.

/**
 * Builds a {@link DrainDispatcher}, initialises it with this test's
 * configuration and starts it.
 *
 * @return the initialised and started dispatcher
 */
private DrainDispatcher createDispatcher() {
    final DrainDispatcher drain = new DrainDispatcher();
    // init() must precede start(): same call order as before.
    drain.init(this.conf);
    drain.start();
    return drain;
}

Also used : DrainDispatcher(org.apache.hadoop.yarn.event.DrainDispatcher)

Example 5 with DrainDispatcher

Use of org.apache.hadoop.yarn.event.DrainDispatcher in the Apache Hadoop project.

In class TestNonAggregatingLogHandler, method createDispatcher.

/**
 * Builds a {@link DrainDispatcher}, initialises it with the supplied
 * configuration and starts it.
 *
 * @param conf configuration passed to the dispatcher's {@code init}
 * @return the initialised and started dispatcher
 */
private DrainDispatcher createDispatcher(Configuration conf) {
    final DrainDispatcher drain = new DrainDispatcher();
    // init() must precede start(): same call order as before.
    drain.init(conf);
    drain.start();
    return drain;
}

Also used : DrainDispatcher(org.apache.hadoop.yarn.event.DrainDispatcher)

Aggregations

DrainDispatcher (org.apache.hadoop.yarn.event.DrainDispatcher): 84; Test (org.junit.Test): 73; YarnConfiguration (org.apache.hadoop.yarn.conf.YarnConfiguration): 54; Configuration (org.apache.hadoop.conf.Configuration): 50; RMApp (org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp): 41; ApplicationAttemptId (org.apache.hadoop.yarn.api.records.ApplicationAttemptId): 35; MockNM (org.apache.hadoop.yarn.server.resourcemanager.MockNM): 32; Path (org.apache.hadoop.fs.Path): 24; ApplicationId (org.apache.hadoop.yarn.api.records.ApplicationId): 24; Job (org.apache.hadoop.mapreduce.v2.app.job.Job): 23; JobId (org.apache.hadoop.mapreduce.v2.api.records.JobId): 21; ContainerId (org.apache.hadoop.yarn.api.records.ContainerId): 20; IOException (java.io.IOException): 16; ArrayList (java.util.ArrayList): 16; DeletionService (org.apache.hadoop.yarn.server.nodemanager.DeletionService): 16; LocalizerEvent (org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.event.LocalizerEvent): 15; HashMap (java.util.HashMap): 14; LocalDirsHandlerService (org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService): 14; LocalResourceVisibility (org.apache.hadoop.yarn.api.records.LocalResourceVisibility): 12; Container (org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container): 12