Search in sources :

Example 41 with RMNode

use of org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode in project hadoop by apache.

the class TestNodeLabelContainerAllocation method testAMContainerAllocationWillAlwaysBeExclusive.

@Test
public void testAMContainerAllocationWillAlwaysBeExclusive() throws Exception {
    /**
     * Test case: Submit one application without partition, trying to allocate a
     * node has partition=x, it should fail to allocate since AM container will
     * always respect exclusivity for partitions
     */
    // set node -> label
    mgr.addToCluserNodeLabels(ImmutableSet.of(NodeLabel.newInstance("x", false), NodeLabel.newInstance("y")));
    mgr.addLabelsToNode(ImmutableMap.of(NodeId.newInstance("h1", 0), toSet("x")));
    // inject node label manager
    MockRM rm1 = new MockRM(TestUtils.getConfigurationWithQueueLabels(conf)) {

        @Override
        public RMNodeLabelsManager createNodeLabelManager() {
            return mgr;
        }
    };
    rm1.getRMContext().setNodeLabelManager(mgr);
    rm1.start();
    String nodeIdStr = "h1:1234";
    // label = x
    MockNM nm1 = rm1.registerNode(nodeIdStr, 8 * GB);
    // launch an app to queue b1 (label = y), AM container should be launched in nm3
    RMApp app = rm1.submitApp(1 * GB, "app", "user", null, "b1");
    CapacityScheduler cs = (CapacityScheduler) rm1.getResourceScheduler();
    RMNode rmNode1 = rm1.getRMContext().getRMNodes().get(nm1.getNodeId());
    // Heartbeat for many times, app1 should get nothing
    for (int i = 0; i < 50; i++) {
        cs.handle(new NodeUpdateSchedulerEvent(rmNode1));
    }
    Assert.assertTrue("Scheduler diagnostics should have reason for not assigning the node", app.getDiagnostics().toString().contains(CSAMContainerLaunchDiagnosticsConstants.SKIP_AM_ALLOCATION_IN_IGNORE_EXCLUSIVE_MODE));
    Assert.assertTrue("Scheduler diagnostics should have last processed node information", app.getDiagnostics().toString().contains(CSAMContainerLaunchDiagnosticsConstants.LAST_NODE_PROCESSED_MSG + nodeIdStr + " ( Partition : [x]"));
    Assert.assertEquals(0, cs.getSchedulerNode(nm1.getNodeId()).getNumContainers());
    rm1.close();
}
Also used : RMApp(org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp) NodeUpdateSchedulerEvent(org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeUpdateSchedulerEvent) RMNode(org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode) MockNM(org.apache.hadoop.yarn.server.resourcemanager.MockNM) MockRM(org.apache.hadoop.yarn.server.resourcemanager.MockRM) Test(org.junit.Test)

Example 42 with RMNode

use of org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode in project hadoop by apache.

the class TestNodeLabelContainerAllocation method testPreferenceOfNeedyAppsTowardsNodePartitions.

@Test
public void testPreferenceOfNeedyAppsTowardsNodePartitions() throws Exception {
    /**
     * Test case: Submit two application to a queue (app1 first then app2), app1
     * asked for no-label, app2 asked for label=x, when node1 has label=x
     * doing heart beat, app2 will get allocation first, even if app2 submits later
     * than app1
     */
    // set node -> label
    mgr.addToCluserNodeLabels(ImmutableSet.of(NodeLabel.newInstance("x"), NodeLabel.newInstance("y", false)));
    mgr.addLabelsToNode(ImmutableMap.of(NodeId.newInstance("h1", 0), toSet("y")));
    // inject node label manager
    MockRM rm1 = new MockRM(TestUtils.getConfigurationWithQueueLabels(conf)) {

        @Override
        public RMNodeLabelsManager createNodeLabelManager() {
            return mgr;
        }
    };
    rm1.getRMContext().setNodeLabelManager(mgr);
    rm1.start();
    // label = y
    MockNM nm1 = rm1.registerNode("h1:1234", 8 * GB);
    // label = <empty>
    MockNM nm2 = rm1.registerNode("h2:1234", 100 * GB);
    // launch an app to queue b1 (label = y), AM container should be launched in nm2
    RMApp app1 = rm1.submitApp(1 * GB, "app", "user", null, "b1");
    MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm2);
    // launch another app to queue b1 (label = y), AM container should be launched in nm2
    RMApp app2 = rm1.submitApp(1 * GB, "app", "user", null, "b1");
    MockAM am2 = MockRM.launchAndRegisterAM(app2, rm1, nm2);
    // request container and nm1 do heartbeat (nm2 has label=y), note that app1
    // request non-labeled container, and app2 request labeled container, app2
    // will get allocated first even if app1 submitted first.  
    am1.allocate("*", 1 * GB, 8, new ArrayList<ContainerId>());
    am2.allocate("*", 1 * GB, 8, new ArrayList<ContainerId>(), "y");
    CapacityScheduler cs = (CapacityScheduler) rm1.getResourceScheduler();
    RMNode rmNode1 = rm1.getRMContext().getRMNodes().get(nm1.getNodeId());
    RMNode rmNode2 = rm1.getRMContext().getRMNodes().get(nm2.getNodeId());
    // Do node heartbeats many times
    for (int i = 0; i < 50; i++) {
        cs.handle(new NodeUpdateSchedulerEvent(rmNode1));
        cs.handle(new NodeUpdateSchedulerEvent(rmNode2));
    }
    // App2 will get preference to be allocated on node1, and node1 will be all
    // used by App2.
    FiCaSchedulerApp schedulerApp1 = cs.getApplicationAttempt(am1.getApplicationAttemptId());
    FiCaSchedulerApp schedulerApp2 = cs.getApplicationAttempt(am2.getApplicationAttemptId());
    // app1 get nothing in nm1 (partition=y)
    checkNumOfContainersInAnAppOnGivenNode(0, nm1.getNodeId(), schedulerApp1);
    checkNumOfContainersInAnAppOnGivenNode(9, nm2.getNodeId(), schedulerApp1);
    // app2 get all resource in nm1 (partition=y)
    checkNumOfContainersInAnAppOnGivenNode(8, nm1.getNodeId(), schedulerApp2);
    checkNumOfContainersInAnAppOnGivenNode(1, nm2.getNodeId(), schedulerApp2);
    rm1.close();
}
Also used : RMApp(org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp) NodeUpdateSchedulerEvent(org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeUpdateSchedulerEvent) RMNode(org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode) ContainerId(org.apache.hadoop.yarn.api.records.ContainerId) MockNM(org.apache.hadoop.yarn.server.resourcemanager.MockNM) FiCaSchedulerApp(org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp) MockAM(org.apache.hadoop.yarn.server.resourcemanager.MockAM) MockRM(org.apache.hadoop.yarn.server.resourcemanager.MockRM) Test(org.junit.Test)

Example 43 with RMNode

use of org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode in project hadoop by apache.

the class TestMiniYarnClusterNodeUtilization method verifySimulatedUtilization.

/**
   * Verify both the RMNode and SchedulerNode have been updated with the test
   * fixture utilization data.
   */
private void verifySimulatedUtilization() throws InterruptedException {
    ResourceManager rm = cluster.getResourceManager(0);
    RMContext rmContext = rm.getRMContext();
    ResourceUtilization containersUtilization = nodeStatus.getContainersUtilization();
    ResourceUtilization nodeUtilization = nodeStatus.getNodeUtilization();
    // We check if the nodeUtilization is up to date
    for (int i = 0; i < 100; i++) {
        for (RMNode ni : rmContext.getRMNodes().values()) {
            if (ni.getNodeUtilization() != null) {
                if (ni.getNodeUtilization().equals(nodeUtilization)) {
                    break;
                }
            }
        }
        Thread.sleep(100);
    }
    // Verify the data is readable from the RM and scheduler nodes
    for (RMNode ni : rmContext.getRMNodes().values()) {
        ResourceUtilization cu = ni.getAggregatedContainersUtilization();
        assertEquals("Containers Utillization not propagated to RMNode", containersUtilization, cu);
        ResourceUtilization nu = ni.getNodeUtilization();
        assertEquals("Node Utillization not propagated to RMNode", nodeUtilization, nu);
        SchedulerNode scheduler = rmContext.getScheduler().getSchedulerNode(ni.getNodeID());
        cu = scheduler.getAggregatedContainersUtilization();
        assertEquals("Containers Utillization not propagated to SchedulerNode", containersUtilization, cu);
        nu = scheduler.getNodeUtilization();
        assertEquals("Node Utillization not propagated to SchedulerNode", nodeUtilization, nu);
    }
}
Also used : RMContext(org.apache.hadoop.yarn.server.resourcemanager.RMContext) RMNode(org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode) SchedulerNode(org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerNode) ResourceUtilization(org.apache.hadoop.yarn.api.records.ResourceUtilization) ResourceManager(org.apache.hadoop.yarn.server.resourcemanager.ResourceManager)

Example 44 with RMNode

use of org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode in project hadoop by apache.

the class ResourceTrackerService method unRegisterNodeManager.

@SuppressWarnings("unchecked")
@Override
public UnRegisterNodeManagerResponse unRegisterNodeManager(UnRegisterNodeManagerRequest request) throws YarnException, IOException {
    UnRegisterNodeManagerResponse response = recordFactory.newRecordInstance(UnRegisterNodeManagerResponse.class);
    NodeId nodeId = request.getNodeId();
    RMNode rmNode = this.rmContext.getRMNodes().get(nodeId);
    if (rmNode == null) {
        LOG.info("Node not found, ignoring the unregister from node id : " + nodeId);
        return response;
    }
    LOG.info("Node with node id : " + nodeId + " has shutdown, hence unregistering the node.");
    this.nmLivelinessMonitor.unregister(nodeId);
    this.rmContext.getDispatcher().getEventHandler().handle(new RMNodeEvent(nodeId, RMNodeEventType.SHUTDOWN));
    return response;
}
Also used : RMNode(org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode) NodeId(org.apache.hadoop.yarn.api.records.NodeId) UnRegisterNodeManagerResponse(org.apache.hadoop.yarn.server.api.protocolrecords.UnRegisterNodeManagerResponse) RMNodeEvent(org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeEvent)

Example 45 with RMNode

use of org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode in project hadoop by apache.

the class TestOpportunisticContainerAllocatorAMService method testContainerPromoteAndDemoteBeforeContainerStart.

@Test(timeout = 600000)
public void testContainerPromoteAndDemoteBeforeContainerStart() throws Exception {
    HashMap<NodeId, MockNM> nodes = new HashMap<>();
    MockNM nm1 = new MockNM("h1:1234", 4096, rm.getResourceTrackerService());
    nodes.put(nm1.getNodeId(), nm1);
    MockNM nm2 = new MockNM("h1:4321", 4096, rm.getResourceTrackerService());
    nodes.put(nm2.getNodeId(), nm2);
    MockNM nm3 = new MockNM("h2:1234", 4096, rm.getResourceTrackerService());
    nodes.put(nm3.getNodeId(), nm3);
    MockNM nm4 = new MockNM("h2:4321", 4096, rm.getResourceTrackerService());
    nodes.put(nm4.getNodeId(), nm4);
    nm1.registerNode();
    nm2.registerNode();
    nm3.registerNode();
    nm4.registerNode();
    OpportunisticContainerAllocatorAMService amservice = (OpportunisticContainerAllocatorAMService) rm.getApplicationMasterService();
    RMApp app1 = rm.submitApp(1 * GB, "app", "user", null, "default");
    ApplicationAttemptId attemptId = app1.getCurrentAppAttempt().getAppAttemptId();
    MockAM am1 = MockRM.launchAndRegisterAM(app1, rm, nm2);
    ResourceScheduler scheduler = rm.getResourceScheduler();
    RMNode rmNode1 = rm.getRMContext().getRMNodes().get(nm1.getNodeId());
    RMNode rmNode2 = rm.getRMContext().getRMNodes().get(nm2.getNodeId());
    RMNode rmNode3 = rm.getRMContext().getRMNodes().get(nm3.getNodeId());
    RMNode rmNode4 = rm.getRMContext().getRMNodes().get(nm4.getNodeId());
    nm1.nodeHeartbeat(true);
    nm2.nodeHeartbeat(true);
    nm3.nodeHeartbeat(true);
    nm4.nodeHeartbeat(true);
    ((RMNodeImpl) rmNode1).setOpportunisticContainersStatus(getOppurtunisticStatus(-1, 100));
    ((RMNodeImpl) rmNode2).setOpportunisticContainersStatus(getOppurtunisticStatus(-1, 100));
    ((RMNodeImpl) rmNode3).setOpportunisticContainersStatus(getOppurtunisticStatus(-1, 100));
    ((RMNodeImpl) rmNode4).setOpportunisticContainersStatus(getOppurtunisticStatus(-1, 100));
    OpportunisticContainerContext ctxt = ((CapacityScheduler) scheduler).getApplicationAttempt(attemptId).getOpportunisticContainerContext();
    // Send add and update node events to AM Service.
    amservice.handle(new NodeAddedSchedulerEvent(rmNode1));
    amservice.handle(new NodeAddedSchedulerEvent(rmNode2));
    amservice.handle(new NodeAddedSchedulerEvent(rmNode3));
    amservice.handle(new NodeAddedSchedulerEvent(rmNode4));
    amservice.handle(new NodeUpdateSchedulerEvent(rmNode1));
    amservice.handle(new NodeUpdateSchedulerEvent(rmNode2));
    amservice.handle(new NodeUpdateSchedulerEvent(rmNode3));
    amservice.handle(new NodeUpdateSchedulerEvent(rmNode4));
    // All nodes 1 - 4 will be applicable for scheduling.
    nm1.nodeHeartbeat(true);
    nm2.nodeHeartbeat(true);
    nm3.nodeHeartbeat(true);
    nm4.nodeHeartbeat(true);
    Thread.sleep(1000);
    QueueMetrics metrics = ((CapacityScheduler) scheduler).getRootQueue().getMetrics();
    // Verify Metrics
    verifyMetrics(metrics, 15360, 15, 1024, 1, 1);
    AllocateResponse allocateResponse = am1.allocate(Arrays.asList(ResourceRequest.newInstance(Priority.newInstance(1), "*", Resources.createResource(1 * GB), 2, true, null, ExecutionTypeRequest.newInstance(ExecutionType.OPPORTUNISTIC, true))), null);
    List<Container> allocatedContainers = allocateResponse.getAllocatedContainers();
    Assert.assertEquals(2, allocatedContainers.size());
    Container container = allocatedContainers.get(0);
    MockNM allocNode = nodes.get(container.getNodeId());
    MockNM sameHostDiffNode = null;
    for (NodeId n : nodes.keySet()) {
        if (n.getHost().equals(allocNode.getNodeId().getHost()) && n.getPort() != allocNode.getNodeId().getPort()) {
            sameHostDiffNode = nodes.get(n);
        }
    }
    // Verify Metrics After OPP allocation (Nothing should change)
    verifyMetrics(metrics, 15360, 15, 1024, 1, 1);
    am1.sendContainerUpdateRequest(Arrays.asList(UpdateContainerRequest.newInstance(0, container.getId(), ContainerUpdateType.PROMOTE_EXECUTION_TYPE, null, ExecutionType.GUARANTEED)));
    // Node on same host should not result in allocation
    sameHostDiffNode.nodeHeartbeat(true);
    Thread.sleep(200);
    allocateResponse = am1.allocate(new ArrayList<>(), new ArrayList<>());
    Assert.assertEquals(0, allocateResponse.getUpdatedContainers().size());
    // Verify Metrics After OPP allocation (Nothing should change again)
    verifyMetrics(metrics, 15360, 15, 1024, 1, 1);
    // Send Promotion req again... this should result in update error
    allocateResponse = am1.sendContainerUpdateRequest(Arrays.asList(UpdateContainerRequest.newInstance(0, container.getId(), ContainerUpdateType.PROMOTE_EXECUTION_TYPE, null, ExecutionType.GUARANTEED)));
    Assert.assertEquals(0, allocateResponse.getUpdatedContainers().size());
    Assert.assertEquals(1, allocateResponse.getUpdateErrors().size());
    Assert.assertEquals("UPDATE_OUTSTANDING_ERROR", allocateResponse.getUpdateErrors().get(0).getReason());
    Assert.assertEquals(container.getId(), allocateResponse.getUpdateErrors().get(0).getUpdateContainerRequest().getContainerId());
    // Send Promotion req again with incorrect version...
    // this should also result in update error
    allocateResponse = am1.sendContainerUpdateRequest(Arrays.asList(UpdateContainerRequest.newInstance(1, container.getId(), ContainerUpdateType.PROMOTE_EXECUTION_TYPE, null, ExecutionType.GUARANTEED)));
    Assert.assertEquals(0, allocateResponse.getUpdatedContainers().size());
    Assert.assertEquals(1, allocateResponse.getUpdateErrors().size());
    Assert.assertEquals("INCORRECT_CONTAINER_VERSION_ERROR", allocateResponse.getUpdateErrors().get(0).getReason());
    Assert.assertEquals(0, allocateResponse.getUpdateErrors().get(0).getCurrentContainerVersion());
    Assert.assertEquals(container.getId(), allocateResponse.getUpdateErrors().get(0).getUpdateContainerRequest().getContainerId());
    // Ensure after correct node heartbeats, we should get the allocation
    allocNode.nodeHeartbeat(true);
    Thread.sleep(200);
    allocateResponse = am1.allocate(new ArrayList<>(), new ArrayList<>());
    Assert.assertEquals(1, allocateResponse.getUpdatedContainers().size());
    Container uc = allocateResponse.getUpdatedContainers().get(0).getContainer();
    Assert.assertEquals(ExecutionType.GUARANTEED, uc.getExecutionType());
    Assert.assertEquals(uc.getId(), container.getId());
    Assert.assertEquals(uc.getVersion(), container.getVersion() + 1);
    // Verify Metrics After OPP allocation :
    // Allocated cores+mem should have increased, available should decrease
    verifyMetrics(metrics, 14336, 14, 2048, 2, 2);
    nm1.nodeHeartbeat(true);
    nm2.nodeHeartbeat(true);
    nm3.nodeHeartbeat(true);
    nm4.nodeHeartbeat(true);
    Thread.sleep(200);
    // Verify that the container is still in ACQUIRED state wrt the RM.
    RMContainer rmContainer = ((CapacityScheduler) scheduler).getApplicationAttempt(uc.getId().getApplicationAttemptId()).getRMContainer(uc.getId());
    Assert.assertEquals(RMContainerState.ACQUIRED, rmContainer.getState());
    // Now demote the container back..
    allocateResponse = am1.sendContainerUpdateRequest(Arrays.asList(UpdateContainerRequest.newInstance(uc.getVersion(), uc.getId(), ContainerUpdateType.DEMOTE_EXECUTION_TYPE, null, ExecutionType.OPPORTUNISTIC)));
    // This should happen in the same heartbeat..
    Assert.assertEquals(1, allocateResponse.getUpdatedContainers().size());
    uc = allocateResponse.getUpdatedContainers().get(0).getContainer();
    Assert.assertEquals(ExecutionType.OPPORTUNISTIC, uc.getExecutionType());
    Assert.assertEquals(uc.getId(), container.getId());
    Assert.assertEquals(uc.getVersion(), container.getVersion() + 2);
    // Verify Metrics After OPP allocation :
    // Everything should have reverted to what it was
    verifyMetrics(metrics, 15360, 15, 1024, 1, 1);
}
Also used : RMApp(org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp) NodeUpdateSchedulerEvent(org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeUpdateSchedulerEvent) NodeAddedSchedulerEvent(org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeAddedSchedulerEvent) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) ApplicationAttemptId(org.apache.hadoop.yarn.api.records.ApplicationAttemptId) RMContainer(org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer) DistributedSchedulingAllocateResponse(org.apache.hadoop.yarn.server.api.protocolrecords.DistributedSchedulingAllocateResponse) AllocateResponse(org.apache.hadoop.yarn.api.protocolrecords.AllocateResponse) QueueMetrics(org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics) RMNode(org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode) RMContainer(org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer) Container(org.apache.hadoop.yarn.api.records.Container) NodeId(org.apache.hadoop.yarn.api.records.NodeId) ResourceScheduler(org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceScheduler) OpportunisticContainerContext(org.apache.hadoop.yarn.server.scheduler.OpportunisticContainerContext) RMNodeImpl(org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeImpl) Test(org.junit.Test)

Aggregations

RMNode (org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode)186 Test (org.junit.Test)143 NodeUpdateSchedulerEvent (org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeUpdateSchedulerEvent)103 NodeAddedSchedulerEvent (org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeAddedSchedulerEvent)94 ApplicationAttemptId (org.apache.hadoop.yarn.api.records.ApplicationAttemptId)63 MockRM (org.apache.hadoop.yarn.server.resourcemanager.MockRM)55 ContainerId (org.apache.hadoop.yarn.api.records.ContainerId)46 RMApp (org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp)44 MockNM (org.apache.hadoop.yarn.server.resourcemanager.MockNM)37 MockAM (org.apache.hadoop.yarn.server.resourcemanager.MockAM)35 NodeId (org.apache.hadoop.yarn.api.records.NodeId)30 FiCaSchedulerApp (org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp)30 Resource (org.apache.hadoop.yarn.api.records.Resource)28 FileWriter (java.io.FileWriter)24 PrintWriter (java.io.PrintWriter)24 NodeRemovedSchedulerEvent (org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeRemovedSchedulerEvent)23 RMContainer (org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer)22 ArrayList (java.util.ArrayList)20 ResourceRequest (org.apache.hadoop.yarn.api.records.ResourceRequest)20 YarnConfiguration (org.apache.hadoop.yarn.conf.YarnConfiguration)19