Example 36 with NodeHeartbeatResponse

Use of org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse in project hadoop by apache.

The class TestResourceTrackerService, method testNodeHeartbeatForAppCollectorsMap.

@Test
public void testNodeHeartbeatForAppCollectorsMap() throws Exception {
    Configuration conf = new Configuration();
    conf.setBoolean(YarnConfiguration.TIMELINE_SERVICE_ENABLED, true);
    // set version to 2
    conf.setFloat(YarnConfiguration.TIMELINE_SERVICE_VERSION, 2.0f);
    // enable aux-service based timeline collectors
    conf.set(YarnConfiguration.NM_AUX_SERVICES, "timeline_collector");
    conf.set(YarnConfiguration.NM_AUX_SERVICES + "." + "timeline_collector" + ".class", PerNodeTimelineCollectorsAuxService.class.getName());
    conf.setClass(YarnConfiguration.TIMELINE_SERVICE_WRITER_CLASS, FileSystemTimelineWriterImpl.class, TimelineWriter.class);
    rm = new MockRM(conf);
    rm.start();
    MockNM nm1 = rm.registerNode("host1:1234", 5120);
    MockNM nm2 = rm.registerNode("host2:1234", 2048);
    NodeHeartbeatResponse nodeHeartbeat1 = nm1.nodeHeartbeat(true);
    NodeHeartbeatResponse nodeHeartbeat2 = nm2.nodeHeartbeat(true);
    RMNodeImpl node1 = (RMNodeImpl) rm.getRMContext().getRMNodes().get(nm1.getNodeId());
    RMNodeImpl node2 = (RMNodeImpl) rm.getRMContext().getRMNodes().get(nm2.getNodeId());
    RMApp app1 = rm.submitApp(1024);
    String collectorAddr1 = "1.2.3.4:5";
    app1.setCollectorAddr(collectorAddr1);
    String collectorAddr2 = "5.4.3.2:1";
    RMApp app2 = rm.submitApp(1024);
    app2.setCollectorAddr(collectorAddr2);
    // Create a running container for app1 running on nm1
    ContainerId runningContainerId1 = BuilderUtils.newContainerId(BuilderUtils.newApplicationAttemptId(app1.getApplicationId(), 0), 0);
    ContainerStatus status1 = ContainerStatus.newInstance(runningContainerId1, ContainerState.RUNNING, "", 0);
    List<ContainerStatus> statusList = new ArrayList<ContainerStatus>();
    statusList.add(status1);
    NodeHealthStatus nodeHealth = NodeHealthStatus.newInstance(true, "", System.currentTimeMillis());
    NodeStatus nodeStatus = NodeStatus.newInstance(nm1.getNodeId(), 0, statusList, null, nodeHealth, null, null, null);
    node1.handle(new RMNodeStatusEvent(nm1.getNodeId(), nodeStatus, nodeHeartbeat1));
    Assert.assertEquals(1, node1.getRunningApps().size());
    Assert.assertEquals(app1.getApplicationId(), node1.getRunningApps().get(0));
    // Create a running container for app2 running on nm2
    ContainerId runningContainerId2 = BuilderUtils.newContainerId(BuilderUtils.newApplicationAttemptId(app2.getApplicationId(), 0), 0);
    ContainerStatus status2 = ContainerStatus.newInstance(runningContainerId2, ContainerState.RUNNING, "", 0);
    statusList = new ArrayList<ContainerStatus>();
    statusList.add(status2);
    nodeStatus = NodeStatus.newInstance(nm2.getNodeId(), 0, statusList, null, nodeHealth, null, null, null);
    node2.handle(new RMNodeStatusEvent(nm2.getNodeId(), nodeStatus, nodeHeartbeat2));
    Assert.assertEquals(1, node2.getRunningApps().size());
    Assert.assertEquals(app2.getApplicationId(), node2.getRunningApps().get(0));
    nodeHeartbeat1 = nm1.nodeHeartbeat(true);
    Map<ApplicationId, String> map1 = nodeHeartbeat1.getAppCollectorsMap();
    Assert.assertEquals(1, map1.size());
    Assert.assertEquals(collectorAddr1, map1.get(app1.getApplicationId()));
    nodeHeartbeat2 = nm2.nodeHeartbeat(true);
    Map<ApplicationId, String> map2 = nodeHeartbeat2.getAppCollectorsMap();
    Assert.assertEquals(1, map2.size());
    Assert.assertEquals(collectorAddr2, map2.get(app2.getApplicationId()));
}
Also used : RMApp(org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp) Configuration(org.apache.hadoop.conf.Configuration) YarnConfiguration(org.apache.hadoop.yarn.conf.YarnConfiguration) NodeHeartbeatResponse(org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse) RMNodeStatusEvent(org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeStatusEvent) ArrayList(java.util.ArrayList) NMContainerStatus(org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus) ContainerStatus(org.apache.hadoop.yarn.api.records.ContainerStatus) ContainerId(org.apache.hadoop.yarn.api.records.ContainerId) PerNodeTimelineCollectorsAuxService(org.apache.hadoop.yarn.server.timelineservice.collector.PerNodeTimelineCollectorsAuxService) RMNodeImpl(org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeImpl) ApplicationId(org.apache.hadoop.yarn.api.records.ApplicationId) NodeStatus(org.apache.hadoop.yarn.server.api.records.NodeStatus) NodeHealthStatus(org.apache.hadoop.yarn.server.api.records.NodeHealthStatus) Test(org.junit.Test)
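
For context, here is a minimal, hypothetical sketch (the class and method names are invented; only the getAppCollectorsMap() accessor exercised above is real) of how a heartbeat consumer could read the per-application collector addresses that the RM sends back:

import java.util.Map;

import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse;

// Illustrative helper only: logs the timeline collector address that the
// RM reported for each running application in a heartbeat response.
final class CollectorMapDumper {

    static void dumpCollectors(NodeHeartbeatResponse response) {
        Map<ApplicationId, String> collectors = response.getAppCollectorsMap();
        if (collectors == null || collectors.isEmpty()) {
            System.out.println("No app collectors reported in this heartbeat");
            return;
        }
        for (Map.Entry<ApplicationId, String> entry : collectors.entrySet()) {
            System.out.println("App " + entry.getKey() + " -> collector at " + entry.getValue());
        }
    }

    private CollectorMapDumper() {
    }
}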

Example 37 with NodeHeartbeatResponse

Use of org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse in project hadoop by apache.

The class TestResourceTrackerService, method testGetNextHeartBeatInterval.

/**
   * Test that the RM reads the NM heartbeat interval correctly from the
   * configuration file, and that the NM gets the next heartbeat interval
   * from the RM correctly.
   */
@Test(timeout = 50000)
public void testGetNextHeartBeatInterval() throws Exception {
    Configuration conf = new Configuration();
    conf.set(YarnConfiguration.RM_NM_HEARTBEAT_INTERVAL_MS, "4000");
    rm = new MockRM(conf);
    rm.start();
    MockNM nm1 = rm.registerNode("host1:1234", 5120);
    MockNM nm2 = rm.registerNode("host2:5678", 10240);
    NodeHeartbeatResponse nodeHeartbeat = nm1.nodeHeartbeat(true);
    Assert.assertEquals(4000, nodeHeartbeat.getNextHeartBeatInterval());
    NodeHeartbeatResponse nodeHeartbeat2 = nm2.nodeHeartbeat(true);
    Assert.assertEquals(4000, nodeHeartbeat2.getNextHeartBeatInterval());
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) YarnConfiguration(org.apache.hadoop.yarn.conf.YarnConfiguration) NodeHeartbeatResponse(org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse) Test(org.junit.Test)
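
As a companion to the test, a minimal sketch (the pacer class is an assumption, not Hadoop code; only the getNextHeartBeatInterval() accessor shown above is real) of how a node-side heartbeat loop could honor the interval the RM hands back:

import org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse;

// Hypothetical pacer: decides how long to sleep before the next heartbeat.
final class HeartbeatPacer {

    // Fallback used before the first response arrives; the value is arbitrary.
    private static final long DEFAULT_INTERVAL_MS = 1000L;

    static long nextSleepMillis(NodeHeartbeatResponse lastResponse) {
        if (lastResponse == null) {
            return DEFAULT_INTERVAL_MS;
        }
        // The RM reads YarnConfiguration.RM_NM_HEARTBEAT_INTERVAL_MS and
        // returns it in every response, as the test above asserts (4000 ms).
        return lastResponse.getNextHeartBeatInterval();
    }

    private HeartbeatPacer() {
    }
}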

Example 38 with NodeHeartbeatResponse

Use of org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse in project hadoop by apache.

The class TestResourceTrackerService, method testAddNewExcludePathToConfiguration.

/**
   * Decommissioning using a post-configured exclude hosts file
   */
@Test
public void testAddNewExcludePathToConfiguration() throws Exception {
    Configuration conf = new Configuration();
    rm = new MockRM(conf);
    rm.start();
    MockNM nm1 = rm.registerNode("host1:1234", 5120);
    MockNM nm2 = rm.registerNode("host2:5678", 10240);
    ClusterMetrics metrics = ClusterMetrics.getMetrics();
    assert (metrics != null);
    int initialMetricCount = metrics.getNumDecommisionedNMs();
    NodeHeartbeatResponse nodeHeartbeat = nm1.nodeHeartbeat(true);
    Assert.assertEquals(NodeAction.NORMAL, nodeHeartbeat.getNodeAction());
    nodeHeartbeat = nm2.nodeHeartbeat(true);
    Assert.assertEquals(NodeAction.NORMAL, nodeHeartbeat.getNodeAction());
    writeToHostsFile("host2");
    conf.set(YarnConfiguration.RM_NODES_EXCLUDE_FILE_PATH, hostFile.getAbsolutePath());
    rm.getNodesListManager().refreshNodes(conf);
    checkDecommissionedNMCount(rm, ++initialMetricCount);
    nodeHeartbeat = nm1.nodeHeartbeat(true);
    Assert.assertEquals("Node should not have been decomissioned.", NodeAction.NORMAL, nodeHeartbeat.getNodeAction());
    nodeHeartbeat = nm2.nodeHeartbeat(true);
    Assert.assertEquals("Node should have been decomissioned but is in state" + nodeHeartbeat.getNodeAction(), NodeAction.SHUTDOWN, nodeHeartbeat.getNodeAction());
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) YarnConfiguration(org.apache.hadoop.yarn.conf.YarnConfiguration) NodeHeartbeatResponse(org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse) Test(org.junit.Test)
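
The assertions above hinge on the NodeAction carried in each response once the exclude list is refreshed. A hypothetical sketch (the helper class is an assumption, not Hadoop code) of how a node-side caller might react to that action:

import org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse;
import org.apache.hadoop.yarn.server.api.records.NodeAction;

// Illustrative helper only: decides whether the node should keep heartbeating.
final class NodeActionHandler {

    static boolean shouldKeepRunning(NodeHeartbeatResponse response) {
        NodeAction action = response.getNodeAction();
        switch (action) {
            case NORMAL:
                // The RM still accepts this node; keep heartbeating.
                return true;
            case RESYNC:
                // The RM asked the node to re-register; a real NM would do so here.
                return true;
            case SHUTDOWN:
            default:
                // The node was excluded (decommissioned); stop the heartbeat loop.
                return false;
        }
    }

    private NodeActionHandler() {
    }
}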

Example 39 with NodeHeartbeatResponse

Use of org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse in project hadoop by apache.

The class TestResourceTrackerService, method testGracefulDecommissionWithApp.

/**
   * Gracefully decommission a node with a running application.
   */
@Test
public void testGracefulDecommissionWithApp() throws Exception {
    Configuration conf = new Configuration();
    conf.set(YarnConfiguration.RM_NODES_EXCLUDE_FILE_PATH, hostFile.getAbsolutePath());
    writeToHostsFile("");
    rm = new MockRM(conf);
    rm.start();
    MockNM nm1 = rm.registerNode("host1:1234", 10240);
    MockNM nm2 = rm.registerNode("host2:5678", 20480);
    MockNM nm3 = rm.registerNode("host3:4433", 10240);
    NodeId id1 = nm1.getNodeId();
    NodeId id3 = nm3.getNodeId();
    rm.waitForState(id1, NodeState.RUNNING);
    rm.waitForState(id3, NodeState.RUNNING);
    // Create an app and launch two containers on host1.
    RMApp app = rm.submitApp(2000);
    MockAM am = MockRM.launchAndRegisterAM(app, rm, nm1);
    ApplicationAttemptId aaid = app.getCurrentAppAttempt().getAppAttemptId();
    nm1.nodeHeartbeat(aaid, 2, ContainerState.RUNNING);
    nm3.nodeHeartbeat(true);
    // Graceful decommission host1 and host3
    writeToHostsFile("host1", "host3");
    rm.getNodesListManager().refreshNodes(conf, true);
    rm.waitForState(id1, NodeState.DECOMMISSIONING);
    rm.waitForState(id3, NodeState.DECOMMISSIONING);
    // host1 should be DECOMMISSIONING due to running containers.
    // host3 should become DECOMMISSIONED.
    nm1.nodeHeartbeat(true);
    nm3.nodeHeartbeat(true);
    rm.waitForState(id1, NodeState.DECOMMISSIONING);
    rm.waitForState(id3, NodeState.DECOMMISSIONED);
    nm1.nodeHeartbeat(aaid, 2, ContainerState.RUNNING);
    // Complete containers on host1.
    // Since the app is still RUNNING, expect NodeAction.NORMAL.
    NodeHeartbeatResponse nodeHeartbeat1 = nm1.nodeHeartbeat(aaid, 2, ContainerState.COMPLETE);
    Assert.assertEquals(NodeAction.NORMAL, nodeHeartbeat1.getNodeAction());
    // Finish the app and verify that the node becomes DECOMMISSIONED.
    MockRM.finishAMAndVerifyAppState(app, rm, nm1, am);
    rm.waitForState(app.getApplicationId(), RMAppState.FINISHED);
    nodeHeartbeat1 = nm1.nodeHeartbeat(aaid, 2, ContainerState.COMPLETE);
    Assert.assertEquals(NodeAction.SHUTDOWN, nodeHeartbeat1.getNodeAction());
    rm.waitForState(id1, NodeState.DECOMMISSIONED);
}
Also used : RMApp(org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp) Configuration(org.apache.hadoop.conf.Configuration) YarnConfiguration(org.apache.hadoop.yarn.conf.YarnConfiguration) NodeHeartbeatResponse(org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse) NodeId(org.apache.hadoop.yarn.api.records.NodeId) ApplicationAttemptId(org.apache.hadoop.yarn.api.records.ApplicationAttemptId) Test(org.junit.Test)
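
The graceful path in this test boils down to two steps: listing the hosts in the exclude file and calling refreshNodes(conf, true). A small hypothetical helper (the class is invented; writeToHostsFile in the test above is a private utility of TestResourceTrackerService) that packages those two steps for a MockRM-based test:

import java.io.File;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.util.Arrays;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.server.resourcemanager.MockRM;

// Illustrative test helper only, not part of the Hadoop code base.
final class GracefulDecommissionHelper {

    static void decommissionGracefully(MockRM rm, Configuration conf, File excludeFile, String... hosts) throws Exception {
        // List the hosts to drain and point the RM at the exclude file.
        Files.write(excludeFile.toPath(), Arrays.asList(hosts), StandardCharsets.UTF_8);
        conf.set(YarnConfiguration.RM_NODES_EXCLUDE_FILE_PATH, excludeFile.getAbsolutePath());
        // The boolean flag selects graceful decommission, as in the test above.
        rm.getNodesListManager().refreshNodes(conf, true);
    }

    private GracefulDecommissionHelper() {
    }
}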

Example 40 with NodeHeartbeatResponse

Use of org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse in project hadoop by apache.

The class TestResourceTrackerService, method testNodeRemovalUtilUnhealthy.

private void testNodeRemovalUtilUnhealthy(boolean doGraceful) throws Exception {
    Configuration conf = new Configuration();
    int timeoutValue = 500;
    conf.set(YarnConfiguration.RM_NODES_INCLUDE_FILE_PATH, hostFile.getAbsolutePath());
    conf.set(YarnConfiguration.RM_NODES_EXCLUDE_FILE_PATH, excludeHostFile.getAbsolutePath());
    writeToHostsFile(hostFile, "host1", "localhost", "host2");
    writeToHostsFile(excludeHostFile, "");
    conf.setInt(YarnConfiguration.RM_NODEMANAGER_UNTRACKED_REMOVAL_TIMEOUT_MSEC, timeoutValue);
    rm = new MockRM(conf);
    rm.init(conf);
    rm.start();
    RMContext rmContext = rm.getRMContext();
    refreshNodesOption(doGraceful, conf);
    MockNM nm1 = rm.registerNode("host1:1234", 5120);
    MockNM nm2 = rm.registerNode("host2:5678", 10240);
    MockNM nm3 = rm.registerNode("localhost:4433", 1024);
    ClusterMetrics clusterMetrics = ClusterMetrics.getMetrics();
    ClusterMetrics metrics = clusterMetrics;
    assert (metrics != null);
    rm.drainEvents();
    // Check that all 3 nodes joined as NORMAL.
    NodeHeartbeatResponse nodeHeartbeat = nm1.nodeHeartbeat(true);
    Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction()));
    nodeHeartbeat = nm2.nodeHeartbeat(true);
    Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction()));
    nodeHeartbeat = nm3.nodeHeartbeat(true);
    Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction()));
    rm.drainEvents();
    Assert.assertEquals("All 3 nodes should be active", metrics.getNumActiveNMs(), 3);
    // nm1 and nm3 report healthy; nm2 reports unhealthy.
    nm1.nodeHeartbeat(true);
    nm2.nodeHeartbeat(false);
    nm3.nodeHeartbeat(true);
    checkUnhealthyNMCount(rm, nm2, true, 1);
    writeToHostsFile(hostFile, "host1", "localhost");
    writeToHostsFile(excludeHostFile, "");
    refreshNodesOption(doGraceful, conf);
    nm1.nodeHeartbeat(true);
    nm2.nodeHeartbeat(false);
    nm3.nodeHeartbeat(true);
    rm.drainEvents();
    if (!doGraceful) {
        Assert.assertNotEquals("host2 should be a shutdown NM!", rmContext.getInactiveRMNodes().get(nm2.getNodeId()), null);
        Assert.assertEquals("host2 should be a shutdown NM!", rmContext.getInactiveRMNodes().get(nm2.getNodeId()).getState(), NodeState.SHUTDOWN);
    }
    Assert.assertEquals("There should be 2 Active NM!", clusterMetrics.getNumActiveNMs(), 2);
    if (!doGraceful) {
        Assert.assertEquals("There should be 1 Shutdown NM!", clusterMetrics.getNumShutdownNMs(), 1);
    }
    Assert.assertEquals("There should be 0 Unhealthy NM!", clusterMetrics.getUnhealthyNMs(), 0);
    int nodeRemovalTimeout = conf.getInt(YarnConfiguration.RM_NODEMANAGER_UNTRACKED_REMOVAL_TIMEOUT_MSEC, YarnConfiguration.DEFAULT_RM_NODEMANAGER_UNTRACKED_REMOVAL_TIMEOUT_MSEC);
    int nodeRemovalInterval = rmContext.getNodesListManager().getNodeRemovalCheckInterval();
    long maxThreadSleeptime = nodeRemovalInterval + nodeRemovalTimeout;
    int waitCount = 0;
    while (rmContext.getInactiveRMNodes().get(nm2.getNodeId()) != null && waitCount++ < 2) {
        synchronized (this) {
            wait(maxThreadSleeptime);
        }
    }
    Assert.assertEquals("host2 should have been forgotten!", rmContext.getInactiveRMNodes().get(nm2.getNodeId()), null);
    Assert.assertEquals("There should be no Shutdown NMs!", clusterMetrics.getNumRebootedNMs(), 0);
    Assert.assertEquals("There should be 2 Active NM!", clusterMetrics.getNumActiveNMs(), 2);
    rm.stop();
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) YarnConfiguration(org.apache.hadoop.yarn.conf.YarnConfiguration) NodeHeartbeatResponse(org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse)
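
The removal behavior exercised above is driven entirely by configuration. A minimal sketch (the builder class is hypothetical) of the knobs involved, namely the include/exclude host files and the untracked-node removal timeout:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.conf.YarnConfiguration;

// Illustrative builder only: assembles the configuration used by the test.
final class UntrackedRemovalConfig {

    static Configuration build(String includePath, String excludePath, int removalTimeoutMs) {
        Configuration conf = new Configuration();
        // Only hosts in the include file are tracked; a node dropped from that
        // file becomes "untracked" and is eventually forgotten by the RM.
        conf.set(YarnConfiguration.RM_NODES_INCLUDE_FILE_PATH, includePath);
        conf.set(YarnConfiguration.RM_NODES_EXCLUDE_FILE_PATH, excludePath);
        // How long the RM keeps an untracked, inactive node around before
        // removing it entirely (500 ms in the test above).
        conf.setInt(YarnConfiguration.RM_NODEMANAGER_UNTRACKED_REMOVAL_TIMEOUT_MSEC, removalTimeoutMs);
        return conf;
    }

    private UntrackedRemovalConfig() {
    }
}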

Aggregations

NodeHeartbeatResponse (org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse) 49
Test (org.junit.Test) 33
YarnConfiguration (org.apache.hadoop.yarn.conf.YarnConfiguration) 23
Configuration (org.apache.hadoop.conf.Configuration) 21
RMApp (org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp) 16
ContainerId (org.apache.hadoop.yarn.api.records.ContainerId) 13
ApplicationId (org.apache.hadoop.yarn.api.records.ApplicationId) 12
ArrayList (java.util.ArrayList) 10
NMContainerStatus (org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus) 9
MockNM (org.apache.hadoop.yarn.server.resourcemanager.MockNM) 8
ApplicationAttemptId (org.apache.hadoop.yarn.api.records.ApplicationAttemptId) 7
NodeHeartbeatRequest (org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatRequest) 7
MemoryRMStateStore (org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore) 7
RMNodeStatusEvent (org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeStatusEvent) 7
Container (org.apache.hadoop.yarn.api.records.Container) 6
Resource (org.apache.hadoop.yarn.api.records.Resource) 6
DrainDispatcher (org.apache.hadoop.yarn.event.DrainDispatcher) 6
NodeHealthStatus (org.apache.hadoop.yarn.server.api.records.NodeHealthStatus) 6
ByteBuffer (java.nio.ByteBuffer) 5
ContainerStatus (org.apache.hadoop.yarn.api.records.ContainerStatus) 5