Search in sources :

Example 31 with NodeHeartbeatResponse

use of org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse in project hadoop by apache.

the class TestResourceTrackerService method testAddNewIncludePathToConfiguration.

/**
 * Decommissioning using a post-configured include hosts file.
 *
 * <p>Two nodes register while no include file is configured. An include
 * file listing only host1 is then written, set on the configuration and
 * refreshed. Afterwards the test expects {@code checkShutdownNMCount} to
 * accept the baseline shutdown count plus one, host1 to still receive
 * NORMAL heartbeat responses, and host2 to be present in the inactive node
 * map in state SHUTDOWN.
 *
 * @throws Exception if the mock resource manager or any node call fails
 */
@Test
public void testAddNewIncludePathToConfiguration() throws Exception {
    Configuration conf = new Configuration();
    rm = new MockRM(conf);
    rm.start();
    MockNM nm1 = rm.registerNode("host1:1234", 5120);
    MockNM nm2 = rm.registerNode("host2:5678", 10240);
    ClusterMetrics metrics = ClusterMetrics.getMetrics();
    // A JUnit assertion (rather than the `assert` keyword) still runs when
    // the JVM is started without -ea.
    Assert.assertNotNull(metrics);
    int initialMetricCount = metrics.getNumShutdownNMs();

    // Both nodes are accepted before any include file is configured.
    NodeHeartbeatResponse nodeHeartbeat = nm1.nodeHeartbeat(true);
    Assert.assertEquals(NodeAction.NORMAL, nodeHeartbeat.getNodeAction());
    nodeHeartbeat = nm2.nodeHeartbeat(true);
    Assert.assertEquals(NodeAction.NORMAL, nodeHeartbeat.getNodeAction());

    // Configure an include file that lists host1 only, then refresh.
    writeToHostsFile("host1");
    conf.set(YarnConfiguration.RM_NODES_INCLUDE_FILE_PATH, hostFile.getAbsolutePath());
    rm.getNodesListManager().refreshNodes(conf);
    checkShutdownNMCount(rm, ++initialMetricCount);

    nodeHeartbeat = nm1.nodeHeartbeat(true);
    Assert.assertEquals("Node should not have been shutdown.", NodeAction.NORMAL, nodeHeartbeat.getNodeAction());
    NodeState nodeState = rm.getRMContext().getInactiveRMNodes().get(nm2.getNodeId()).getState();
    Assert.assertEquals("Node should have been shutdown but is in state " + nodeState, NodeState.SHUTDOWN, nodeState);
}
Also used : NodeState(org.apache.hadoop.yarn.api.records.NodeState) Configuration(org.apache.hadoop.conf.Configuration) YarnConfiguration(org.apache.hadoop.yarn.conf.YarnConfiguration) NodeHeartbeatResponse(org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse) Test(org.junit.Test)

Example 32 with NodeHeartbeatResponse

use of org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse in project hadoop by apache.

the class TestResourceTrackerService method testDecommissionWithExcludeHosts.

/**
 * Decommissioning using a pre-configured exclude hosts file.
 *
 * <p>The exclude file starts empty and three nodes register. host2 and the
 * normalized form of "localhost" are then written to the exclude file and
 * the node list is refreshed; both excluded nodes are expected to receive
 * SHUTDOWN on their next heartbeat while host1 stays NORMAL. Finally the
 * exclude file is emptied again and the localhost node re-registers.
 *
 * @throws Exception if the mock resource manager or any node call fails
 */
@Test
public void testDecommissionWithExcludeHosts() throws Exception {
    Configuration conf = new Configuration();
    conf.set(YarnConfiguration.RM_NODES_EXCLUDE_FILE_PATH, hostFile.getAbsolutePath());
    writeToHostsFile("");
    rm = new MockRM(conf);
    rm.start();
    MockNM nm1 = rm.registerNode("host1:1234", 5120);
    MockNM nm2 = rm.registerNode("host2:5678", 10240);
    MockNM nm3 = rm.registerNode("localhost:4433", 1024);
    int metricCount = ClusterMetrics.getMetrics().getNumDecommisionedNMs();

    // assertEquals reports the actual node action on failure, which
    // assertTrue(x.equals(y)) does not.
    NodeHeartbeatResponse nodeHeartbeat = nm1.nodeHeartbeat(true);
    Assert.assertEquals(NodeAction.NORMAL, nodeHeartbeat.getNodeAction());
    nodeHeartbeat = nm2.nodeHeartbeat(true);
    Assert.assertEquals(NodeAction.NORMAL, nodeHeartbeat.getNodeAction());
    rm.drainEvents();

    // To test that IPs also work
    String ip = NetUtils.normalizeHostName("localhost");
    writeToHostsFile("host2", ip);
    rm.getNodesListManager().refreshNodes(conf);
    checkDecommissionedNMCount(rm, metricCount + 2);

    nodeHeartbeat = nm1.nodeHeartbeat(true);
    Assert.assertEquals(NodeAction.NORMAL, nodeHeartbeat.getNodeAction());
    // These two checks are about the heartbeat response, not the metrics,
    // so the failure messages name the node action.
    nodeHeartbeat = nm2.nodeHeartbeat(true);
    Assert.assertEquals("Excluded node host2 should have been told to shut down", NodeAction.SHUTDOWN, nodeHeartbeat.getNodeAction());
    nodeHeartbeat = nm3.nodeHeartbeat(true);
    Assert.assertEquals("Excluded node localhost should have been told to shut down", NodeAction.SHUTDOWN, nodeHeartbeat.getNodeAction());
    rm.drainEvents();

    // Empty the exclude file again and let the localhost node rejoin.
    writeToHostsFile("");
    rm.getNodesListManager().refreshNodes(conf);
    nm3 = rm.registerNode("localhost:4433", 1024);
    nodeHeartbeat = nm3.nodeHeartbeat(true);
    rm.drainEvents();
    Assert.assertEquals(NodeAction.NORMAL, nodeHeartbeat.getNodeAction());
    // decommissioned node count is 1 since 1 node rejoined after updating
    // the exclude file
    checkDecommissionedNMCount(rm, metricCount + 1);
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) YarnConfiguration(org.apache.hadoop.yarn.conf.YarnConfiguration) NodeHeartbeatResponse(org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse) Test(org.junit.Test)

Example 33 with NodeHeartbeatResponse

use of org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse in project hadoop by apache.

the class TestResourceTrackerService method testDecommissionWithIncludeHosts.

/**
 * Decommissioning using a pre-configured include hosts file.
 *
 * <p>All three hosts are initially listed in the include file and register
 * normally. The file is then rewritten to contain only host1 and the
 * normalized form of "localhost"; after a refresh host2 is expected to
 * receive SHUTDOWN while the other two nodes stay NORMAL, and the
 * shutdown-NM metric is expected to be one above its baseline.
 *
 * @throws Exception if the mock resource manager or any node call fails
 */
@Test
public void testDecommissionWithIncludeHosts() throws Exception {
    writeToHostsFile("localhost", "host1", "host2");
    Configuration conf = new Configuration();
    conf.set(YarnConfiguration.RM_NODES_INCLUDE_FILE_PATH, hostFile.getAbsolutePath());
    rm = new MockRM(conf);
    rm.start();
    MockNM nm1 = rm.registerNode("host1:1234", 5120);
    MockNM nm2 = rm.registerNode("host2:5678", 10240);
    MockNM nm3 = rm.registerNode("localhost:4433", 1024);
    ClusterMetrics metrics = ClusterMetrics.getMetrics();
    // A JUnit assertion (rather than the `assert` keyword) still runs when
    // the JVM is started without -ea.
    Assert.assertNotNull(metrics);
    // Every later comparison is against the shutdown-NM metric, so the
    // baseline is read from that same metric.
    int shutdownCount = metrics.getNumShutdownNMs();

    NodeHeartbeatResponse nodeHeartbeat = nm1.nodeHeartbeat(true);
    Assert.assertEquals(NodeAction.NORMAL, nodeHeartbeat.getNodeAction());
    nodeHeartbeat = nm2.nodeHeartbeat(true);
    Assert.assertEquals(NodeAction.NORMAL, nodeHeartbeat.getNodeAction());
    nodeHeartbeat = nm3.nodeHeartbeat(true);
    Assert.assertEquals(NodeAction.NORMAL, nodeHeartbeat.getNodeAction());

    // To test that IPs also work
    String ip = NetUtils.normalizeHostName("localhost");
    writeToHostsFile("host1", ip);
    rm.getNodesListManager().refreshNodes(conf);
    checkShutdownNMCount(rm, ++shutdownCount);

    nodeHeartbeat = nm1.nodeHeartbeat(true);
    Assert.assertEquals(NodeAction.NORMAL, nodeHeartbeat.getNodeAction());
    Assert.assertEquals(shutdownCount, ClusterMetrics.getMetrics().getNumShutdownNMs());
    nodeHeartbeat = nm2.nodeHeartbeat(true);
    Assert.assertEquals("Node is not decommisioned.", NodeAction.SHUTDOWN, nodeHeartbeat.getNodeAction());
    nodeHeartbeat = nm3.nodeHeartbeat(true);
    Assert.assertEquals(NodeAction.NORMAL, nodeHeartbeat.getNodeAction());
    Assert.assertEquals(shutdownCount, ClusterMetrics.getMetrics().getNumShutdownNMs());
    rm.stop();
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) YarnConfiguration(org.apache.hadoop.yarn.conf.YarnConfiguration) NodeHeartbeatResponse(org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse) Test(org.junit.Test)

Example 34 with NodeHeartbeatResponse

use of org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse in project hadoop by apache.

the class TestResourceTrackerService method testNodeRemovalUtil.

/**
 * Shared scenario for the untracked-node removal timer.
 *
 * <p>Phases, in order:
 * <ol>
 *   <li>three nodes register with empty include/exclude paths and must all
 *       be active;</li>
 *   <li>host2 is dropped from the include file and must move to the
 *       inactive node map, then disappear from it after the removal
 *       timeout;</li>
 *   <li>host2 is re-added, dropped again, and re-added once more after a
 *       pause, and must end up active;</li>
 *   <li>host2 is put in the exclude file and must remain
 *       DECOMMISSIONED/DECOMMISSIONING across a full timer period;</li>
 *   <li>the remaining checks are delegated to
 *       {@code testNodeRemovalUtilDecomToUntracked}.</li>
 * </ol>
 *
 * @param doGraceful passed to {@code refreshNodesOption}; also selects
 *     which node states and metric values this test expects
 * @throws Exception if the mock resource manager or any node call fails
 */
public void testNodeRemovalUtil(boolean doGraceful) throws Exception {
    Configuration conf = new Configuration();
    int timeoutValue = 500;
    conf.set(YarnConfiguration.RM_NODES_INCLUDE_FILE_PATH, "");
    conf.set(YarnConfiguration.RM_NODES_EXCLUDE_FILE_PATH, "");
    conf.setInt(YarnConfiguration.RM_NODEMANAGER_UNTRACKED_REMOVAL_TIMEOUT_MSEC, timeoutValue);
    // This latch is never counted down or handed to anyone else in this
    // method, so every await(...) below simply pauses for up to its timeout.
    CountDownLatch latch = new CountDownLatch(1);
    rm = new MockRM(conf);
    rm.init(conf);
    rm.start();
    RMContext rmContext = rm.getRMContext();
    refreshNodesOption(doGraceful, conf);
    MockNM nm1 = rm.registerNode("host1:1234", 5120);
    MockNM nm2 = rm.registerNode("host2:5678", 10240);
    MockNM nm3 = rm.registerNode("localhost:4433", 1024);
    ClusterMetrics metrics = ClusterMetrics.getMetrics();
    // A JUnit assertion (rather than the `assert` keyword) still runs when
    // the JVM is started without -ea.
    Assert.assertNotNull(metrics);

    // Expectations that depend on the refresh mode and are reused below.
    NodeState expectedInactiveState = doGraceful ? NodeState.DECOMMISSIONED : NodeState.SHUTDOWN;
    int expectedShutdownCount = doGraceful ? 0 : 1;

    //check all 3 nodes joined in as NORMAL
    NodeHeartbeatResponse nodeHeartbeat = nm1.nodeHeartbeat(true);
    Assert.assertEquals(NodeAction.NORMAL, nodeHeartbeat.getNodeAction());
    nodeHeartbeat = nm2.nodeHeartbeat(true);
    Assert.assertEquals(NodeAction.NORMAL, nodeHeartbeat.getNodeAction());
    nodeHeartbeat = nm3.nodeHeartbeat(true);
    Assert.assertEquals(NodeAction.NORMAL, nodeHeartbeat.getNodeAction());
    rm.drainEvents();
    // JUnit's order is (message, expected, actual); keeping to it makes the
    // "expected:<..> but was:<..>" failure text truthful.
    Assert.assertEquals("All 3 nodes should be active", 3, metrics.getNumActiveNMs());

    //Remove nm2 from include list, should now be shutdown with timer test
    String ip = NetUtils.normalizeHostName("localhost");
    writeToHostsFile("host1", ip);
    conf.set(YarnConfiguration.RM_NODES_INCLUDE_FILE_PATH, hostFile.getAbsolutePath());
    refreshNodesOption(doGraceful, conf);
    if (doGraceful) {
        rm.waitForState(nm2.getNodeId(), NodeState.DECOMMISSIONING);
    }
    nm1.nodeHeartbeat(true);
    nm2.nodeHeartbeat(true);
    rm.drainEvents();
    Assert.assertFalse("Node should not be in active node list", rmContext.getRMNodes().containsKey(nm2.getNodeId()));
    RMNode rmNode = rmContext.getInactiveRMNodes().get(nm2.getNodeId());
    // Fail with a readable message instead of an NPE if the node is missing.
    Assert.assertNotNull("Node should be in inactive node list", rmNode);
    Assert.assertEquals("Node should be in inactive node list", expectedInactiveState, rmNode.getState());
    Assert.assertEquals("Active nodes should be 2", 2, metrics.getNumActiveNMs());
    Assert.assertEquals("Shutdown nodes should be expected", expectedShutdownCount, metrics.getNumShutdownNMs());

    // Pause for one check interval plus the removal timeout, after which
    // the test expects the node to be gone from the inactive map.
    int nodeRemovalTimeout = conf.getInt(YarnConfiguration.RM_NODEMANAGER_UNTRACKED_REMOVAL_TIMEOUT_MSEC, YarnConfiguration.DEFAULT_RM_NODEMANAGER_UNTRACKED_REMOVAL_TIMEOUT_MSEC);
    int nodeRemovalInterval = rmContext.getNodesListManager().getNodeRemovalCheckInterval();
    long maxThreadSleeptime = nodeRemovalInterval + nodeRemovalTimeout;
    latch.await(maxThreadSleeptime, TimeUnit.MILLISECONDS);
    rmNode = rmContext.getInactiveRMNodes().get(nm2.getNodeId());
    Assert.assertNull("Node should have been forgotten!", rmNode);
    Assert.assertEquals("Shutdown nodes should be 0 now", 0, metrics.getNumShutdownNMs());

    //Check node removal and re-addition before timer expires
    writeToHostsFile("host1", ip, "host2");
    refreshNodesOption(doGraceful, conf);
    nm2 = rm.registerNode("host2:5678", 10240);
    rm.drainEvents();
    writeToHostsFile("host1", ip);
    refreshNodesOption(doGraceful, conf);
    rm.waitForState(nm2.getNodeId(), doGraceful ? NodeState.DECOMMISSIONING : NodeState.SHUTDOWN);
    nm2.nodeHeartbeat(true);
    rm.drainEvents();
    rmNode = rmContext.getInactiveRMNodes().get(nm2.getNodeId());
    Assert.assertNotNull("Node should be shutdown", rmNode);
    Assert.assertEquals("Node should be shutdown", expectedInactiveState, rmNode.getState());
    Assert.assertEquals("Active nodes should be 2", 2, metrics.getNumActiveNMs());
    Assert.assertEquals("Shutdown nodes should be expected", expectedShutdownCount, metrics.getNumShutdownNMs());

    //add back the node before timer expires
    // NOTE(review): if maxThreadSleeptime is below 2000 this timeout is
    // non-positive and await returns without waiting — confirm intended.
    latch.await(maxThreadSleeptime - 2000, TimeUnit.MILLISECONDS);
    writeToHostsFile("host1", ip, "host2");
    refreshNodesOption(doGraceful, conf);
    nm2 = rm.registerNode("host2:5678", 10240);
    nodeHeartbeat = nm2.nodeHeartbeat(true);
    rm.drainEvents();
    Assert.assertEquals(NodeAction.NORMAL, nodeHeartbeat.getNodeAction());
    Assert.assertEquals("Shutdown nodes should be 0 now", 0, metrics.getNumShutdownNMs());
    Assert.assertEquals("All 3 nodes should be active", 3, metrics.getNumActiveNMs());

    //Decommission this node, check timer doesn't remove it
    writeToHostsFile("host1", "host2", ip);
    writeToHostsFile(excludeHostFile, "host2");
    conf.set(YarnConfiguration.RM_NODES_EXCLUDE_FILE_PATH, excludeHostFile.getAbsolutePath());
    refreshNodesOption(doGraceful, conf);
    rm.drainEvents();
    // In graceful mode the node is looked up in the active map, otherwise
    // in the inactive map.
    rmNode = doGraceful ? rmContext.getRMNodes().get(nm2.getNodeId()) : rmContext.getInactiveRMNodes().get(nm2.getNodeId());
    Assert.assertNotNull("Node should be DECOMMISSIONED or DECOMMISSIONING", rmNode);
    Assert.assertTrue("Node should be DECOMMISSIONED or DECOMMISSIONING", (rmNode.getState() == NodeState.DECOMMISSIONED) || (rmNode.getState() == NodeState.DECOMMISSIONING));
    if (rmNode.getState() == NodeState.DECOMMISSIONED) {
        Assert.assertEquals("Decommissioned/ing nodes should be 1 now", 1, metrics.getNumDecommisionedNMs());
    }
    // Same checks again after a full timer period: the node must still be
    // tracked.
    latch.await(maxThreadSleeptime, TimeUnit.MILLISECONDS);
    rmNode = doGraceful ? rmContext.getRMNodes().get(nm2.getNodeId()) : rmContext.getInactiveRMNodes().get(nm2.getNodeId());
    Assert.assertNotNull("Node should be DECOMMISSIONED or DECOMMISSIONING", rmNode);
    Assert.assertTrue("Node should be DECOMMISSIONED or DECOMMISSIONING", (rmNode.getState() == NodeState.DECOMMISSIONED) || (rmNode.getState() == NodeState.DECOMMISSIONING));
    if (rmNode.getState() == NodeState.DECOMMISSIONED) {
        Assert.assertEquals("Decommissioned/ing nodes should be 1 now", 1, metrics.getNumDecommisionedNMs());
    }

    //Test decommed/ing node that transitions to untracked,timer should remove
    testNodeRemovalUtilDecomToUntracked(rmContext, conf, nm1, nm2, nm3, maxThreadSleeptime, doGraceful);
    rm.stop();
}
Also used : RMNode(org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode) Configuration(org.apache.hadoop.conf.Configuration) YarnConfiguration(org.apache.hadoop.yarn.conf.YarnConfiguration) NodeHeartbeatResponse(org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse) CountDownLatch(java.util.concurrent.CountDownLatch)

Example 35 with NodeHeartbeatResponse

use of org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse in project hadoop by apache.

the class TestResourceTrackerService method testNodeRemovalUtilLost.

/**
 * Shared scenario for removal of a LOST node by the untracked-node timer.
 *
 * <p>Three nodes register with all hosts in the include file. Only nm1 and
 * nm3 keep heartbeating for about four seconds (20 waits of 200 ms) against
 * a configured NM expiry interval of 2000 ms, after which host2 is expected
 * to be in the inactive node map in state LOST. host2 is then removed from
 * the include file and, after at most two further waits, is expected to be
 * gone from the inactive map with the lost-NM metric back at 0.
 *
 * @param doGraceful passed to {@code refreshNodesOption}
 * @throws Exception if the mock resource manager or any node call fails
 */
private void testNodeRemovalUtilLost(boolean doGraceful) throws Exception {
    Configuration conf = new Configuration();
    conf.setLong(YarnConfiguration.RM_NM_EXPIRY_INTERVAL_MS, 2000);
    int timeoutValue = 500;
    conf.set(YarnConfiguration.RM_NODES_INCLUDE_FILE_PATH, hostFile.getAbsolutePath());
    conf.set(YarnConfiguration.RM_NODES_EXCLUDE_FILE_PATH, excludeHostFile.getAbsolutePath());
    writeToHostsFile(hostFile, "host1", "localhost", "host2");
    writeToHostsFile(excludeHostFile, "");
    conf.setInt(YarnConfiguration.RM_NODEMANAGER_UNTRACKED_REMOVAL_TIMEOUT_MSEC, timeoutValue);
    rm = new MockRM(conf);
    rm.init(conf);
    rm.start();
    RMContext rmContext = rm.getRMContext();
    refreshNodesOption(doGraceful, conf);
    MockNM nm1 = rm.registerNode("host1:1234", 5120);
    MockNM nm2 = rm.registerNode("host2:5678", 10240);
    MockNM nm3 = rm.registerNode("localhost:4433", 1024);
    ClusterMetrics clusterMetrics = ClusterMetrics.getMetrics();
    // A JUnit assertion (rather than the `assert` keyword) still runs when
    // the JVM is started without -ea.
    Assert.assertNotNull(clusterMetrics);
    rm.drainEvents();

    //check all 3 nodes joined in as NORMAL
    NodeHeartbeatResponse nodeHeartbeat = nm1.nodeHeartbeat(true);
    Assert.assertEquals(NodeAction.NORMAL, nodeHeartbeat.getNodeAction());
    nodeHeartbeat = nm2.nodeHeartbeat(true);
    Assert.assertEquals(NodeAction.NORMAL, nodeHeartbeat.getNodeAction());
    nodeHeartbeat = nm3.nodeHeartbeat(true);
    Assert.assertEquals(NodeAction.NORMAL, nodeHeartbeat.getNodeAction());
    rm.drainEvents();
    // JUnit's order is (message, expected, actual); keeping to it makes the
    // "expected:<..> but was:<..>" failure text truthful.
    Assert.assertEquals("All 3 nodes should be active", 3, clusterMetrics.getNumActiveNMs());

    // Keep nm1 and nm3 heartbeating while nm2 stays silent.
    int waitCount = 0;
    while (waitCount++ < 20) {
        synchronized (this) {
            wait(200);
        }
        nm3.nodeHeartbeat(true);
        nm1.nodeHeartbeat(true);
    }
    RMNode lostNode = rmContext.getInactiveRMNodes().get(nm2.getNodeId());
    Assert.assertNotNull("host2 should be a lost NM!", lostNode);
    Assert.assertEquals("host2 should be a lost NM!", NodeState.LOST, lostNode.getState());
    Assert.assertEquals("There should be 1 Lost NM!", 1, clusterMetrics.getNumLostNMs());
    Assert.assertEquals("There should be 2 Active NM!", 2, clusterMetrics.getNumActiveNMs());

    int nodeRemovalTimeout = conf.getInt(YarnConfiguration.RM_NODEMANAGER_UNTRACKED_REMOVAL_TIMEOUT_MSEC, YarnConfiguration.DEFAULT_RM_NODEMANAGER_UNTRACKED_REMOVAL_TIMEOUT_MSEC);
    int nodeRemovalInterval = rmContext.getNodesListManager().getNodeRemovalCheckInterval();
    long maxThreadSleeptime = nodeRemovalInterval + nodeRemovalTimeout;

    // Drop host2 from the include file so it is no longer listed anywhere.
    writeToHostsFile(hostFile, "host1", "localhost");
    writeToHostsFile(excludeHostFile, "");
    refreshNodesOption(doGraceful, conf);
    nm1.nodeHeartbeat(true);
    nm3.nodeHeartbeat(true);
    rm.drainEvents();

    // Wait (at most twice) for host2 to leave the inactive node map.
    // NOTE(review): the earlier loop heartbeats nm1 and nm3, this one
    // heartbeats nm1 and nm2 — confirm nm2 is intended here.
    waitCount = 0;
    while (rmContext.getInactiveRMNodes().get(nm2.getNodeId()) != null && waitCount++ < 2) {
        synchronized (this) {
            wait(maxThreadSleeptime);
            nm1.nodeHeartbeat(true);
            nm2.nodeHeartbeat(true);
        }
    }
    Assert.assertNull("host2 should have been forgotten!", rmContext.getInactiveRMNodes().get(nm2.getNodeId()));
    Assert.assertEquals("There should be no Lost NMs!", 0, clusterMetrics.getNumLostNMs());
    Assert.assertEquals("There should be 2 Active NM!", 2, clusterMetrics.getNumActiveNMs());
    rm.stop();
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) YarnConfiguration(org.apache.hadoop.yarn.conf.YarnConfiguration) NodeHeartbeatResponse(org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse)

Aggregations

NodeHeartbeatResponse (org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse): 49; Test (org.junit.Test): 33; YarnConfiguration (org.apache.hadoop.yarn.conf.YarnConfiguration): 23; Configuration (org.apache.hadoop.conf.Configuration): 21; RMApp (org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp): 16; ContainerId (org.apache.hadoop.yarn.api.records.ContainerId): 13; ApplicationId (org.apache.hadoop.yarn.api.records.ApplicationId): 12; ArrayList (java.util.ArrayList): 10; NMContainerStatus (org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus): 9; MockNM (org.apache.hadoop.yarn.server.resourcemanager.MockNM): 8; ApplicationAttemptId (org.apache.hadoop.yarn.api.records.ApplicationAttemptId): 7; NodeHeartbeatRequest (org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatRequest): 7; MemoryRMStateStore (org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore): 7; RMNodeStatusEvent (org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeStatusEvent): 7; Container (org.apache.hadoop.yarn.api.records.Container): 6; Resource (org.apache.hadoop.yarn.api.records.Resource): 6; DrainDispatcher (org.apache.hadoop.yarn.event.DrainDispatcher): 6; NodeHealthStatus (org.apache.hadoop.yarn.server.api.records.NodeHealthStatus): 6; ByteBuffer (java.nio.ByteBuffer): 5; ContainerStatus (org.apache.hadoop.yarn.api.records.ContainerStatus): 5