Search in sources :

Example 41 with NodeHeartbeatResponse

use of org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse in project hadoop by apache.

the class TestRMRestart method testRMRestartWaitForPreviousAMToFinish.

@Test(timeout = 60000)
public void testRMRestartWaitForPreviousAMToFinish() throws Exception {
    // testing 3 cases
    // After RM restarts
    // 1) New application attempt is not started until previous AM container
    // finish event is reported back to RM as a part of nm registration.
    // 2) If previous AM container finish event is never reported back (i.e.
    // node manager on which this AM container was running also went down) in
    // that case AMLivenessMonitor should time out previous attempt and start
    // new attempt.
    // 3) If all the stored attempts had finished then new attempt should
    // be started immediately.
    YarnConfiguration conf = new YarnConfiguration(this.conf);
    conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, 40);
    MemoryRMStateStore memStore = new MemoryRMStateStore();
    memStore.init(conf);
    RMState rmState = memStore.getState();
    Map<ApplicationId, ApplicationStateData> rmAppState = rmState.getApplicationState();
    // start RM
    final MockRM rm1 = createMockRM(conf, memStore);
    rm1.start();
    AbstractYarnScheduler ys = (AbstractYarnScheduler) rm1.getResourceScheduler();
    MockNM nm1 = new MockNM("127.0.0.1:1234", 16382, rm1.getResourceTrackerService());
    nm1.registerNode();
    // submitting app
    RMApp app1 = rm1.submitApp(200);
    rm1.waitForState(app1.getApplicationId(), RMAppState.ACCEPTED);
    MockAM am1 = launchAM(app1, rm1, nm1);
    nm1.nodeHeartbeat(am1.getApplicationAttemptId(), 1, ContainerState.COMPLETE);
    // Fail first AM.
    rm1.waitForState(am1.getApplicationAttemptId(), RMAppAttemptState.FAILED);
    TestSchedulerUtils.waitSchedulerApplicationAttemptStopped(ys, am1.getApplicationAttemptId());
    // launch another AM.
    MockAM am2 = launchAM(app1, rm1, nm1);
    Assert.assertEquals(1, rmAppState.size());
    Assert.assertEquals(app1.getState(), RMAppState.RUNNING);
    Assert.assertEquals(app1.getAppAttempts().get(app1.getCurrentAppAttempt().getAppAttemptId()).getAppAttemptState(), RMAppAttemptState.RUNNING);
    //  start new RM.
    MockRM rm2 = createMockRM(conf, memStore);
    rm2.start();
    nm1.setResourceTrackerService(rm2.getResourceTrackerService());
    NodeHeartbeatResponse res = nm1.nodeHeartbeat(true);
    Assert.assertEquals(NodeAction.RESYNC, res.getNodeAction());
    RMApp rmApp = rm2.getRMContext().getRMApps().get(app1.getApplicationId());
    // application should be in ACCEPTED state
    rm2.waitForState(app1.getApplicationId(), RMAppState.ACCEPTED);
    Assert.assertEquals(RMAppState.ACCEPTED, rmApp.getState());
    // new attempt should not be started
    Assert.assertEquals(2, rmApp.getAppAttempts().size());
    // am1 attempt should be in FAILED state where as am2 attempt should be in
    // LAUNCHED state
    rm2.waitForState(am1.getApplicationAttemptId(), RMAppAttemptState.FAILED);
    rm2.waitForState(am2.getApplicationAttemptId(), RMAppAttemptState.LAUNCHED);
    Assert.assertEquals(RMAppAttemptState.FAILED, rmApp.getAppAttempts().get(am1.getApplicationAttemptId()).getAppAttemptState());
    Assert.assertEquals(RMAppAttemptState.LAUNCHED, rmApp.getAppAttempts().get(am2.getApplicationAttemptId()).getAppAttemptState());
    NMContainerStatus status = TestRMRestart.createNMContainerStatus(am2.getApplicationAttemptId(), 1, ContainerState.COMPLETE);
    nm1.registerNode(Arrays.asList(status), null);
    rm2.waitForState(am2.getApplicationAttemptId(), RMAppAttemptState.FAILED);
    ys = (AbstractYarnScheduler) rm2.getResourceScheduler();
    TestSchedulerUtils.waitSchedulerApplicationAttemptStopped(ys, am2.getApplicationAttemptId());
    launchAM(rmApp, rm2, nm1);
    Assert.assertEquals(3, rmApp.getAppAttempts().size());
    rm2.waitForState(rmApp.getCurrentAppAttempt().getAppAttemptId(), RMAppAttemptState.RUNNING);
    // Now restart RM ...
    // Setting AMLivelinessMonitor interval to be 10 Secs. 
    conf.setInt(YarnConfiguration.RM_AM_EXPIRY_INTERVAL_MS, 10000);
    MockRM rm3 = createMockRM(conf, memStore);
    rm3.start();
    // Wait for RM to process all the events as a part of rm recovery.
    nm1.setResourceTrackerService(rm3.getResourceTrackerService());
    rmApp = rm3.getRMContext().getRMApps().get(app1.getApplicationId());
    // application should be in ACCEPTED state
    rm3.waitForState(app1.getApplicationId(), RMAppState.ACCEPTED);
    Assert.assertEquals(rmApp.getState(), RMAppState.ACCEPTED);
    // new attempt should not be started
    Assert.assertEquals(3, rmApp.getAppAttempts().size());
    // am1 and am2 attempts should be in FAILED state where as am3 should be
    // in LAUNCHED state
    rm3.waitForState(am1.getApplicationAttemptId(), RMAppAttemptState.FAILED);
    rm3.waitForState(am2.getApplicationAttemptId(), RMAppAttemptState.FAILED);
    ApplicationAttemptId latestAppAttemptId = rmApp.getCurrentAppAttempt().getAppAttemptId();
    rm3.waitForState(latestAppAttemptId, RMAppAttemptState.LAUNCHED);
    Assert.assertEquals(RMAppAttemptState.FAILED, rmApp.getAppAttempts().get(am1.getApplicationAttemptId()).getAppAttemptState());
    Assert.assertEquals(RMAppAttemptState.FAILED, rmApp.getAppAttempts().get(am2.getApplicationAttemptId()).getAppAttemptState());
    Assert.assertEquals(RMAppAttemptState.LAUNCHED, rmApp.getAppAttempts().get(latestAppAttemptId).getAppAttemptState());
    rm3.waitForState(latestAppAttemptId, RMAppAttemptState.FAILED);
    rm3.waitForState(rmApp.getApplicationId(), RMAppState.ACCEPTED);
    final int maxRetry = 10;
    final RMApp rmAppForCheck = rmApp;
    GenericTestUtils.waitFor(new Supplier<Boolean>() {

        @Override
        public Boolean get() {
            return new Boolean(rmAppForCheck.getAppAttempts().size() == 4);
        }
    }, 100, maxRetry);
    Assert.assertEquals(RMAppAttemptState.FAILED, rmApp.getAppAttempts().get(latestAppAttemptId).getAppAttemptState());
    latestAppAttemptId = rmApp.getCurrentAppAttempt().getAppAttemptId();
    // The 4th attempt has started but is not yet saved into RMStateStore
    // It will be saved only when we launch AM.
    // submitting app but not starting AM for it.
    RMApp app2 = rm3.submitApp(200);
    rm3.waitForState(app2.getApplicationId(), RMAppState.ACCEPTED);
    Assert.assertEquals(1, app2.getAppAttempts().size());
    Assert.assertEquals(0, memStore.getState().getApplicationState().get(app2.getApplicationId()).getAttemptCount());
    MockRM rm4 = createMockRM(conf, memStore);
    rm4.start();
    rmApp = rm4.getRMContext().getRMApps().get(app1.getApplicationId());
    rm4.waitForState(rmApp.getApplicationId(), RMAppState.ACCEPTED);
    // wait for the attempt to be created.
    int timeoutSecs = 0;
    while (rmApp.getAppAttempts().size() != 2 && timeoutSecs++ < 40) {
        Thread.sleep(200);
    }
    Assert.assertEquals(4, rmApp.getAppAttempts().size());
    Assert.assertEquals(RMAppState.ACCEPTED, rmApp.getState());
    rm4.waitForState(latestAppAttemptId, RMAppAttemptState.SCHEDULED);
    Assert.assertEquals(RMAppAttemptState.SCHEDULED, rmApp.getAppAttempts().get(latestAppAttemptId).getAppAttemptState());
    // The initial application for which an AM was not started should be in
    // ACCEPTED state with one application attempt started.
    app2 = rm4.getRMContext().getRMApps().get(app2.getApplicationId());
    rm4.waitForState(app2.getApplicationId(), RMAppState.ACCEPTED);
    Assert.assertEquals(RMAppState.ACCEPTED, app2.getState());
    Assert.assertEquals(1, app2.getAppAttempts().size());
    rm4.waitForState(app2.getCurrentAppAttempt().getAppAttemptId(), RMAppAttemptState.SCHEDULED);
    Assert.assertEquals(RMAppAttemptState.SCHEDULED, app2.getCurrentAppAttempt().getAppAttemptState());
}
Also used : RMApp(org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp) NodeHeartbeatResponse(org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse) AbstractYarnScheduler(org.apache.hadoop.yarn.server.resourcemanager.scheduler.AbstractYarnScheduler) ApplicationStateData(org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationStateData) ApplicationAttemptId(org.apache.hadoop.yarn.api.records.ApplicationAttemptId) MemoryRMStateStore(org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore) YarnConfiguration(org.apache.hadoop.yarn.conf.YarnConfiguration) NMContainerStatus(org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus) ApplicationId(org.apache.hadoop.yarn.api.records.ApplicationId) RMState(org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore.RMState) Test(org.junit.Test)

Example 42 with NodeHeartbeatResponse

use of org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse in project hadoop by apache.

the class TestRMRestart method testDecomissionedNMsMetricsOnRMRestart.

@Test(timeout = 60000)
public void testDecomissionedNMsMetricsOnRMRestart() throws Exception {
    YarnConfiguration conf = new YarnConfiguration();
    conf.set(YarnConfiguration.RM_NODES_EXCLUDE_FILE_PATH, hostFile.getAbsolutePath());
    writeToHostsFile("");
    MockRM rm1 = null, rm2 = null;
    try {
        rm1 = new MockRM(conf);
        rm1.start();
        MockNM nm1 = rm1.registerNode("localhost:1234", 8000);
        MockNM nm2 = rm1.registerNode("host2:1234", 8000);
        Resource expectedCapability = Resource.newInstance(nm1.getMemory(), nm1.getvCores());
        String expectedVersion = nm1.getVersion();
        Assert.assertEquals(0, ClusterMetrics.getMetrics().getNumDecommisionedNMs());
        String ip = NetUtils.normalizeHostName("localhost");
        // Add 2 hosts to exclude list.
        writeToHostsFile("host2", ip);
        // refresh nodes
        rm1.getNodesListManager().refreshNodes(conf);
        NodeHeartbeatResponse nodeHeartbeat = nm1.nodeHeartbeat(true);
        Assert.assertTrue(NodeAction.SHUTDOWN.equals(nodeHeartbeat.getNodeAction()));
        nodeHeartbeat = nm2.nodeHeartbeat(true);
        Assert.assertTrue("The decommisioned metrics are not updated", NodeAction.SHUTDOWN.equals(nodeHeartbeat.getNodeAction()));
        rm1.drainEvents();
        Assert.assertEquals(2, ClusterMetrics.getMetrics().getNumDecommisionedNMs());
        verifyNodesAfterDecom(rm1, 2, expectedCapability, expectedVersion);
        rm1.stop();
        rm1 = null;
        Assert.assertEquals(0, ClusterMetrics.getMetrics().getNumDecommisionedNMs());
        // restart RM.
        rm2 = new MockRM(conf);
        rm2.start();
        rm2.drainEvents();
        Assert.assertEquals(2, ClusterMetrics.getMetrics().getNumDecommisionedNMs());
        verifyNodesAfterDecom(rm2, 2, Resource.newInstance(0, 0), "unknown");
    } finally {
        if (rm1 != null) {
            rm1.stop();
        }
        if (rm2 != null) {
            rm2.stop();
        }
    }
}
Also used : NodeHeartbeatResponse(org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse) YarnConfiguration(org.apache.hadoop.yarn.conf.YarnConfiguration) Resource(org.apache.hadoop.yarn.api.records.Resource) Test(org.junit.Test)

Example 43 with NodeHeartbeatResponse

use of org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse in project hadoop by apache.

the class TestResourceTrackerService method testNodeHeartBeatWithInvalidLabels.

@Test
public void testNodeHeartBeatWithInvalidLabels() throws Exception {
    writeToHostsFile("host2");
    Configuration conf = new Configuration();
    conf.set(YarnConfiguration.RM_NODES_INCLUDE_FILE_PATH, hostFile.getAbsolutePath());
    conf.set(YarnConfiguration.NODELABEL_CONFIGURATION_TYPE, YarnConfiguration.DISTRIBUTED_NODELABEL_CONFIGURATION_TYPE);
    final RMNodeLabelsManager nodeLabelsMgr = new NullRMNodeLabelsManager();
    rm = new MockRM(conf) {

        @Override
        protected RMNodeLabelsManager createNodeLabelManager() {
            return nodeLabelsMgr;
        }
    };
    rm.start();
    try {
        nodeLabelsMgr.addToCluserNodeLabelsWithDefaultExclusivity(toSet("A", "B", "C"));
    } catch (IOException e) {
        Assert.fail("Caught Exception while intializing");
        e.printStackTrace();
    }
    ResourceTrackerService resourceTrackerService = rm.getResourceTrackerService();
    RegisterNodeManagerRequest registerReq = Records.newRecord(RegisterNodeManagerRequest.class);
    NodeId nodeId = NodeId.newInstance("host2", 1234);
    Resource capability = BuilderUtils.newResource(1024, 1);
    registerReq.setResource(capability);
    registerReq.setNodeId(nodeId);
    registerReq.setHttpPort(1234);
    registerReq.setNMVersion(YarnVersionInfo.getVersion());
    registerReq.setNodeLabels(toNodeLabelSet("A"));
    RegisterNodeManagerResponse registerResponse = resourceTrackerService.registerNodeManager(registerReq);
    NodeHeartbeatRequest heartbeatReq = Records.newRecord(NodeHeartbeatRequest.class);
    // Invalid heart beat labels
    heartbeatReq.setNodeLabels(toNodeLabelSet("B", "#C"));
    heartbeatReq.setNodeStatus(getNodeStatusObject(nodeId));
    heartbeatReq.setLastKnownNMTokenMasterKey(registerResponse.getNMTokenMasterKey());
    heartbeatReq.setLastKnownContainerTokenMasterKey(registerResponse.getContainerTokenMasterKey());
    NodeHeartbeatResponse nodeHeartbeatResponse = resourceTrackerService.nodeHeartbeat(heartbeatReq);
    // response should be NORMAL when RM heartbeat labels are rejected
    Assert.assertEquals("Response should be NORMAL when RM heartbeat labels" + " are rejected", NodeAction.NORMAL, nodeHeartbeatResponse.getNodeAction());
    Assert.assertFalse(nodeHeartbeatResponse.getAreNodeLabelsAcceptedByRM());
    Assert.assertNotNull(nodeHeartbeatResponse.getDiagnosticsMessage());
    rm.stop();
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) YarnConfiguration(org.apache.hadoop.yarn.conf.YarnConfiguration) NodeHeartbeatResponse(org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse) Resource(org.apache.hadoop.yarn.api.records.Resource) IOException(java.io.IOException) UnRegisterNodeManagerRequest(org.apache.hadoop.yarn.server.api.protocolrecords.UnRegisterNodeManagerRequest) RegisterNodeManagerRequest(org.apache.hadoop.yarn.server.api.protocolrecords.RegisterNodeManagerRequest) NodeId(org.apache.hadoop.yarn.api.records.NodeId) RegisterNodeManagerResponse(org.apache.hadoop.yarn.server.api.protocolrecords.RegisterNodeManagerResponse) NodeHeartbeatRequest(org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatRequest) NullRMNodeLabelsManager(org.apache.hadoop.yarn.server.resourcemanager.nodelabels.NullRMNodeLabelsManager) RMNodeLabelsManager(org.apache.hadoop.yarn.server.resourcemanager.nodelabels.RMNodeLabelsManager) NullRMNodeLabelsManager(org.apache.hadoop.yarn.server.resourcemanager.nodelabels.NullRMNodeLabelsManager) Test(org.junit.Test)

Example 44 with NodeHeartbeatResponse

use of org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse in project hadoop by apache.

the class TestResourceTrackerService method testNodeHeartbeatWithCentralLabelConfig.

@Test
public void testNodeHeartbeatWithCentralLabelConfig() throws Exception {
    writeToHostsFile("host2");
    Configuration conf = new Configuration();
    conf.set(YarnConfiguration.RM_NODES_INCLUDE_FILE_PATH, hostFile.getAbsolutePath());
    conf.set(YarnConfiguration.NODELABEL_CONFIGURATION_TYPE, YarnConfiguration.DEFAULT_NODELABEL_CONFIGURATION_TYPE);
    final RMNodeLabelsManager nodeLabelsMgr = new NullRMNodeLabelsManager();
    rm = new MockRM(conf) {

        @Override
        protected RMNodeLabelsManager createNodeLabelManager() {
            return nodeLabelsMgr;
        }
    };
    rm.start();
    ResourceTrackerService resourceTrackerService = rm.getResourceTrackerService();
    RegisterNodeManagerRequest req = Records.newRecord(RegisterNodeManagerRequest.class);
    NodeId nodeId = NodeId.newInstance("host2", 1234);
    Resource capability = BuilderUtils.newResource(1024, 1);
    req.setResource(capability);
    req.setNodeId(nodeId);
    req.setHttpPort(1234);
    req.setNMVersion(YarnVersionInfo.getVersion());
    req.setNodeLabels(toNodeLabelSet("A", "B", "C"));
    RegisterNodeManagerResponse registerResponse = resourceTrackerService.registerNodeManager(req);
    NodeHeartbeatRequest heartbeatReq = Records.newRecord(NodeHeartbeatRequest.class);
    // Valid heart beat labels
    heartbeatReq.setNodeLabels(toNodeLabelSet("B"));
    heartbeatReq.setNodeStatus(getNodeStatusObject(nodeId));
    heartbeatReq.setLastKnownNMTokenMasterKey(registerResponse.getNMTokenMasterKey());
    heartbeatReq.setLastKnownContainerTokenMasterKey(registerResponse.getContainerTokenMasterKey());
    NodeHeartbeatResponse nodeHeartbeatResponse = resourceTrackerService.nodeHeartbeat(heartbeatReq);
    // response should be ok but the RMacceptNodeLabelsUpdate should be false
    Assert.assertEquals(NodeAction.NORMAL, nodeHeartbeatResponse.getNodeAction());
    // no change in the labels,
    Assert.assertNull(nodeLabelsMgr.getNodeLabels().get(nodeId));
    // heartbeat labels rejected
    Assert.assertFalse("Invalid Node Labels should not accepted by RM", nodeHeartbeatResponse.getAreNodeLabelsAcceptedByRM());
    if (rm != null) {
        rm.stop();
    }
}
Also used : UnRegisterNodeManagerRequest(org.apache.hadoop.yarn.server.api.protocolrecords.UnRegisterNodeManagerRequest) RegisterNodeManagerRequest(org.apache.hadoop.yarn.server.api.protocolrecords.RegisterNodeManagerRequest) Configuration(org.apache.hadoop.conf.Configuration) YarnConfiguration(org.apache.hadoop.yarn.conf.YarnConfiguration) NodeHeartbeatResponse(org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse) NodeId(org.apache.hadoop.yarn.api.records.NodeId) Resource(org.apache.hadoop.yarn.api.records.Resource) RegisterNodeManagerResponse(org.apache.hadoop.yarn.server.api.protocolrecords.RegisterNodeManagerResponse) NodeHeartbeatRequest(org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatRequest) NullRMNodeLabelsManager(org.apache.hadoop.yarn.server.resourcemanager.nodelabels.NullRMNodeLabelsManager) RMNodeLabelsManager(org.apache.hadoop.yarn.server.resourcemanager.nodelabels.RMNodeLabelsManager) NullRMNodeLabelsManager(org.apache.hadoop.yarn.server.resourcemanager.nodelabels.NullRMNodeLabelsManager) Test(org.junit.Test)

Example 45 with NodeHeartbeatResponse

use of org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse in project hadoop by apache.

the class TestRMNodeTransitions method getMockRMNodeStatusEventWithRunningApps.

private RMNodeStatusEvent getMockRMNodeStatusEventWithRunningApps() {
    NodeHeartbeatResponse response = mock(NodeHeartbeatResponse.class);
    NodeHealthStatus healthStatus = mock(NodeHealthStatus.class);
    Boolean yes = new Boolean(true);
    doReturn(yes).when(healthStatus).getIsNodeHealthy();
    RMNodeStatusEvent event = mock(RMNodeStatusEvent.class);
    doReturn(healthStatus).when(event).getNodeHealthStatus();
    doReturn(response).when(event).getLatestResponse();
    doReturn(RMNodeEventType.STATUS_UPDATE).when(event).getType();
    doReturn(getAppIdList()).when(event).getKeepAliveAppIds();
    return event;
}
Also used : NodeHeartbeatResponse(org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse) RMNodeStatusEvent(org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeStatusEvent) NodeHealthStatus(org.apache.hadoop.yarn.server.api.records.NodeHealthStatus)

Aggregations

NodeHeartbeatResponse (org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse)49 Test (org.junit.Test)33 YarnConfiguration (org.apache.hadoop.yarn.conf.YarnConfiguration)23 Configuration (org.apache.hadoop.conf.Configuration)21 RMApp (org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp)16 ContainerId (org.apache.hadoop.yarn.api.records.ContainerId)13 ApplicationId (org.apache.hadoop.yarn.api.records.ApplicationId)12 ArrayList (java.util.ArrayList)10 NMContainerStatus (org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus)9 MockNM (org.apache.hadoop.yarn.server.resourcemanager.MockNM)8 ApplicationAttemptId (org.apache.hadoop.yarn.api.records.ApplicationAttemptId)7 NodeHeartbeatRequest (org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatRequest)7 MemoryRMStateStore (org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore)7 RMNodeStatusEvent (org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeStatusEvent)7 Container (org.apache.hadoop.yarn.api.records.Container)6 Resource (org.apache.hadoop.yarn.api.records.Resource)6 DrainDispatcher (org.apache.hadoop.yarn.event.DrainDispatcher)6 NodeHealthStatus (org.apache.hadoop.yarn.server.api.records.NodeHealthStatus)6 ByteBuffer (java.nio.ByteBuffer)5 ContainerStatus (org.apache.hadoop.yarn.api.records.ContainerStatus)5