Use of org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse in the Apache Hadoop project.
Class TestRMRestart, method testRMRestartWaitForPreviousAMToFinish.
/**
 * Restarts the RM several times against one shared state store and checks
 * when a new application attempt is (and is not) created.
 *
 * Three cases are covered after an RM restart:
 * 1) A new application attempt is not started until the previous AM
 *    container's finish event is reported back to the RM as part of NM
 *    registration.
 * 2) If the previous AM container's finish event is never reported back
 *    (i.e. the node manager that ran the AM also went down), the AM liveness
 *    monitor should time out the previous attempt and start a new one.
 * 3) If all stored attempts had finished, a new attempt should be started
 *    immediately.
 *
 * @throws Exception if any RM/NM interaction or state wait fails
 */
@Test(timeout = 60000)
public void testRMRestartWaitForPreviousAMToFinish() throws Exception {
  YarnConfiguration conf = new YarnConfiguration(this.conf);
  conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, 40);
  MemoryRMStateStore memStore = new MemoryRMStateStore();
  memStore.init(conf);
  RMState rmState = memStore.getState();
  Map<ApplicationId, ApplicationStateData> rmAppState = rmState.getApplicationState();

  // start RM
  final MockRM rm1 = createMockRM(conf, memStore);
  rm1.start();
  AbstractYarnScheduler ys = (AbstractYarnScheduler) rm1.getResourceScheduler();
  MockNM nm1 = new MockNM("127.0.0.1:1234", 16382, rm1.getResourceTrackerService());
  nm1.registerNode();

  // submitting app
  RMApp app1 = rm1.submitApp(200);
  rm1.waitForState(app1.getApplicationId(), RMAppState.ACCEPTED);
  MockAM am1 = launchAM(app1, rm1, nm1);
  nm1.nodeHeartbeat(am1.getApplicationAttemptId(), 1, ContainerState.COMPLETE);
  // Fail first AM.
  rm1.waitForState(am1.getApplicationAttemptId(), RMAppAttemptState.FAILED);
  TestSchedulerUtils.waitSchedulerApplicationAttemptStopped(ys, am1.getApplicationAttemptId());

  // launch another AM.
  MockAM am2 = launchAM(app1, rm1, nm1);
  Assert.assertEquals(1, rmAppState.size());
  // JUnit's assertEquals takes (expected, actual); keep that order so a
  // failure message reports the values the right way round.
  Assert.assertEquals(RMAppState.RUNNING, app1.getState());
  Assert.assertEquals(RMAppAttemptState.RUNNING,
      app1.getAppAttempts().get(app1.getCurrentAppAttempt().getAppAttemptId()).getAppAttemptState());

  // start new RM.
  MockRM rm2 = createMockRM(conf, memStore);
  rm2.start();
  nm1.setResourceTrackerService(rm2.getResourceTrackerService());
  NodeHeartbeatResponse res = nm1.nodeHeartbeat(true);
  Assert.assertEquals(NodeAction.RESYNC, res.getNodeAction());
  RMApp rmApp = rm2.getRMContext().getRMApps().get(app1.getApplicationId());
  // application should be in ACCEPTED state
  rm2.waitForState(app1.getApplicationId(), RMAppState.ACCEPTED);
  Assert.assertEquals(RMAppState.ACCEPTED, rmApp.getState());
  // new attempt should not be started
  Assert.assertEquals(2, rmApp.getAppAttempts().size());
  // am1 attempt should be in FAILED state whereas am2 attempt should be in
  // LAUNCHED state
  rm2.waitForState(am1.getApplicationAttemptId(), RMAppAttemptState.FAILED);
  rm2.waitForState(am2.getApplicationAttemptId(), RMAppAttemptState.LAUNCHED);
  Assert.assertEquals(RMAppAttemptState.FAILED,
      rmApp.getAppAttempts().get(am1.getApplicationAttemptId()).getAppAttemptState());
  Assert.assertEquals(RMAppAttemptState.LAUNCHED,
      rmApp.getAppAttempts().get(am2.getApplicationAttemptId()).getAppAttemptState());

  // Case 1: report am2's container as COMPLETE through NM registration; only
  // then is am2 expected to fail and a third attempt to be launchable.
  NMContainerStatus status = TestRMRestart.createNMContainerStatus(
      am2.getApplicationAttemptId(), 1, ContainerState.COMPLETE);
  nm1.registerNode(Arrays.asList(status), null);
  rm2.waitForState(am2.getApplicationAttemptId(), RMAppAttemptState.FAILED);
  ys = (AbstractYarnScheduler) rm2.getResourceScheduler();
  TestSchedulerUtils.waitSchedulerApplicationAttemptStopped(ys, am2.getApplicationAttemptId());
  launchAM(rmApp, rm2, nm1);
  Assert.assertEquals(3, rmApp.getAppAttempts().size());
  rm2.waitForState(rmApp.getCurrentAppAttempt().getAppAttemptId(), RMAppAttemptState.RUNNING);

  // Now restart RM ...
  // Setting AMLivelinessMonitor interval to be 10 Secs.
  conf.setInt(YarnConfiguration.RM_AM_EXPIRY_INTERVAL_MS, 10000);
  MockRM rm3 = createMockRM(conf, memStore);
  rm3.start();
  // Wait for RM to process all the events as a part of rm recovery.
  nm1.setResourceTrackerService(rm3.getResourceTrackerService());
  rmApp = rm3.getRMContext().getRMApps().get(app1.getApplicationId());
  // application should be in ACCEPTED state
  rm3.waitForState(app1.getApplicationId(), RMAppState.ACCEPTED);
  Assert.assertEquals(RMAppState.ACCEPTED, rmApp.getState());
  // new attempt should not be started
  Assert.assertEquals(3, rmApp.getAppAttempts().size());
  // am1 and am2 attempts should be in FAILED state whereas am3 should be
  // in LAUNCHED state
  rm3.waitForState(am1.getApplicationAttemptId(), RMAppAttemptState.FAILED);
  rm3.waitForState(am2.getApplicationAttemptId(), RMAppAttemptState.FAILED);
  ApplicationAttemptId latestAppAttemptId = rmApp.getCurrentAppAttempt().getAppAttemptId();
  rm3.waitForState(latestAppAttemptId, RMAppAttemptState.LAUNCHED);
  Assert.assertEquals(RMAppAttemptState.FAILED,
      rmApp.getAppAttempts().get(am1.getApplicationAttemptId()).getAppAttemptState());
  Assert.assertEquals(RMAppAttemptState.FAILED,
      rmApp.getAppAttempts().get(am2.getApplicationAttemptId()).getAppAttemptState());
  Assert.assertEquals(RMAppAttemptState.LAUNCHED,
      rmApp.getAppAttempts().get(latestAppAttemptId).getAppAttemptState());

  // Case 2: nothing reports the third attempt's container, so the test waits
  // for that attempt to reach FAILED on its own.
  rm3.waitForState(latestAppAttemptId, RMAppAttemptState.FAILED);
  rm3.waitForState(rmApp.getApplicationId(), RMAppState.ACCEPTED);
  final int maxRetry = 10;
  final int checkEveryMillis = 100;
  final RMApp rmAppForCheck = rmApp;
  // NOTE(review): waitFor is assumed to take (check, checkEveryMillis,
  // waitForMillis). The total wait is therefore maxRetry poll intervals, not
  // maxRetry milliseconds as the previous call passed.
  GenericTestUtils.waitFor(new Supplier<Boolean>() {
    @Override
    public Boolean get() {
      return rmAppForCheck.getAppAttempts().size() == 4;
    }
  }, checkEveryMillis, checkEveryMillis * maxRetry);
  Assert.assertEquals(RMAppAttemptState.FAILED,
      rmApp.getAppAttempts().get(latestAppAttemptId).getAppAttemptState());
  latestAppAttemptId = rmApp.getCurrentAppAttempt().getAppAttemptId();

  // The 4th attempt has started but is not yet saved into RMStateStore
  // It will be saved only when we launch AM.
  // submitting app but not starting AM for it.
  RMApp app2 = rm3.submitApp(200);
  rm3.waitForState(app2.getApplicationId(), RMAppState.ACCEPTED);
  Assert.assertEquals(1, app2.getAppAttempts().size());
  Assert.assertEquals(0,
      memStore.getState().getApplicationState().get(app2.getApplicationId()).getAttemptCount());

  // Case 3: restart once more; all stored attempts of app1 have finished.
  MockRM rm4 = createMockRM(conf, memStore);
  rm4.start();
  rmApp = rm4.getRMContext().getRMApps().get(app1.getApplicationId());
  rm4.waitForState(rmApp.getApplicationId(), RMAppState.ACCEPTED);
  // wait for the attempt to be created. Poll for the same count that is
  // asserted below (4); polling for any other value can never end the loop
  // early and just sleeps for the full 40 * 200 ms.
  int timeoutSecs = 0;
  while (rmApp.getAppAttempts().size() != 4 && timeoutSecs++ < 40) {
    Thread.sleep(200);
  }
  Assert.assertEquals(4, rmApp.getAppAttempts().size());
  Assert.assertEquals(RMAppState.ACCEPTED, rmApp.getState());
  rm4.waitForState(latestAppAttemptId, RMAppAttemptState.SCHEDULED);
  Assert.assertEquals(RMAppAttemptState.SCHEDULED,
      rmApp.getAppAttempts().get(latestAppAttemptId).getAppAttemptState());

  // The initial application for which an AM was not started should be in
  // ACCEPTED state with one application attempt started.
  app2 = rm4.getRMContext().getRMApps().get(app2.getApplicationId());
  rm4.waitForState(app2.getApplicationId(), RMAppState.ACCEPTED);
  Assert.assertEquals(RMAppState.ACCEPTED, app2.getState());
  Assert.assertEquals(1, app2.getAppAttempts().size());
  rm4.waitForState(app2.getCurrentAppAttempt().getAppAttemptId(), RMAppAttemptState.SCHEDULED);
  Assert.assertEquals(RMAppAttemptState.SCHEDULED, app2.getCurrentAppAttempt().getAppAttemptState());
}
Use of org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse in the Apache Hadoop project.
Class TestRMRestart, method testDecomissionedNMsMetricsOnRMRestart.
/**
 * Checks the decommissioned-NM cluster metric before and after an RM restart.
 *
 * Two registered nodes are added to the exclude file, the node list is
 * refreshed, and both nodes are expected to be told to SHUTDOWN. The metric is
 * expected to read 2, drop to 0 once the first RM is stopped, and read 2
 * again after a second RM is started with the same configuration.
 *
 * @throws Exception if any RM/NM interaction fails
 */
@Test(timeout = 60000)
public void testDecomissionedNMsMetricsOnRMRestart() throws Exception {
  YarnConfiguration conf = new YarnConfiguration();
  conf.set(YarnConfiguration.RM_NODES_EXCLUDE_FILE_PATH, hostFile.getAbsolutePath());
  writeToHostsFile("");
  MockRM rm1 = null;
  MockRM rm2 = null;
  try {
    rm1 = new MockRM(conf);
    rm1.start();
    MockNM nm1 = rm1.registerNode("localhost:1234", 8000);
    MockNM nm2 = rm1.registerNode("host2:1234", 8000);
    Resource expectedCapability = Resource.newInstance(nm1.getMemory(), nm1.getvCores());
    String expectedVersion = nm1.getVersion();
    Assert.assertEquals(0, ClusterMetrics.getMetrics().getNumDecommisionedNMs());

    String ip = NetUtils.normalizeHostName("localhost");
    // Add 2 hosts to exclude list.
    writeToHostsFile("host2", ip);
    // refresh nodes
    rm1.getNodesListManager().refreshNodes(conf);

    // assertEquals (rather than assertTrue on equals()) so that a failure
    // reports which NodeAction was actually returned.
    NodeHeartbeatResponse nodeHeartbeat = nm1.nodeHeartbeat(true);
    Assert.assertEquals(NodeAction.SHUTDOWN, nodeHeartbeat.getNodeAction());
    nodeHeartbeat = nm2.nodeHeartbeat(true);
    Assert.assertEquals("The decommisioned metrics are not updated",
        NodeAction.SHUTDOWN, nodeHeartbeat.getNodeAction());

    rm1.drainEvents();
    Assert.assertEquals(2, ClusterMetrics.getMetrics().getNumDecommisionedNMs());
    verifyNodesAfterDecom(rm1, 2, expectedCapability, expectedVersion);

    rm1.stop();
    // Cleared so the finally block does not stop the same RM again.
    rm1 = null;
    Assert.assertEquals(0, ClusterMetrics.getMetrics().getNumDecommisionedNMs());

    // restart RM.
    rm2 = new MockRM(conf);
    rm2.start();
    rm2.drainEvents();
    Assert.assertEquals(2, ClusterMetrics.getMetrics().getNumDecommisionedNMs());
    // After the restart the nodes are verified against a zero resource and
    // the version string "unknown".
    verifyNodesAfterDecom(rm2, 2, Resource.newInstance(0, 0), "unknown");
  } finally {
    if (rm1 != null) {
      rm1.stop();
    }
    if (rm2 != null) {
      rm2.stop();
    }
  }
}
Use of org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse in the Apache Hadoop project.
Class TestResourceTrackerService, method testNodeHeartBeatWithInvalidLabels.
/**
 * Sends a heartbeat carrying an invalid node label ("#C") to an RM configured
 * for distributed node labels. The response is expected to be NORMAL, with
 * the labels reported as not accepted and a diagnostics message present.
 *
 * @throws Exception if registration or the heartbeat call fails
 */
@Test
public void testNodeHeartBeatWithInvalidLabels() throws Exception {
  writeToHostsFile("host2");
  Configuration conf = new Configuration();
  conf.set(YarnConfiguration.RM_NODES_INCLUDE_FILE_PATH, hostFile.getAbsolutePath());
  conf.set(YarnConfiguration.NODELABEL_CONFIGURATION_TYPE,
      YarnConfiguration.DISTRIBUTED_NODELABEL_CONFIGURATION_TYPE);
  final RMNodeLabelsManager nodeLabelsMgr = new NullRMNodeLabelsManager();
  rm = new MockRM(conf) {
    @Override
    protected RMNodeLabelsManager createNodeLabelManager() {
      return nodeLabelsMgr;
    }
  };
  rm.start();
  // Stop the RM even when an assertion below fails.
  try {
    try {
      nodeLabelsMgr.addToCluserNodeLabelsWithDefaultExclusivity(toSet("A", "B", "C"));
    } catch (IOException e) {
      // Fail with the original exception attached as the cause, so the reason
      // for the setup failure is visible in the test report.
      throw new AssertionError("Caught Exception while intializing", e);
    }

    ResourceTrackerService resourceTrackerService = rm.getResourceTrackerService();
    RegisterNodeManagerRequest registerReq = Records.newRecord(RegisterNodeManagerRequest.class);
    NodeId nodeId = NodeId.newInstance("host2", 1234);
    Resource capability = BuilderUtils.newResource(1024, 1);
    registerReq.setResource(capability);
    registerReq.setNodeId(nodeId);
    registerReq.setHttpPort(1234);
    registerReq.setNMVersion(YarnVersionInfo.getVersion());
    registerReq.setNodeLabels(toNodeLabelSet("A"));
    RegisterNodeManagerResponse registerResponse =
        resourceTrackerService.registerNodeManager(registerReq);

    NodeHeartbeatRequest heartbeatReq = Records.newRecord(NodeHeartbeatRequest.class);
    // Invalid heart beat labels
    heartbeatReq.setNodeLabels(toNodeLabelSet("B", "#C"));
    heartbeatReq.setNodeStatus(getNodeStatusObject(nodeId));
    heartbeatReq.setLastKnownNMTokenMasterKey(registerResponse.getNMTokenMasterKey());
    heartbeatReq.setLastKnownContainerTokenMasterKey(
        registerResponse.getContainerTokenMasterKey());
    NodeHeartbeatResponse nodeHeartbeatResponse =
        resourceTrackerService.nodeHeartbeat(heartbeatReq);

    // response should be NORMAL when RM heartbeat labels are rejected
    Assert.assertEquals("Response should be NORMAL when RM heartbeat labels" + " are rejected",
        NodeAction.NORMAL, nodeHeartbeatResponse.getNodeAction());
    Assert.assertFalse(nodeHeartbeatResponse.getAreNodeLabelsAcceptedByRM());
    Assert.assertNotNull(nodeHeartbeatResponse.getDiagnosticsMessage());
  } finally {
    rm.stop();
  }
}
Use of org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse in the Apache Hadoop project.
Class TestResourceTrackerService, method testNodeHeartbeatWithCentralLabelConfig.
/**
 * Sends a heartbeat carrying node labels to an RM that uses the default
 * node-label configuration type. The response is expected to be NORMAL, the
 * label manager is expected to hold no labels for the node, and the response
 * is expected to report the labels as not accepted.
 *
 * @throws Exception if registration or the heartbeat call fails
 */
@Test
public void testNodeHeartbeatWithCentralLabelConfig() throws Exception {
  writeToHostsFile("host2");
  Configuration conf = new Configuration();
  conf.set(YarnConfiguration.RM_NODES_INCLUDE_FILE_PATH, hostFile.getAbsolutePath());
  conf.set(YarnConfiguration.NODELABEL_CONFIGURATION_TYPE,
      YarnConfiguration.DEFAULT_NODELABEL_CONFIGURATION_TYPE);
  final RMNodeLabelsManager nodeLabelsMgr = new NullRMNodeLabelsManager();
  rm = new MockRM(conf) {
    @Override
    protected RMNodeLabelsManager createNodeLabelManager() {
      return nodeLabelsMgr;
    }
  };
  rm.start();
  // Stop the RM even when an assertion below fails. rm was assigned and
  // started just above, so no null check is needed here.
  try {
    ResourceTrackerService resourceTrackerService = rm.getResourceTrackerService();
    RegisterNodeManagerRequest req = Records.newRecord(RegisterNodeManagerRequest.class);
    NodeId nodeId = NodeId.newInstance("host2", 1234);
    Resource capability = BuilderUtils.newResource(1024, 1);
    req.setResource(capability);
    req.setNodeId(nodeId);
    req.setHttpPort(1234);
    req.setNMVersion(YarnVersionInfo.getVersion());
    req.setNodeLabels(toNodeLabelSet("A", "B", "C"));
    RegisterNodeManagerResponse registerResponse = resourceTrackerService.registerNodeManager(req);

    NodeHeartbeatRequest heartbeatReq = Records.newRecord(NodeHeartbeatRequest.class);
    // Valid heart beat labels
    heartbeatReq.setNodeLabels(toNodeLabelSet("B"));
    heartbeatReq.setNodeStatus(getNodeStatusObject(nodeId));
    heartbeatReq.setLastKnownNMTokenMasterKey(registerResponse.getNMTokenMasterKey());
    heartbeatReq.setLastKnownContainerTokenMasterKey(
        registerResponse.getContainerTokenMasterKey());
    NodeHeartbeatResponse nodeHeartbeatResponse =
        resourceTrackerService.nodeHeartbeat(heartbeatReq);

    // response should be ok but the RMacceptNodeLabelsUpdate should be false
    Assert.assertEquals(NodeAction.NORMAL, nodeHeartbeatResponse.getNodeAction());
    // no change in the labels,
    Assert.assertNull(nodeLabelsMgr.getNodeLabels().get(nodeId));
    // heartbeat labels rejected
    Assert.assertFalse("Invalid Node Labels should not accepted by RM",
        nodeHeartbeatResponse.getAreNodeLabelsAcceptedByRM());
  } finally {
    rm.stop();
  }
}
Use of org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse in the Apache Hadoop project.
Class TestRMNodeTransitions, method getMockRMNodeStatusEventWithRunningApps.
/**
 * Builds a mocked {@code RMNodeStatusEvent} of type STATUS_UPDATE.
 *
 * The mock reports a health status whose {@code getIsNodeHealthy()} returns
 * true, a mocked {@code NodeHeartbeatResponse} as the latest response, and
 * the result of {@code getAppIdList()} as its keep-alive application ids.
 *
 * @return the stubbed event mock
 */
private RMNodeStatusEvent getMockRMNodeStatusEventWithRunningApps() {
  NodeHeartbeatResponse response = mock(NodeHeartbeatResponse.class);
  NodeHealthStatus healthStatus = mock(NodeHealthStatus.class);
  // Boolean.TRUE replaces the deprecated new Boolean(true) constructor and
  // avoids allocating a fresh wrapper object.
  doReturn(Boolean.TRUE).when(healthStatus).getIsNodeHealthy();

  RMNodeStatusEvent event = mock(RMNodeStatusEvent.class);
  doReturn(healthStatus).when(event).getNodeHealthStatus();
  doReturn(response).when(event).getLatestResponse();
  doReturn(RMNodeEventType.STATUS_UPDATE).when(event).getType();
  doReturn(getAppIdList()).when(event).getKeepAliveAppIds();
  return event;
}
Aggregations