use of org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse in project hadoop by apache.
the class NMSimulator method middleStep.
@Override
public void middleStep() throws Exception {
// we check the lifetime for each running containers
ContainerSimulator cs = null;
synchronized (completedContainerList) {
while ((cs = containerQueue.poll()) != null) {
runningContainers.remove(cs.getId());
completedContainerList.add(cs.getId());
LOG.debug(MessageFormat.format("Container {0} has completed", cs.getId()));
}
}
// send heart beat
NodeHeartbeatRequest beatRequest = Records.newRecord(NodeHeartbeatRequest.class);
beatRequest.setLastKnownNMTokenMasterKey(masterKey);
NodeStatus ns = Records.newRecord(NodeStatus.class);
ns.setContainersStatuses(generateContainerStatusList());
ns.setNodeId(node.getNodeID());
ns.setKeepAliveApplications(new ArrayList<ApplicationId>());
ns.setResponseId(RESPONSE_ID++);
ns.setNodeHealthStatus(NodeHealthStatus.newInstance(true, "", 0));
beatRequest.setNodeStatus(ns);
NodeHeartbeatResponse beatResponse = rm.getResourceTrackerService().nodeHeartbeat(beatRequest);
if (!beatResponse.getContainersToCleanup().isEmpty()) {
// remove from queue
synchronized (releasedContainerList) {
for (ContainerId containerId : beatResponse.getContainersToCleanup()) {
if (amContainerList.contains(containerId)) {
// AM container (not killed?, only release)
synchronized (amContainerList) {
amContainerList.remove(containerId);
}
LOG.debug(MessageFormat.format("NodeManager {0} releases " + "an AM ({1}).", node.getNodeID(), containerId));
} else {
cs = runningContainers.remove(containerId);
containerQueue.remove(cs);
releasedContainerList.add(containerId);
LOG.debug(MessageFormat.format("NodeManager {0} releases a " + "container ({1}).", node.getNodeID(), containerId));
}
}
}
}
if (beatResponse.getNodeAction() == NodeAction.SHUTDOWN) {
lastStep();
}
}
use of org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse in project hadoop by apache.
the class YarnServerBuilderUtils method newNodeHeartbeatResponse.
public static NodeHeartbeatResponse newNodeHeartbeatResponse(int responseId, NodeAction action, List<ContainerId> containersToCleanUp, List<ApplicationId> applicationsToCleanUp, MasterKey containerTokenMasterKey, MasterKey nmTokenMasterKey, long nextHeartbeatInterval) {
NodeHeartbeatResponse response = recordFactory.newRecordInstance(NodeHeartbeatResponse.class);
response.setResponseId(responseId);
response.setNodeAction(action);
response.setContainerTokenMasterKey(containerTokenMasterKey);
response.setNMTokenMasterKey(nmTokenMasterKey);
response.setNextHeartBeatInterval(nextHeartbeatInterval);
if (containersToCleanUp != null) {
response.addAllContainersToCleanup(containersToCleanUp);
}
if (applicationsToCleanUp != null) {
response.addAllApplicationsToCleanup(applicationsToCleanUp);
}
return response;
}
use of org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse in project hadoop by apache.
the class TestNodeStatusUpdater method testConcurrentAccessToSystemCredentials.
@Test
public void testConcurrentAccessToSystemCredentials() {
final Map<ApplicationId, ByteBuffer> testCredentials = new HashMap<>();
ByteBuffer byteBuffer = ByteBuffer.wrap(new byte[300]);
ApplicationId applicationId = ApplicationId.newInstance(123456, 120);
testCredentials.put(applicationId, byteBuffer);
final List<Throwable> exceptions = Collections.synchronizedList(new ArrayList<Throwable>());
final int NUM_THREADS = 10;
final CountDownLatch allDone = new CountDownLatch(NUM_THREADS);
final ExecutorService threadPool = HadoopExecutors.newFixedThreadPool(NUM_THREADS);
final AtomicBoolean stop = new AtomicBoolean(false);
try {
for (int i = 0; i < NUM_THREADS; i++) {
threadPool.submit(new Runnable() {
@Override
public void run() {
try {
for (int i = 0; i < 100 && !stop.get(); i++) {
NodeHeartbeatResponse nodeHeartBeatResponse = newNodeHeartbeatResponse(0, NodeAction.NORMAL, null, null, null, null, 0);
nodeHeartBeatResponse.setSystemCredentialsForApps(testCredentials);
NodeHeartbeatResponseProto proto = ((NodeHeartbeatResponsePBImpl) nodeHeartBeatResponse).getProto();
Assert.assertNotNull(proto);
}
} catch (Throwable t) {
exceptions.add(t);
stop.set(true);
} finally {
allDone.countDown();
}
}
});
}
int testTimeout = 2;
Assert.assertTrue("Timeout waiting for more than " + testTimeout + " " + "seconds", allDone.await(testTimeout, TimeUnit.SECONDS));
} catch (InterruptedException ie) {
exceptions.add(ie);
} finally {
threadPool.shutdownNow();
}
Assert.assertTrue("Test failed with exception(s)" + exceptions, exceptions.isEmpty());
}
use of org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse in project hadoop by apache.
the class TestRMRestart method testRMRestart.
@SuppressWarnings("rawtypes")
@Test(timeout = 180000)
public void testRMRestart() throws Exception {
conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, YarnConfiguration.DEFAULT_RM_AM_MAX_ATTEMPTS);
MemoryRMStateStore memStore = new MemoryRMStateStore();
memStore.init(conf);
RMState rmState = memStore.getState();
Map<ApplicationId, ApplicationStateData> rmAppState = rmState.getApplicationState();
// PHASE 1: create state in an RM
// start RM
MockRM rm1 = createMockRM(conf, memStore);
// start like normal because state is empty
rm1.start();
MockNM nm1 = new MockNM("127.0.0.1:1234", 15120, rm1.getResourceTrackerService());
MockNM nm2 = new MockNM("127.0.0.2:5678", 15120, rm1.getResourceTrackerService());
nm1.registerNode();
// nm2 will not heartbeat with RM1
nm2.registerNode();
// create app that will finish and the final state should be saved.
RMApp app0 = rm1.submitApp(200);
RMAppAttempt attempt0 = app0.getCurrentAppAttempt();
// spot check that app is saved
Assert.assertEquals(1, rmAppState.size());
nm1.nodeHeartbeat(true);
MockAM am0 = rm1.sendAMLaunched(attempt0.getAppAttemptId());
am0.registerAppAttempt();
finishApplicationMaster(app0, rm1, nm1, am0);
// create app that gets launched and does allocate before RM restart
RMApp app1 = rm1.submitApp(200);
// assert app1 info is saved
ApplicationStateData appState = rmAppState.get(app1.getApplicationId());
Assert.assertNotNull(appState);
Assert.assertEquals(0, appState.getAttemptCount());
Assert.assertEquals(appState.getApplicationSubmissionContext().getApplicationId(), app1.getApplicationSubmissionContext().getApplicationId());
//kick the scheduling to allocate AM container
nm1.nodeHeartbeat(true);
// assert app1 attempt is saved
RMAppAttempt attempt1 = app1.getCurrentAppAttempt();
ApplicationAttemptId attemptId1 = attempt1.getAppAttemptId();
rm1.waitForState(attemptId1, RMAppAttemptState.ALLOCATED);
Assert.assertEquals(1, appState.getAttemptCount());
ApplicationAttemptStateData attemptState = appState.getAttempt(attemptId1);
Assert.assertNotNull(attemptState);
Assert.assertEquals(BuilderUtils.newContainerId(attemptId1, 1), attemptState.getMasterContainer().getId());
// launch the AM
MockAM am1 = rm1.sendAMLaunched(attempt1.getAppAttemptId());
am1.registerAppAttempt();
// AM request for containers
am1.allocate("127.0.0.1", 1000, 1, new ArrayList<ContainerId>());
// kick the scheduler
nm1.nodeHeartbeat(true);
List<Container> conts = am1.allocate(new ArrayList<ResourceRequest>(), new ArrayList<ContainerId>()).getAllocatedContainers();
while (conts.size() == 0) {
nm1.nodeHeartbeat(true);
conts.addAll(am1.allocate(new ArrayList<ResourceRequest>(), new ArrayList<ContainerId>()).getAllocatedContainers());
Thread.sleep(500);
}
// create app that does not get launched by RM before RM restart
RMApp app2 = rm1.submitApp(200);
// assert app2 info is saved
appState = rmAppState.get(app2.getApplicationId());
Assert.assertNotNull(appState);
Assert.assertEquals(0, appState.getAttemptCount());
Assert.assertEquals(appState.getApplicationSubmissionContext().getApplicationId(), app2.getApplicationSubmissionContext().getApplicationId());
// create unmanaged app
RMApp appUnmanaged = rm1.submitApp(200, "someApp", "someUser", null, true, null, conf.getInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, YarnConfiguration.DEFAULT_RM_AM_MAX_ATTEMPTS), null);
ApplicationAttemptId unmanagedAttemptId = appUnmanaged.getCurrentAppAttempt().getAppAttemptId();
// assert appUnmanaged info is saved
ApplicationId unmanagedAppId = appUnmanaged.getApplicationId();
appState = rmAppState.get(unmanagedAppId);
Assert.assertNotNull(appState);
// wait for attempt to reach LAUNCHED state
rm1.waitForState(unmanagedAttemptId, RMAppAttemptState.LAUNCHED);
rm1.waitForState(unmanagedAppId, RMAppState.ACCEPTED);
// assert unmanaged attempt info is saved
Assert.assertEquals(1, appState.getAttemptCount());
Assert.assertEquals(appState.getApplicationSubmissionContext().getApplicationId(), appUnmanaged.getApplicationSubmissionContext().getApplicationId());
// PHASE 2: create new RM and start from old state
// create new RM to represent restart and recover state
MockRM rm2 = createMockRM(conf, memStore);
// start new RM
rm2.start();
// change NM to point to new RM
nm1.setResourceTrackerService(rm2.getResourceTrackerService());
nm2.setResourceTrackerService(rm2.getResourceTrackerService());
// verify load of old state
// 4 apps are loaded.
// FINISHED app and attempt is also loaded back.
// Unmanaged app state is still loaded back but it cannot be restarted by
// the RM. this will change with work preserving RM restart in which AMs/NMs
// are not rebooted.
Assert.assertEquals(4, rm2.getRMContext().getRMApps().size());
// check that earlier finished app and attempt is also loaded back and move
// to finished state.
rm2.waitForState(app0.getApplicationId(), RMAppState.FINISHED);
rm2.waitForState(am0.getApplicationAttemptId(), RMAppAttemptState.FINISHED);
// verify correct number of attempts and other data
RMApp loadedApp1 = rm2.getRMContext().getRMApps().get(app1.getApplicationId());
Assert.assertNotNull(loadedApp1);
Assert.assertEquals(1, loadedApp1.getAppAttempts().size());
Assert.assertEquals(app1.getApplicationSubmissionContext().getApplicationId(), loadedApp1.getApplicationSubmissionContext().getApplicationId());
RMApp loadedApp2 = rm2.getRMContext().getRMApps().get(app2.getApplicationId());
Assert.assertNotNull(loadedApp2);
//Assert.assertEquals(0, loadedApp2.getAppAttempts().size());
Assert.assertEquals(app2.getApplicationSubmissionContext().getApplicationId(), loadedApp2.getApplicationSubmissionContext().getApplicationId());
// verify state machine kicked into expected states
rm2.waitForState(loadedApp1.getApplicationId(), RMAppState.ACCEPTED);
rm2.waitForState(loadedApp2.getApplicationId(), RMAppState.ACCEPTED);
// verify attempts for apps
// The app for which AM was started will wait for previous am
// container finish event to arrive. However for an application for which
// no am container was running will start new application attempt.
Assert.assertEquals(1, loadedApp1.getAppAttempts().size());
Assert.assertEquals(1, loadedApp2.getAppAttempts().size());
// verify old AM is not accepted
// change running AM to talk to new RM
am1.setAMRMProtocol(rm2.getApplicationMasterService(), rm2.getRMContext());
try {
am1.allocate(new ArrayList<ResourceRequest>(), new ArrayList<ContainerId>());
Assert.fail();
} catch (ApplicationAttemptNotFoundException e) {
Assert.assertTrue(e instanceof ApplicationAttemptNotFoundException);
}
// NM should be rebooted on heartbeat, even first heartbeat for nm2
NodeHeartbeatResponse hbResponse = nm1.nodeHeartbeat(true);
Assert.assertEquals(NodeAction.RESYNC, hbResponse.getNodeAction());
hbResponse = nm2.nodeHeartbeat(true);
Assert.assertEquals(NodeAction.RESYNC, hbResponse.getNodeAction());
// new NM to represent NM re-register
nm1 = new MockNM("127.0.0.1:1234", 15120, rm2.getResourceTrackerService());
nm2 = new MockNM("127.0.0.2:5678", 15120, rm2.getResourceTrackerService());
NMContainerStatus status = TestRMRestart.createNMContainerStatus(loadedApp1.getCurrentAppAttempt().getAppAttemptId(), 1, ContainerState.COMPLETE);
nm1.registerNode(Arrays.asList(status), null);
nm2.registerNode();
rm2.waitForState(loadedApp1.getApplicationId(), RMAppState.ACCEPTED);
// wait for the 2nd attempt to be started.
int timeoutSecs = 0;
while (loadedApp1.getAppAttempts().size() != 2 && timeoutSecs++ < 40) {
;
Thread.sleep(200);
}
// verify no more reboot response sent
hbResponse = nm1.nodeHeartbeat(true);
Assert.assertTrue(NodeAction.RESYNC != hbResponse.getNodeAction());
hbResponse = nm2.nodeHeartbeat(true);
Assert.assertTrue(NodeAction.RESYNC != hbResponse.getNodeAction());
// assert app1 attempt is saved
attempt1 = loadedApp1.getCurrentAppAttempt();
attemptId1 = attempt1.getAppAttemptId();
rm2.waitForState(attemptId1, RMAppAttemptState.ALLOCATED);
appState = rmAppState.get(loadedApp1.getApplicationId());
attemptState = appState.getAttempt(attemptId1);
Assert.assertNotNull(attemptState);
Assert.assertEquals(BuilderUtils.newContainerId(attemptId1, 1), attemptState.getMasterContainer().getId());
// Nodes on which the AM's run
MockNM am1Node = nm1;
if (attemptState.getMasterContainer().getNodeId().toString().contains("127.0.0.2")) {
am1Node = nm2;
}
// assert app2 attempt is saved
RMAppAttempt attempt2 = loadedApp2.getCurrentAppAttempt();
ApplicationAttemptId attemptId2 = attempt2.getAppAttemptId();
rm2.waitForState(attemptId2, RMAppAttemptState.ALLOCATED);
appState = rmAppState.get(loadedApp2.getApplicationId());
attemptState = appState.getAttempt(attemptId2);
Assert.assertNotNull(attemptState);
Assert.assertEquals(BuilderUtils.newContainerId(attemptId2, 1), attemptState.getMasterContainer().getId());
MockNM am2Node = nm1;
if (attemptState.getMasterContainer().getNodeId().toString().contains("127.0.0.2")) {
am2Node = nm2;
}
// start the AM's
am1 = rm2.sendAMLaunched(attempt1.getAppAttemptId());
am1.registerAppAttempt();
MockAM am2 = rm2.sendAMLaunched(attempt2.getAppAttemptId());
am2.registerAppAttempt();
//request for containers
am1.allocate("127.0.0.1", 1000, 3, new ArrayList<ContainerId>());
am2.allocate("127.0.0.2", 1000, 1, new ArrayList<ContainerId>());
// verify container allocate continues to work
nm1.nodeHeartbeat(true);
nm2.nodeHeartbeat(true);
conts = am1.allocate(new ArrayList<ResourceRequest>(), new ArrayList<ContainerId>()).getAllocatedContainers();
while (conts.size() == 0) {
nm1.nodeHeartbeat(true);
nm2.nodeHeartbeat(true);
conts.addAll(am1.allocate(new ArrayList<ResourceRequest>(), new ArrayList<ContainerId>()).getAllocatedContainers());
Thread.sleep(500);
}
// finish the AMs
finishApplicationMaster(loadedApp1, rm2, am1Node, am1);
finishApplicationMaster(loadedApp2, rm2, am2Node, am2);
// stop RM's
rm2.stop();
rm1.stop();
// completed apps are not removed immediately after app finish
// And finished app is also loaded back.
Assert.assertEquals(4, rmAppState.size());
}
use of org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse in project hadoop by apache.
the class TestResourceTrackerService method testGracefulDecommissionNoApp.
/**
* Graceful decommission node with no running application.
*/
@Test
public void testGracefulDecommissionNoApp() throws Exception {
Configuration conf = new Configuration();
conf.set(YarnConfiguration.RM_NODES_EXCLUDE_FILE_PATH, hostFile.getAbsolutePath());
writeToHostsFile("");
rm = new MockRM(conf);
rm.start();
MockNM nm1 = rm.registerNode("host1:1234", 5120);
MockNM nm2 = rm.registerNode("host2:5678", 10240);
MockNM nm3 = rm.registerNode("host3:4433", 5120);
int metricCount = ClusterMetrics.getMetrics().getNumDecommisionedNMs();
NodeHeartbeatResponse nodeHeartbeat1 = nm1.nodeHeartbeat(true);
NodeHeartbeatResponse nodeHeartbeat2 = nm2.nodeHeartbeat(true);
NodeHeartbeatResponse nodeHeartbeat3 = nm3.nodeHeartbeat(true);
Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat1.getNodeAction()));
Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat2.getNodeAction()));
Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat3.getNodeAction()));
rm.waitForState(nm2.getNodeId(), NodeState.RUNNING);
rm.waitForState(nm3.getNodeId(), NodeState.RUNNING);
// Graceful decommission both host2 and host3.
writeToHostsFile("host2", "host3");
rm.getNodesListManager().refreshNodes(conf, true);
rm.waitForState(nm2.getNodeId(), NodeState.DECOMMISSIONING);
rm.waitForState(nm3.getNodeId(), NodeState.DECOMMISSIONING);
nodeHeartbeat1 = nm1.nodeHeartbeat(true);
nodeHeartbeat2 = nm2.nodeHeartbeat(true);
nodeHeartbeat3 = nm3.nodeHeartbeat(true);
checkDecommissionedNMCount(rm, metricCount + 2);
rm.waitForState(nm2.getNodeId(), NodeState.DECOMMISSIONED);
rm.waitForState(nm3.getNodeId(), NodeState.DECOMMISSIONED);
Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat1.getNodeAction()));
Assert.assertEquals(NodeAction.SHUTDOWN, nodeHeartbeat2.getNodeAction());
Assert.assertEquals(NodeAction.SHUTDOWN, nodeHeartbeat3.getNodeAction());
}
Aggregations