use of org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse in project hadoop by apache.
the class TestRMNMSecretKeys method validateRMNMKeyExchange.
/**
 * Brings up a ResourceManager backed by a {@link DrainDispatcher}, registers
 * a mock node and checks which responses carry container-token and NM-token
 * master keys: registration must deliver both keys, ordinary heartbeats must
 * deliver none, the first heartbeat after a forced roll-over must deliver
 * keys whose id is exactly one higher than the registration keys, and
 * activating the rolled keys must not trigger any further update.
 *
 * @param conf configuration used to initialise the ResourceManager
 * @throws Exception if the ResourceManager or the mock node fails
 */
private void validateRMNMKeyExchange(YarnConfiguration conf) throws Exception {
  // Default rolling and activation intervals are large enough, no need to
  // intervene
  final DrainDispatcher dispatcher = new DrainDispatcher();
  ResourceManager rm = new ResourceManager() {
    @Override
    protected void doSecureLogin() throws IOException {
      // Do nothing.
    }

    @Override
    protected Dispatcher createDispatcher() {
      return dispatcher;
    }

    @Override
    protected void startWepApp() {
      // Don't need it, skip.
    }
  };
  rm.init(conf);
  rm.start();
  try {
    // Testing ContainerToken and NMToken
    String containerToken = "Container Token : ";
    String nmToken = "NM Token : ";

    MockNM nm = new MockNM("host:1234", 3072, rm.getResourceTrackerService());
    RegisterNodeManagerResponse registrationResponse = nm.registerNode();

    MasterKey containerTokenMasterKey = registrationResponse.getContainerTokenMasterKey();
    Assert.assertNotNull(containerToken + "Registration should cause a key-update!", containerTokenMasterKey);
    MasterKey nmTokenMasterKey = registrationResponse.getNMTokenMasterKey();
    Assert.assertNotNull(nmToken + "Registration should cause a key-update!", nmTokenMasterKey);
    dispatcher.await();

    NodeHeartbeatResponse response = nm.nodeHeartbeat(true);
    Assert.assertNull(containerToken + "First heartbeat after registration shouldn't get any key updates!", response.getContainerTokenMasterKey());
    Assert.assertNull(nmToken + "First heartbeat after registration shouldn't get any key updates!", response.getNMTokenMasterKey());
    dispatcher.await();

    response = nm.nodeHeartbeat(true);
    Assert.assertNull(containerToken + "Even second heartbeat after registration shouldn't get any key updates!", response.getContainerTokenMasterKey());
    // The NM-token assertion must inspect the NM-token key, not the
    // container-token key a second time.
    Assert.assertNull(nmToken + "Even second heartbeat after registration shouldn't get any key updates!", response.getNMTokenMasterKey());
    dispatcher.await();

    // Let's force a roll-over
    rm.getRMContext().getContainerTokenSecretManager().rollMasterKey();
    rm.getRMContext().getNMTokenSecretManager().rollMasterKey();

    // Heartbeats after roll-over and before activation should be fine.
    response = nm.nodeHeartbeat(true);
    Assert.assertNotNull(containerToken + "Heartbeats after roll-over and before activation should not err out.", response.getContainerTokenMasterKey());
    Assert.assertNotNull(nmToken + "Heartbeats after roll-over and before activation should not err out.", response.getNMTokenMasterKey());
    Assert.assertEquals(containerToken + "Roll-over should have incremented the key-id only by one!", containerTokenMasterKey.getKeyId() + 1, response.getContainerTokenMasterKey().getKeyId());
    Assert.assertEquals(nmToken + "Roll-over should have incremented the key-id only by one!", nmTokenMasterKey.getKeyId() + 1, response.getNMTokenMasterKey().getKeyId());
    dispatcher.await();

    response = nm.nodeHeartbeat(true);
    Assert.assertNull(containerToken + "Second heartbeat after roll-over shouldn't get any key updates!", response.getContainerTokenMasterKey());
    Assert.assertNull(nmToken + "Second heartbeat after roll-over shouldn't get any key updates!", response.getNMTokenMasterKey());
    dispatcher.await();

    // Let's force activation
    rm.getRMContext().getContainerTokenSecretManager().activateNextMasterKey();
    rm.getRMContext().getNMTokenSecretManager().activateNextMasterKey();

    response = nm.nodeHeartbeat(true);
    Assert.assertNull(containerToken + "Activation shouldn't cause any key updates!", response.getContainerTokenMasterKey());
    Assert.assertNull(nmToken + "Activation shouldn't cause any key updates!", response.getNMTokenMasterKey());
    dispatcher.await();

    response = nm.nodeHeartbeat(true);
    Assert.assertNull(containerToken + "Even second heartbeat after activation shouldn't get any key updates!", response.getContainerTokenMasterKey());
    Assert.assertNull(nmToken + "Even second heartbeat after activation shouldn't get any key updates!", response.getNMTokenMasterKey());
    dispatcher.await();
  } finally {
    // Stop the ResourceManager even when an assertion fails so a failing
    // run does not leave it running.
    rm.stop();
  }
}
use of org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse in project hadoop by apache.
the class YarnServerBuilderUtils method newNodeHeartbeatResponse.
/**
 * Creates a {@link NodeHeartbeatResponse} record through the record factory
 * and fills in only the node action and the diagnostics message.
 *
 * @param action action to set on the response
 * @param diagnosticsMessage diagnostics text to set on the response
 * @return the newly created response
 */
public static NodeHeartbeatResponse newNodeHeartbeatResponse(NodeAction action, String diagnosticsMessage) {
  final NodeHeartbeatResponse heartbeatResponse =
      recordFactory.newRecordInstance(NodeHeartbeatResponse.class);
  heartbeatResponse.setNodeAction(action);
  heartbeatResponse.setDiagnosticsMessage(diagnosticsMessage);
  return heartbeatResponse;
}
use of org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse in project hadoop by apache.
the class ResourceTrackerService method nodeHeartbeat.
/**
 * Handles one heartbeat request from a node and builds the response for it.
 *
 * <p>The method returns early, in this order, with:
 * <ul>
 * <li>a SHUTDOWN response when the node's host is not valid and the node is
 * not in decommissioning;</li>
 * <li>a RESYNC response when the node id is not in the RM context's node
 * map;</li>
 * <li>the previously saved response when the request's response id plus one
 * equals the last response id (duplicate heartbeat);</li>
 * <li>a RESYNC response, after dispatching a REBOOTING node event, when the
 * request's response id is further behind than that;</li>
 * <li>a SHUTDOWN response, after dispatching a DECOMMISSION node event and
 * unregistering the node from the liveliness monitor, when a DECOMMISSIONING
 * node is ready to be decommissioned.</li>
 * </ul>
 * Otherwise it builds a NORMAL response with the next response id, fills in
 * keys, system credentials, collector info (timeline v2 only), node-label
 * acceptance, resource and container queuing limit, and dispatches an
 * {@link RMNodeStatusEvent} carrying the node status and that response.
 *
 * @param request the heartbeat request; its node status supplies the node id
 *        and the response id
 * @return the response to send back to the node
 * @throws YarnException declared by the signature; not thrown directly by the
 *         statements visible here (NOTE(review): may come from helpers)
 * @throws IOException declared by the signature; the IOException from the
 *         node-label update is caught and reported in the response instead
 */
@SuppressWarnings("unchecked")
@Override
public NodeHeartbeatResponse nodeHeartbeat(NodeHeartbeatRequest request) throws YarnException, IOException {
NodeStatus remoteNodeStatus = request.getNodeStatus();
/**
* Here is the node heartbeat sequence...
* 1. Check if it's a valid (i.e. not excluded) node
* 2. Check if it's a registered node
* 3. Check if it's a 'fresh' heartbeat i.e. not duplicate heartbeat
* 4. Send healthStatus to RMNode
* 5. Update node's labels if distributed Node Labels configuration is enabled
*/
NodeId nodeId = remoteNodeStatus.getNodeId();
// 1. Check if it's a valid (i.e. not excluded) node; a node that is not
// valid is still allowed to heartbeat while it is in decommissioning.
if (!this.nodesListManager.isValidNode(nodeId.getHost()) && !isNodeInDecommissioning(nodeId)) {
String message = "Disallowed NodeManager nodeId: " + nodeId + " hostname: " + nodeId.getHost();
LOG.info(message);
return YarnServerBuilderUtils.newNodeHeartbeatResponse(NodeAction.SHUTDOWN, message);
}
// 2. Check if it's a registered node
RMNode rmNode = this.rmContext.getRMNodes().get(nodeId);
if (rmNode == null) {
/* node does not exist */
String message = "Node not found resyncing " + remoteNodeStatus.getNodeId();
LOG.info(message);
return YarnServerBuilderUtils.newNodeHeartbeatResponse(NodeAction.RESYNC, message);
}
// Send ping
this.nmLivelinessMonitor.receivedPing(nodeId);
this.decommissioningWatcher.update(rmNode, remoteNodeStatus);
// 3. Check if it's a 'fresh' heartbeat i.e. not duplicate heartbeat
NodeHeartbeatResponse lastNodeHeartbeatResponse = rmNode.getLastNodeHeartBeatResponse();
// A request id exactly one behind the last response id is treated as a
// duplicate: the saved response is returned again unchanged.
if (remoteNodeStatus.getResponseId() + 1 == lastNodeHeartbeatResponse.getResponseId()) {
LOG.info("Received duplicate heartbeat from node " + rmNode.getNodeAddress() + " responseId=" + remoteNodeStatus.getResponseId());
return lastNodeHeartbeatResponse;
} else if (remoteNodeStatus.getResponseId() + 1 < lastNodeHeartbeatResponse.getResponseId()) {
// The node is more than one response behind: raise REBOOTING for it and
// tell it to resync.
String message = "Too far behind rm response id:" + lastNodeHeartbeatResponse.getResponseId() + " nm response id:" + remoteNodeStatus.getResponseId();
LOG.info(message);
// TODO: Just sending reboot is not enough. Think more.
this.rmContext.getDispatcher().getEventHandler().handle(new RMNodeEvent(nodeId, RMNodeEventType.REBOOTING));
return YarnServerBuilderUtils.newNodeHeartbeatResponse(NodeAction.RESYNC, message);
}
boolean timelineV2Enabled = YarnConfiguration.timelineServiceV2Enabled(getConfig());
if (timelineV2Enabled) {
// Check & update collectors info from request.
// TODO make sure it won't have race condition issue for AM failed over
// case that the older registration could possible override the newer
// one.
updateAppCollectorsMap(request);
}
// Evaluate whether a DECOMMISSIONING node is ready to be DECOMMISSIONED.
if (rmNode.getState() == NodeState.DECOMMISSIONING && decommissioningWatcher.checkReadyToBeDecommissioned(rmNode.getNodeID())) {
String message = "DECOMMISSIONING " + nodeId + " is ready to be decommissioned";
LOG.info(message);
this.rmContext.getDispatcher().getEventHandler().handle(new RMNodeEvent(nodeId, RMNodeEventType.DECOMMISSION));
this.nmLivelinessMonitor.unregister(nodeId);
return YarnServerBuilderUtils.newNodeHeartbeatResponse(NodeAction.SHUTDOWN, message);
}
// Heartbeat response: next response id, NORMAL action and
// nextHeartBeatInterval; the remaining builder arguments are left null.
NodeHeartbeatResponse nodeHeartBeatResponse = YarnServerBuilderUtils.newNodeHeartbeatResponse(lastNodeHeartbeatResponse.getResponseId() + 1, NodeAction.NORMAL, null, null, null, null, nextHeartBeatInterval);
rmNode.updateNodeHeartbeatResponseForCleanup(nodeHeartBeatResponse);
rmNode.updateNodeHeartbeatResponseForContainersDecreasing(nodeHeartBeatResponse);
populateKeys(request, nodeHeartBeatResponse);
// System credentials are attached only when there are some to send.
ConcurrentMap<ApplicationId, ByteBuffer> systemCredentials = rmContext.getSystemCredentialsForApps();
if (!systemCredentials.isEmpty()) {
nodeHeartBeatResponse.setSystemCredentialsForApps(systemCredentials);
}
if (timelineV2Enabled) {
// Return collectors' map that NM needs to know
setAppCollectorsMapToResponse(rmNode.getRunningApps(), nodeHeartBeatResponse);
}
// 4. Send status to RMNode, saving the latest response.
RMNodeStatusEvent nodeStatusEvent = new RMNodeStatusEvent(nodeId, remoteNodeStatus, nodeHeartBeatResponse);
if (request.getLogAggregationReportsForApps() != null && !request.getLogAggregationReportsForApps().isEmpty()) {
nodeStatusEvent.setLogAggregationReportsForApps(request.getLogAggregationReportsForApps());
}
this.rmContext.getDispatcher().getEventHandler().handle(nodeStatusEvent);
// 5. Update node's labels to RM's NodeLabelManager.
if (isDistributedNodeLabelsConf && request.getNodeLabels() != null) {
try {
updateNodeLabelsFromNMReport(NodeLabelsUtils.convertToStringSet(request.getNodeLabels()), nodeId);
nodeHeartBeatResponse.setAreNodeLabelsAcceptedByRM(true);
} catch (IOException ex) {
//ensure the error message is captured and sent across in response
nodeHeartBeatResponse.setDiagnosticsMessage(ex.getMessage());
nodeHeartBeatResponse.setAreNodeLabelsAcceptedByRM(false);
}
}
// 6. check if node's capacity is load from dynamic-resources.xml
// if so, send updated resource back to NM.
String nid = nodeId.toString();
Resource capability = loadNodeResourceFromDRConfiguration(nid);
// sync back with new resource if not null.
if (capability != null) {
nodeHeartBeatResponse.setResource(capability);
}
// 7. Send the container queuing limit back to the node when a queue limit
// calculator is present; presumably this is used by the node to truncate
// the number of Containers queued for execution.
if (this.rmContext.getNodeManagerQueueLimitCalculator() != null) {
nodeHeartBeatResponse.setContainerQueuingLimit(this.rmContext.getNodeManagerQueueLimitCalculator().createContainerQueuingLimit());
}
return nodeHeartBeatResponse;
}
use of org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse in project hadoop by apache.
the class TestAMRMClientOnRMRestart method testAMRMClientResendsRequestsOnRMRestart.
// The test verifies six major steps.
// Step-1 : AMRMClient sends an allocate request for 3 container requests
// Step-2 : 3 containers are allocated by RM.
// Step-3 : AM sends 1 containerRequest(cRequest4) and 1 releaseRequest to
// RM
// Step-3.5 : AM sends 1 container resource increase request to RM
// Step-4 : On RM restart, AM (does not know RM is restarted) sends an
// additional containerRequest(cRequest5) and blacklisted nodes.
// In turn RM sends the resync command
// Verify AM can recover the increase request after resync
// Step-5 : Allocate after the resync command & new containerRequest(cRequest6)
// Step-6 : RM allocates containers i.e. cRequest4, cRequest5 and cRequest6
@Test(timeout = 60000)
public void testAMRMClientResendsRequestsOnRMRestart() throws Exception {
  UserGroupInformation.setLoginUser(null);
  MemoryRMStateStore memStore = new MemoryRMStateStore();
  memStore.init(conf);

  // Phase-1 Start 1st RM
  MyResourceManager rm1 = new MyResourceManager(conf, memStore);
  rm1.start();
  DrainDispatcher dispatcher = (DrainDispatcher) rm1.getRMContext().getDispatcher();

  // Submit the application
  RMApp app = rm1.submitApp(1024);
  dispatcher.await();

  MockNM nm1 = new MockNM("h1:1234", 15120, rm1.getResourceTrackerService());
  nm1.registerNode();
  // Node heartbeat
  nm1.nodeHeartbeat(true);
  dispatcher.await();

  ApplicationAttemptId appAttemptId = app.getCurrentAppAttempt().getAppAttemptId();
  rm1.sendAMLaunched(appAttemptId);
  dispatcher.await();

  org.apache.hadoop.security.token.Token<AMRMTokenIdentifier> token = rm1.getRMContext().getRMApps().get(appAttemptId.getApplicationId()).getRMAppAttempt(appAttemptId).getAMRMToken();
  UserGroupInformation ugi = UserGroupInformation.getCurrentUser();
  ugi.addTokenIdentifier(token.decodeIdentifier());

  // Step-1 : AMRMClient send allocate request for 3 ContainerRequest
  // cRequest1 = h1, cRequest2 = h1,h2 and cRequest3 = h1
  // blacklisted nodes = h2
  AMRMClient<ContainerRequest> amClient = new MyAMRMClientImpl(rm1);
  amClient.init(conf);
  amClient.start();
  amClient.registerApplicationMaster("Host", 10000, "");

  ContainerRequest cRequest1 = createReq(1, 1024, new String[] { "h1" });
  amClient.addContainerRequest(cRequest1);
  ContainerRequest cRequest2 = createReq(1, 1024, new String[] { "h1", "h2" });
  amClient.addContainerRequest(cRequest2);
  ContainerRequest cRequest3 = createReq(1, 1024, new String[] { "h1" });
  amClient.addContainerRequest(cRequest3);

  List<String> blacklistAdditions = new ArrayList<String>();
  List<String> blacklistRemoval = new ArrayList<String>();
  blacklistAdditions.add("h2");
  blacklistRemoval.add("h10");
  amClient.updateBlacklist(blacklistAdditions, blacklistRemoval);
  // remove from local list
  blacklistAdditions.remove("h2");

  AllocateResponse allocateResponse = amClient.allocate(0.1f);
  dispatcher.await();
  Assert.assertEquals("No of assignments must be 0", 0, allocateResponse.getAllocatedContainers().size());
  // Why 4 asks and not 3, even though h2 is blacklisted?
  // On blacklisting a host, the application master has to remove the ask
  // from the remote request table. Here the test does not remove it
  // explicitly.
  assertAsksAndReleases(4, 0, rm1);
  assertBlacklistAdditionsAndRemovals(1, 1, rm1);

  // Step-2 : NM heart beat is sent.
  // On 2nd AM allocate request, RM allocates 3 containers to AM
  // Node heartbeat
  nm1.nodeHeartbeat(true);
  dispatcher.await();
  allocateResponse = amClient.allocate(0.2f);
  dispatcher.await();
  // 3 containers are allocated i.e for cRequest1, cRequest2 and cRequest3.
  // The failure message now matches the expected value of 3.
  Assert.assertEquals("No of assignments must be 3", 3, allocateResponse.getAllocatedContainers().size());
  assertAsksAndReleases(0, 0, rm1);
  assertBlacklistAdditionsAndRemovals(0, 0, rm1);
  List<Container> allocatedContainers = allocateResponse.getAllocatedContainers();

  // removed allocated container requests
  amClient.removeContainerRequest(cRequest1);
  amClient.removeContainerRequest(cRequest2);
  amClient.removeContainerRequest(cRequest3);
  allocateResponse = amClient.allocate(0.2f);
  dispatcher.await();
  Assert.assertEquals("No of assignments must be 0", 0, allocateResponse.getAllocatedContainers().size());
  assertAsksAndReleases(4, 0, rm1);
  assertBlacklistAdditionsAndRemovals(0, 0, rm1);

  // Step-3 : Send 1 containerRequest and 1 releaseRequests to RM
  ContainerRequest cRequest4 = createReq(1, 1024, new String[] { "h1" });
  amClient.addContainerRequest(cRequest4);
  int pendingRelease = 0;
  Iterator<Container> it = allocatedContainers.iterator();
  // Release (and drop from the local list) exactly one container.
  if (it.hasNext()) {
    amClient.releaseAssignedContainer(it.next().getId());
    pendingRelease++;
    it.remove();
  }

  // Step-3.5 : Send 1 container resource increase request to RM
  Container container = it.next();
  ContainerId containerId = container.getId();
  // Make sure that container is in RUNNING state before sending increase
  // request
  nm1.nodeHeartbeat(containerId.getApplicationAttemptId(), containerId.getContainerId(), ContainerState.RUNNING);
  dispatcher.await();
  amClient.requestContainerUpdate(container, UpdateContainerRequest.newInstance(container.getVersion(), container.getId(), ContainerUpdateType.INCREASE_RESOURCE, Resource.newInstance(2048, 1), null));
  it.remove();
  allocateResponse = amClient.allocate(0.3f);
  dispatcher.await();
  Assert.assertEquals("No of assignments must be 0", 0, allocateResponse.getAllocatedContainers().size());
  assertAsksAndReleases(3, pendingRelease, rm1);
  // Verify there is one increase and zero decrease
  assertChanges(1, 0, rm1);
  assertBlacklistAdditionsAndRemovals(0, 0, rm1);
  int completedContainer = allocateResponse.getCompletedContainersStatuses().size();
  pendingRelease -= completedContainer;

  // Phase-2 start 2nd RM is up
  MyResourceManager rm2 = new MyResourceManager(conf, memStore);
  rm2.start();
  nm1.setResourceTrackerService(rm2.getResourceTrackerService());
  ((MyAMRMClientImpl) amClient).updateRMProxy(rm2);
  dispatcher = (DrainDispatcher) rm2.getRMContext().getDispatcher();

  // NM should be rebooted on heartbeat, even first heartbeat for nm2
  NodeHeartbeatResponse hbResponse = nm1.nodeHeartbeat(true);
  Assert.assertEquals(NodeAction.RESYNC, hbResponse.getNodeAction());

  // new NM to represent NM re-register
  nm1 = new MockNM("h1:1234", 10240, rm2.getResourceTrackerService());
  NMContainerStatus containerReport = NMContainerStatus.newInstance(containerId, 0, ContainerState.RUNNING, Resource.newInstance(1024, 1), "recover container", 0, Priority.newInstance(0), 0);
  nm1.registerNode(Collections.singletonList(containerReport), Collections.singletonList(containerId.getApplicationAttemptId().getApplicationId()));
  nm1.nodeHeartbeat(true);
  dispatcher.await();

  blacklistAdditions.add("h3");
  amClient.updateBlacklist(blacklistAdditions, null);
  blacklistAdditions.remove("h3");

  // Release every container still held locally.
  it = allocatedContainers.iterator();
  while (it.hasNext()) {
    amClient.releaseAssignedContainer(it.next().getId());
    pendingRelease++;
    it.remove();
  }

  ContainerRequest cRequest5 = createReq(1, 1024, new String[] { "h1", "h2" });
  amClient.addContainerRequest(cRequest5);

  // Step-4 : On RM restart, AM (does not know RM is restarted) sends an
  // additional containerRequest and blacklisted nodes.
  // In turn RM sends the resync command, AMRMClient resends the allocate
  // request
  allocateResponse = amClient.allocate(0.3f);
  dispatcher.await();
  completedContainer = allocateResponse.getCompletedContainersStatuses().size();
  pendingRelease -= completedContainer;
  assertAsksAndReleases(4, pendingRelease, rm2);
  // Verify there is one increase and zero decrease
  assertChanges(1, 0, rm2);
  assertBlacklistAdditionsAndRemovals(2, 0, rm2);

  ContainerRequest cRequest6 = createReq(1, 1024, new String[] { "h1", "h2", "h3" });
  amClient.addContainerRequest(cRequest6);

  // Step-5 : Allocate after the resync command
  allocateResponse = amClient.allocate(0.5f);
  dispatcher.await();
  Assert.assertEquals("No of assignments must be 0", 0, allocateResponse.getAllocatedContainers().size());
  assertAsksAndReleases(5, 0, rm2);
  // Verify there is no increase or decrease requests any more
  assertChanges(0, 0, rm2);
  assertBlacklistAdditionsAndRemovals(0, 0, rm2);

  // Heartbeat and allocate up to five times, stopping as soon as all three
  // outstanding requests have been satisfied.
  int noAssignedContainer = 0;
  int count = 5;
  while (count-- > 0) {
    nm1.nodeHeartbeat(true);
    dispatcher.await();
    allocateResponse = amClient.allocate(0.5f);
    dispatcher.await();
    noAssignedContainer += allocateResponse.getAllocatedContainers().size();
    if (noAssignedContainer == 3) {
      break;
    }
    Thread.sleep(1000);
  }

  // Step-6 : RM allocates containers i.e cRequest4,cRequest5 and cRequest6
  Assert.assertEquals("Number of container should be 3", 3, noAssignedContainer);

  amClient.stop();
  rm1.stop();
  rm2.stop();
}
Aggregations