use of org.apache.hadoop.yarn.server.api.records.NodeStatus in project hadoop by apache.
the class TestResourceTrackerOnHA method testResourceTrackerOnHA.
@Test(timeout = 15000)
public void testResourceTrackerOnHA() throws Exception {
NodeId nodeId = NodeId.newInstance("localhost", 0);
Resource resource = Resource.newInstance(2048, 4);
// make sure registerNodeManager works when failover happens
RegisterNodeManagerRequest request = RegisterNodeManagerRequest.newInstance(nodeId, 0, resource, YarnVersionInfo.getVersion(), null, null);
resourceTracker.registerNodeManager(request);
Assert.assertTrue(waitForNodeManagerToConnect(10000, nodeId));
// restart the failover thread, and make sure nodeHeartbeat works
failoverThread = createAndStartFailoverThread();
NodeStatus status = NodeStatus.newInstance(NodeId.newInstance("localhost", 0), 0, null, null, null, null, null, null);
NodeHeartbeatRequest request2 = NodeHeartbeatRequest.newInstance(status, null, null, null);
resourceTracker.nodeHeartbeat(request2);
}
use of org.apache.hadoop.yarn.server.api.records.NodeStatus in project hadoop by apache.
the class TestProtocolRecords method testNodeHeartBeatRequest.
@Test
public void testNodeHeartBeatRequest() throws IOException {
NodeHeartbeatRequest record = Records.newRecord(NodeHeartbeatRequest.class);
NodeStatus nodeStatus = Records.newRecord(NodeStatus.class);
OpportunisticContainersStatus opportunisticContainersStatus = Records.newRecord(OpportunisticContainersStatus.class);
opportunisticContainersStatus.setEstimatedQueueWaitTime(123);
opportunisticContainersStatus.setWaitQueueLength(321);
nodeStatus.setOpportunisticContainersStatus(opportunisticContainersStatus);
record.setNodeStatus(nodeStatus);
NodeHeartbeatRequestPBImpl pb = new NodeHeartbeatRequestPBImpl(((NodeHeartbeatRequestPBImpl) record).getProto());
Assert.assertEquals(123, pb.getNodeStatus().getOpportunisticContainersStatus().getEstimatedQueueWaitTime());
Assert.assertEquals(321, pb.getNodeStatus().getOpportunisticContainersStatus().getWaitQueueLength());
}
use of org.apache.hadoop.yarn.server.api.records.NodeStatus in project hadoop by apache.
the class NodeStatusUpdaterImpl method getNodeStatus.
@VisibleForTesting
protected NodeStatus getNodeStatus(int responseId) throws IOException {
NodeHealthStatus nodeHealthStatus = this.context.getNodeHealthStatus();
nodeHealthStatus.setHealthReport(healthChecker.getHealthReport());
nodeHealthStatus.setIsNodeHealthy(healthChecker.isHealthy());
nodeHealthStatus.setLastHealthReportTime(healthChecker.getLastHealthReportTime());
if (LOG.isDebugEnabled()) {
LOG.debug("Node's health-status : " + nodeHealthStatus.getIsNodeHealthy() + ", " + nodeHealthStatus.getHealthReport());
}
List<ContainerStatus> containersStatuses = getContainerStatuses();
ResourceUtilization containersUtilization = getContainersUtilization();
ResourceUtilization nodeUtilization = getNodeUtilization();
List<org.apache.hadoop.yarn.api.records.Container> increasedContainers = getIncreasedContainers();
NodeStatus nodeStatus = NodeStatus.newInstance(nodeId, responseId, containersStatuses, createKeepAliveApplicationList(), nodeHealthStatus, containersUtilization, nodeUtilization, increasedContainers);
nodeStatus.setOpportunisticContainersStatus(getOpportunisticContainersStatus());
return nodeStatus;
}
use of org.apache.hadoop.yarn.server.api.records.NodeStatus in project hadoop by apache.
the class ResourceTrackerService method nodeHeartbeat.
@SuppressWarnings("unchecked")
@Override
public NodeHeartbeatResponse nodeHeartbeat(NodeHeartbeatRequest request) throws YarnException, IOException {
NodeStatus remoteNodeStatus = request.getNodeStatus();
/**
* Here is the node heartbeat sequence...
* 1. Check if it's a valid (i.e. not excluded) node
* 2. Check if it's a registered node
* 3. Check if it's a 'fresh' heartbeat i.e. not duplicate heartbeat
* 4. Send healthStatus to RMNode
* 5. Update node's labels if distributed Node Labels configuration is enabled
*/
NodeId nodeId = remoteNodeStatus.getNodeId();
// in decommissioning.
if (!this.nodesListManager.isValidNode(nodeId.getHost()) && !isNodeInDecommissioning(nodeId)) {
String message = "Disallowed NodeManager nodeId: " + nodeId + " hostname: " + nodeId.getHost();
LOG.info(message);
return YarnServerBuilderUtils.newNodeHeartbeatResponse(NodeAction.SHUTDOWN, message);
}
// 2. Check if it's a registered node
RMNode rmNode = this.rmContext.getRMNodes().get(nodeId);
if (rmNode == null) {
/* node does not exist */
String message = "Node not found resyncing " + remoteNodeStatus.getNodeId();
LOG.info(message);
return YarnServerBuilderUtils.newNodeHeartbeatResponse(NodeAction.RESYNC, message);
}
// Send ping
this.nmLivelinessMonitor.receivedPing(nodeId);
this.decommissioningWatcher.update(rmNode, remoteNodeStatus);
// 3. Check if it's a 'fresh' heartbeat i.e. not duplicate heartbeat
NodeHeartbeatResponse lastNodeHeartbeatResponse = rmNode.getLastNodeHeartBeatResponse();
if (remoteNodeStatus.getResponseId() + 1 == lastNodeHeartbeatResponse.getResponseId()) {
LOG.info("Received duplicate heartbeat from node " + rmNode.getNodeAddress() + " responseId=" + remoteNodeStatus.getResponseId());
return lastNodeHeartbeatResponse;
} else if (remoteNodeStatus.getResponseId() + 1 < lastNodeHeartbeatResponse.getResponseId()) {
String message = "Too far behind rm response id:" + lastNodeHeartbeatResponse.getResponseId() + " nm response id:" + remoteNodeStatus.getResponseId();
LOG.info(message);
// TODO: Just sending reboot is not enough. Think more.
this.rmContext.getDispatcher().getEventHandler().handle(new RMNodeEvent(nodeId, RMNodeEventType.REBOOTING));
return YarnServerBuilderUtils.newNodeHeartbeatResponse(NodeAction.RESYNC, message);
}
boolean timelineV2Enabled = YarnConfiguration.timelineServiceV2Enabled(getConfig());
if (timelineV2Enabled) {
// Check & update collectors info from request.
// TODO make sure it won't have race condition issue for AM failed over
// case that the older registration could possible override the newer
// one.
updateAppCollectorsMap(request);
}
// Evaluate whether a DECOMMISSIONING node is ready to be DECOMMISSIONED.
if (rmNode.getState() == NodeState.DECOMMISSIONING && decommissioningWatcher.checkReadyToBeDecommissioned(rmNode.getNodeID())) {
String message = "DECOMMISSIONING " + nodeId + " is ready to be decommissioned";
LOG.info(message);
this.rmContext.getDispatcher().getEventHandler().handle(new RMNodeEvent(nodeId, RMNodeEventType.DECOMMISSION));
this.nmLivelinessMonitor.unregister(nodeId);
return YarnServerBuilderUtils.newNodeHeartbeatResponse(NodeAction.SHUTDOWN, message);
}
// Heartbeat response
NodeHeartbeatResponse nodeHeartBeatResponse = YarnServerBuilderUtils.newNodeHeartbeatResponse(lastNodeHeartbeatResponse.getResponseId() + 1, NodeAction.NORMAL, null, null, null, null, nextHeartBeatInterval);
rmNode.updateNodeHeartbeatResponseForCleanup(nodeHeartBeatResponse);
rmNode.updateNodeHeartbeatResponseForContainersDecreasing(nodeHeartBeatResponse);
populateKeys(request, nodeHeartBeatResponse);
ConcurrentMap<ApplicationId, ByteBuffer> systemCredentials = rmContext.getSystemCredentialsForApps();
if (!systemCredentials.isEmpty()) {
nodeHeartBeatResponse.setSystemCredentialsForApps(systemCredentials);
}
if (timelineV2Enabled) {
// Return collectors' map that NM needs to know
setAppCollectorsMapToResponse(rmNode.getRunningApps(), nodeHeartBeatResponse);
}
// 4. Send status to RMNode, saving the latest response.
RMNodeStatusEvent nodeStatusEvent = new RMNodeStatusEvent(nodeId, remoteNodeStatus, nodeHeartBeatResponse);
if (request.getLogAggregationReportsForApps() != null && !request.getLogAggregationReportsForApps().isEmpty()) {
nodeStatusEvent.setLogAggregationReportsForApps(request.getLogAggregationReportsForApps());
}
this.rmContext.getDispatcher().getEventHandler().handle(nodeStatusEvent);
// 5. Update node's labels to RM's NodeLabelManager.
if (isDistributedNodeLabelsConf && request.getNodeLabels() != null) {
try {
updateNodeLabelsFromNMReport(NodeLabelsUtils.convertToStringSet(request.getNodeLabels()), nodeId);
nodeHeartBeatResponse.setAreNodeLabelsAcceptedByRM(true);
} catch (IOException ex) {
//ensure the error message is captured and sent across in response
nodeHeartBeatResponse.setDiagnosticsMessage(ex.getMessage());
nodeHeartBeatResponse.setAreNodeLabelsAcceptedByRM(false);
}
}
// 6. check if node's capacity is load from dynamic-resources.xml
// if so, send updated resource back to NM.
String nid = nodeId.toString();
Resource capability = loadNodeResourceFromDRConfiguration(nid);
// sync back with new resource if not null.
if (capability != null) {
nodeHeartBeatResponse.setResource(capability);
}
// the node to truncate the number of Containers queued for execution.
if (this.rmContext.getNodeManagerQueueLimitCalculator() != null) {
nodeHeartBeatResponse.setContainerQueuingLimit(this.rmContext.getNodeManagerQueueLimitCalculator().createContainerQueuingLimit());
}
return nodeHeartBeatResponse;
}
Aggregations