use of org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus in project hadoop by apache.
the class NodeStatusUpdaterImpl method registerWithRM.
@VisibleForTesting
protected void registerWithRM() throws YarnException, IOException {
RegisterNodeManagerResponse regNMResponse;
Set<NodeLabel> nodeLabels = nodeLabelsHandler.getNodeLabelsForRegistration();
// during RM recovery
synchronized (this.context) {
List<NMContainerStatus> containerReports = getNMContainerStatuses();
RegisterNodeManagerRequest request = RegisterNodeManagerRequest.newInstance(nodeId, httpPort, totalResource, nodeManagerVersionId, containerReports, getRunningApplications(), nodeLabels, physicalResource);
if (containerReports != null) {
LOG.info("Registering with RM using containers :" + containerReports);
}
regNMResponse = resourceTracker.registerNodeManager(request);
// Make sure rmIdentifier is set before we release the lock
this.rmIdentifier = regNMResponse.getRMIdentifier();
}
// if the Resource Manager instructs NM to shutdown.
if (NodeAction.SHUTDOWN.equals(regNMResponse.getNodeAction())) {
String message = "Message from ResourceManager: " + regNMResponse.getDiagnosticsMessage();
throw new YarnRuntimeException("Recieved SHUTDOWN signal from Resourcemanager, Registration of NodeManager failed, " + message);
}
// if ResourceManager version is too old then shutdown
if (!minimumResourceManagerVersion.equals("NONE")) {
if (minimumResourceManagerVersion.equals("EqualToNM")) {
minimumResourceManagerVersion = nodeManagerVersionId;
}
String rmVersion = regNMResponse.getRMVersion();
if (rmVersion == null) {
String message = "The Resource Manager's did not return a version. " + "Valid version cannot be checked.";
throw new YarnRuntimeException("Shutting down the Node Manager. " + message);
}
if (VersionUtil.compareVersions(rmVersion, minimumResourceManagerVersion) < 0) {
String message = "The Resource Manager's version (" + rmVersion + ") is less than the minimum " + "allowed version " + minimumResourceManagerVersion;
throw new YarnRuntimeException("Shutting down the Node Manager on RM " + "version error, " + message);
}
}
this.registeredWithRM = true;
MasterKey masterKey = regNMResponse.getContainerTokenMasterKey();
// StatusUpdater#start().
if (masterKey != null) {
this.context.getContainerTokenSecretManager().setMasterKey(masterKey);
}
masterKey = regNMResponse.getNMTokenMasterKey();
if (masterKey != null) {
this.context.getNMTokenSecretManager().setMasterKey(masterKey);
}
StringBuilder successfullRegistrationMsg = new StringBuilder();
successfullRegistrationMsg.append("Registered with ResourceManager as ").append(this.nodeId);
Resource newResource = regNMResponse.getResource();
if (newResource != null) {
updateNMResource(newResource);
successfullRegistrationMsg.append(" with updated total resource of ").append(this.totalResource);
} else {
successfullRegistrationMsg.append(" with total resource of ").append(this.totalResource);
}
successfullRegistrationMsg.append(nodeLabelsHandler.verifyRMRegistrationResponseForNodeLabels(regNMResponse));
LOG.info(successfullRegistrationMsg);
LOG.info("Notifying ContainerManager to unblock new container-requests");
this.context.getContainerManager().setBlockNewContainerRequests(false);
}
use of org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus in project hadoop by apache.
the class TestContainerLaunch method internalKillTest.
private void internalKillTest(boolean delayed) throws Exception {
conf.setLong(YarnConfiguration.NM_SLEEP_DELAY_BEFORE_SIGKILL_MS, delayed ? 1000 : 0);
containerManager.start();
// ////// Construct the Container-id
ApplicationId appId = ApplicationId.newInstance(1, 1);
ApplicationAttemptId appAttemptId = ApplicationAttemptId.newInstance(appId, 1);
ContainerId cId = ContainerId.newContainerId(appAttemptId, 0);
File processStartFile = new File(tmpDir, "pid.txt").getAbsoluteFile();
// setup a script that can handle sigterm gracefully
File scriptFile = Shell.appendScriptExtension(tmpDir, "testscript");
PrintWriter writer = new PrintWriter(new FileOutputStream(scriptFile));
if (Shell.WINDOWS) {
writer.println("@echo \"Running testscript for delayed kill\"");
writer.println("@echo \"Writing pid to start file\"");
writer.println("@echo " + cId + "> " + processStartFile);
writer.println("@ping -n 100 127.0.0.1 >nul");
} else {
writer.println("#!/bin/bash\n\n");
writer.println("echo \"Running testscript for delayed kill\"");
writer.println("hello=\"Got SIGTERM\"");
writer.println("umask 0");
writer.println("trap \"echo $hello >> " + processStartFile + "\" SIGTERM");
writer.println("echo \"Writing pid to start file\"");
writer.println("echo $$ >> " + processStartFile);
writer.println("while true; do\nsleep 1s;\ndone");
}
writer.close();
FileUtil.setExecutable(scriptFile, true);
ContainerLaunchContext containerLaunchContext = recordFactory.newRecordInstance(ContainerLaunchContext.class);
// upload the script file so that the container can run it
URL resource_alpha = URL.fromPath(localFS.makeQualified(new Path(scriptFile.getAbsolutePath())));
LocalResource rsrc_alpha = recordFactory.newRecordInstance(LocalResource.class);
rsrc_alpha.setResource(resource_alpha);
rsrc_alpha.setSize(-1);
rsrc_alpha.setVisibility(LocalResourceVisibility.APPLICATION);
rsrc_alpha.setType(LocalResourceType.FILE);
rsrc_alpha.setTimestamp(scriptFile.lastModified());
String destinationFile = "dest_file.sh";
Map<String, LocalResource> localResources = new HashMap<String, LocalResource>();
localResources.put(destinationFile, rsrc_alpha);
containerLaunchContext.setLocalResources(localResources);
// set up the rest of the container
List<String> commands = Arrays.asList(Shell.getRunScriptCommand(scriptFile));
containerLaunchContext.setCommands(commands);
Priority priority = Priority.newInstance(10);
long createTime = 1234;
Token containerToken = createContainerToken(cId, priority, createTime);
StartContainerRequest scRequest = StartContainerRequest.newInstance(containerLaunchContext, containerToken);
List<StartContainerRequest> list = new ArrayList<StartContainerRequest>();
list.add(scRequest);
StartContainersRequest allRequests = StartContainersRequest.newInstance(list);
containerManager.startContainers(allRequests);
int timeoutSecs = 0;
while (!processStartFile.exists() && timeoutSecs++ < 20) {
Thread.sleep(1000);
LOG.info("Waiting for process start-file to be created");
}
Assert.assertTrue("ProcessStartFile doesn't exist!", processStartFile.exists());
NMContainerStatus nmContainerStatus = containerManager.getContext().getContainers().get(cId).getNMContainerStatus();
Assert.assertEquals(priority, nmContainerStatus.getPriority());
// Now test the stop functionality.
List<ContainerId> containerIds = new ArrayList<ContainerId>();
containerIds.add(cId);
StopContainersRequest stopRequest = StopContainersRequest.newInstance(containerIds);
containerManager.stopContainers(stopRequest);
BaseContainerManagerTest.waitForContainerState(containerManager, cId, ContainerState.COMPLETE);
// if delayed container stop sends a sigterm followed by a sigkill
// otherwise sigkill is sent immediately
GetContainerStatusesRequest gcsRequest = GetContainerStatusesRequest.newInstance(containerIds);
ContainerStatus containerStatus = containerManager.getContainerStatuses(gcsRequest).getContainerStatuses().get(0);
Assert.assertEquals(ContainerExitStatus.KILLED_BY_APPMASTER, containerStatus.getExitStatus());
// verify that the job object with ID matching container ID no longer exists.
if (Shell.WINDOWS || !delayed) {
Assert.assertFalse("Process is still alive!", DefaultContainerExecutor.containerIsAlive(cId.toString()));
} else {
BufferedReader reader = new BufferedReader(new FileReader(processStartFile));
boolean foundSigTermMessage = false;
while (true) {
String line = reader.readLine();
if (line == null) {
break;
}
if (line.contains("SIGTERM")) {
foundSigTermMessage = true;
break;
}
}
Assert.assertTrue("Did not find sigterm message", foundSigTermMessage);
reader.close();
}
}
use of org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus in project hadoop by apache.
the class ResourceTrackerService method handleNMContainerStatus.
/**
* Helper method to handle received ContainerStatus. If this corresponds to
* the completion of a master-container of a managed AM,
* we call the handler for RMAppAttemptContainerFinishedEvent.
*/
@SuppressWarnings("unchecked")
@VisibleForTesting
void handleNMContainerStatus(NMContainerStatus containerStatus, NodeId nodeId) {
ApplicationAttemptId appAttemptId = containerStatus.getContainerId().getApplicationAttemptId();
RMApp rmApp = rmContext.getRMApps().get(appAttemptId.getApplicationId());
if (rmApp == null) {
LOG.error("Received finished container : " + containerStatus.getContainerId() + " for unknown application " + appAttemptId.getApplicationId() + " Skipping.");
return;
}
if (rmApp.getApplicationSubmissionContext().getUnmanagedAM()) {
if (LOG.isDebugEnabled()) {
LOG.debug("Ignoring container completion status for unmanaged AM " + rmApp.getApplicationId());
}
return;
}
RMAppAttempt rmAppAttempt = rmApp.getRMAppAttempt(appAttemptId);
if (rmAppAttempt == null) {
LOG.info("Ignoring not found attempt " + appAttemptId);
return;
}
Container masterContainer = rmAppAttempt.getMasterContainer();
if (masterContainer.getId().equals(containerStatus.getContainerId()) && containerStatus.getContainerState() == ContainerState.COMPLETE) {
ContainerStatus status = ContainerStatus.newInstance(containerStatus.getContainerId(), containerStatus.getContainerState(), containerStatus.getDiagnostics(), containerStatus.getContainerExitStatus());
// sending master container finished event.
RMAppAttemptContainerFinishedEvent evt = new RMAppAttemptContainerFinishedEvent(appAttemptId, status, nodeId);
rmContext.getDispatcher().getEventHandler().handle(evt);
}
}
use of org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus in project hadoop by apache.
the class TestAMRMClientOnRMRestart method testAMRMClientResendsRequestsOnRMRestart.
// Test does major 6 steps verification.
// Step-1 : AMRMClient send allocate request for 3 container requests
// Step-2 : 3 containers are allocated by RM.
// Step-3 : AM Send 1 containerRequest(cRequest4) and 1 releaseRequests to
// RM
// Step-3.5 : AM Send 1 container resource increase request to RM
// Step-4 : On RM restart, AM(does not know RM is restarted) sends additional
// containerRequest(cRequest5) and blacklisted nodes.
// Intern RM send resync command
// Verify AM can recover increase request after resync
// Step-5 : Allocater after resync command & new containerRequest(cRequest6)
// Step-6 : RM allocates containers i.e cRequest4,cRequest5 and cRequest6
@Test(timeout = 60000)
public void testAMRMClientResendsRequestsOnRMRestart() throws Exception {
UserGroupInformation.setLoginUser(null);
MemoryRMStateStore memStore = new MemoryRMStateStore();
memStore.init(conf);
// Phase-1 Start 1st RM
MyResourceManager rm1 = new MyResourceManager(conf, memStore);
rm1.start();
DrainDispatcher dispatcher = (DrainDispatcher) rm1.getRMContext().getDispatcher();
// Submit the application
RMApp app = rm1.submitApp(1024);
dispatcher.await();
MockNM nm1 = new MockNM("h1:1234", 15120, rm1.getResourceTrackerService());
nm1.registerNode();
// Node heartbeat
nm1.nodeHeartbeat(true);
dispatcher.await();
ApplicationAttemptId appAttemptId = app.getCurrentAppAttempt().getAppAttemptId();
rm1.sendAMLaunched(appAttemptId);
dispatcher.await();
org.apache.hadoop.security.token.Token<AMRMTokenIdentifier> token = rm1.getRMContext().getRMApps().get(appAttemptId.getApplicationId()).getRMAppAttempt(appAttemptId).getAMRMToken();
UserGroupInformation ugi = UserGroupInformation.getCurrentUser();
ugi.addTokenIdentifier(token.decodeIdentifier());
// Step-1 : AMRMClient send allocate request for 3 ContainerRequest
// cRequest1 = h1, cRequest2 = h1,h2 and cRequest3 = h1
// blacklisted nodes = h2
AMRMClient<ContainerRequest> amClient = new MyAMRMClientImpl(rm1);
amClient.init(conf);
amClient.start();
amClient.registerApplicationMaster("Host", 10000, "");
ContainerRequest cRequest1 = createReq(1, 1024, new String[] { "h1" });
amClient.addContainerRequest(cRequest1);
ContainerRequest cRequest2 = createReq(1, 1024, new String[] { "h1", "h2" });
amClient.addContainerRequest(cRequest2);
ContainerRequest cRequest3 = createReq(1, 1024, new String[] { "h1" });
amClient.addContainerRequest(cRequest3);
List<String> blacklistAdditions = new ArrayList<String>();
List<String> blacklistRemoval = new ArrayList<String>();
blacklistAdditions.add("h2");
blacklistRemoval.add("h10");
amClient.updateBlacklist(blacklistAdditions, blacklistRemoval);
// remove from local list
blacklistAdditions.remove("h2");
AllocateResponse allocateResponse = amClient.allocate(0.1f);
dispatcher.await();
Assert.assertEquals("No of assignments must be 0", 0, allocateResponse.getAllocatedContainers().size());
// Why 4 ask, why not 3 ask even h2 is blacklisted?
// On blacklisting host,applicationmaster has to remove ask request from
// remoterequest table.Here,test does not remove explicitely
assertAsksAndReleases(4, 0, rm1);
assertBlacklistAdditionsAndRemovals(1, 1, rm1);
// Step-2 : NM heart beat is sent.
// On 2nd AM allocate request, RM allocates 3 containers to AM
// Node heartbeat
nm1.nodeHeartbeat(true);
dispatcher.await();
allocateResponse = amClient.allocate(0.2f);
dispatcher.await();
// 3 containers are allocated i.e for cRequest1, cRequest2 and cRequest3.
Assert.assertEquals("No of assignments must be 0", 3, allocateResponse.getAllocatedContainers().size());
assertAsksAndReleases(0, 0, rm1);
assertBlacklistAdditionsAndRemovals(0, 0, rm1);
List<Container> allocatedContainers = allocateResponse.getAllocatedContainers();
// removed allocated container requests
amClient.removeContainerRequest(cRequest1);
amClient.removeContainerRequest(cRequest2);
amClient.removeContainerRequest(cRequest3);
allocateResponse = amClient.allocate(0.2f);
dispatcher.await();
Assert.assertEquals("No of assignments must be 0", 0, allocateResponse.getAllocatedContainers().size());
assertAsksAndReleases(4, 0, rm1);
assertBlacklistAdditionsAndRemovals(0, 0, rm1);
// Step-3 : Send 1 containerRequest and 1 releaseRequests to RM
ContainerRequest cRequest4 = createReq(1, 1024, new String[] { "h1" });
amClient.addContainerRequest(cRequest4);
int pendingRelease = 0;
Iterator<Container> it = allocatedContainers.iterator();
while (it.hasNext()) {
amClient.releaseAssignedContainer(it.next().getId());
pendingRelease++;
it.remove();
// remove one container
break;
}
// Step-3.5 : Send 1 container resource increase request to RM
Container container = it.next();
ContainerId containerId = container.getId();
// Make sure that container is in RUNNING state before sending increase
// request
nm1.nodeHeartbeat(containerId.getApplicationAttemptId(), containerId.getContainerId(), ContainerState.RUNNING);
dispatcher.await();
amClient.requestContainerUpdate(container, UpdateContainerRequest.newInstance(container.getVersion(), container.getId(), ContainerUpdateType.INCREASE_RESOURCE, Resource.newInstance(2048, 1), null));
it.remove();
allocateResponse = amClient.allocate(0.3f);
dispatcher.await();
Assert.assertEquals("No of assignments must be 0", 0, allocateResponse.getAllocatedContainers().size());
assertAsksAndReleases(3, pendingRelease, rm1);
// Verify there is one increase and zero decrease
assertChanges(1, 0, rm1);
assertBlacklistAdditionsAndRemovals(0, 0, rm1);
int completedContainer = allocateResponse.getCompletedContainersStatuses().size();
pendingRelease -= completedContainer;
// Phase-2 start 2nd RM is up
MyResourceManager rm2 = new MyResourceManager(conf, memStore);
rm2.start();
nm1.setResourceTrackerService(rm2.getResourceTrackerService());
((MyAMRMClientImpl) amClient).updateRMProxy(rm2);
dispatcher = (DrainDispatcher) rm2.getRMContext().getDispatcher();
// NM should be rebooted on heartbeat, even first heartbeat for nm2
NodeHeartbeatResponse hbResponse = nm1.nodeHeartbeat(true);
Assert.assertEquals(NodeAction.RESYNC, hbResponse.getNodeAction());
// new NM to represent NM re-register
nm1 = new MockNM("h1:1234", 10240, rm2.getResourceTrackerService());
NMContainerStatus containerReport = NMContainerStatus.newInstance(containerId, 0, ContainerState.RUNNING, Resource.newInstance(1024, 1), "recover container", 0, Priority.newInstance(0), 0);
nm1.registerNode(Collections.singletonList(containerReport), Collections.singletonList(containerId.getApplicationAttemptId().getApplicationId()));
nm1.nodeHeartbeat(true);
dispatcher.await();
blacklistAdditions.add("h3");
amClient.updateBlacklist(blacklistAdditions, null);
blacklistAdditions.remove("h3");
it = allocatedContainers.iterator();
while (it.hasNext()) {
amClient.releaseAssignedContainer(it.next().getId());
pendingRelease++;
it.remove();
}
ContainerRequest cRequest5 = createReq(1, 1024, new String[] { "h1", "h2" });
amClient.addContainerRequest(cRequest5);
// Step-4 : On RM restart, AM(does not know RM is restarted) sends
// additional
// containerRequest and blacklisted nodes.
// Intern RM send resync command,AMRMClient resend allocate request
allocateResponse = amClient.allocate(0.3f);
dispatcher.await();
completedContainer = allocateResponse.getCompletedContainersStatuses().size();
pendingRelease -= completedContainer;
assertAsksAndReleases(4, pendingRelease, rm2);
// Verify there is one increase and zero decrease
assertChanges(1, 0, rm2);
assertBlacklistAdditionsAndRemovals(2, 0, rm2);
ContainerRequest cRequest6 = createReq(1, 1024, new String[] { "h1", "h2", "h3" });
amClient.addContainerRequest(cRequest6);
// Step-5 : Allocater after resync command
allocateResponse = amClient.allocate(0.5f);
dispatcher.await();
Assert.assertEquals("No of assignments must be 0", 0, allocateResponse.getAllocatedContainers().size());
assertAsksAndReleases(5, 0, rm2);
// Verify there is no increase or decrease requests any more
assertChanges(0, 0, rm2);
assertBlacklistAdditionsAndRemovals(0, 0, rm2);
int noAssignedContainer = 0;
int count = 5;
while (count-- > 0) {
nm1.nodeHeartbeat(true);
dispatcher.await();
allocateResponse = amClient.allocate(0.5f);
dispatcher.await();
noAssignedContainer += allocateResponse.getAllocatedContainers().size();
if (noAssignedContainer == 3) {
break;
}
Thread.sleep(1000);
}
// Step-6 : RM allocates containers i.e cRequest4,cRequest5 and cRequest6
Assert.assertEquals("Number of container should be 3", 3, noAssignedContainer);
amClient.stop();
rm1.stop();
rm2.stop();
}
Aggregations