use of org.apache.hadoop.yarn.api.records.LocalResource in project hadoop by apache.
the class TestContainerManagerRecovery method testNMRecoveryForAppFinishedWithLogAggregationFailure.
/**
 * Exercises NM recovery for an application that has finished but whose log
 * handling failed.
 *
 * <p>Flow: enable supervised recovery, start a container so the application
 * is registered, drive the application to {@code FINISHED}, restart the
 * container manager against the same state store and check the application
 * is still recovered, then deliver {@code APPLICATION_LOG_HANDLING_FAILED}
 * and restart once more, expecting the application to be gone.
 *
 * @throws Exception if any step of the set-up or recovery fails
 */
@Test
public void testNMRecoveryForAppFinishedWithLogAggregationFailure() throws Exception {
  conf.setBoolean(YarnConfiguration.NM_RECOVERY_ENABLED, true);
  conf.setBoolean(YarnConfiguration.NM_RECOVERY_SUPERVISED, true);
  NMStateStoreService stateStore = new NMMemoryStateStoreService();
  stateStore.init(conf);
  stateStore.start();
  Context context = createContext(conf, stateStore);
  ContainerManagerImpl cm = createContainerManager(context);
  cm.init(conf);
  cm.start();

  // add an application by starting a container
  ApplicationId appId = ApplicationId.newInstance(0, 1);
  ApplicationAttemptId attemptId = ApplicationAttemptId.newInstance(appId, 1);
  ContainerId cid = ContainerId.newContainerId(attemptId, 1);
  Map<String, LocalResource> localResources = Collections.emptyMap();
  Map<String, String> containerEnv = Collections.emptyMap();
  List<String> containerCmds = Collections.emptyList();
  Map<String, ByteBuffer> serviceData = Collections.emptyMap();
  ContainerLaunchContext clc = ContainerLaunchContext.newInstance(localResources, containerEnv, containerCmds, serviceData, null, null);
  StartContainersResponse startResponse = startContainer(context, cm, cid, clc, null);
  assertTrue(startResponse.getFailedRequests().isEmpty());
  assertEquals(1, context.getApplications().size());
  Application app = context.getApplications().get(appId);
  assertNotNull(app);
  waitForAppState(app, ApplicationState.INITING);

  // simulate application completion
  app.handle(new ApplicationFinishEvent(appId, "Application killed by ResourceManager"));
  waitForAppState(app, ApplicationState.APPLICATION_RESOURCES_CLEANINGUP);
  app.handle(new ApplicationEvent(app.getAppId(), ApplicationEventType.APPLICATION_RESOURCES_CLEANEDUP));
  // expected value goes first so a failure message reads correctly
  assertEquals(ApplicationState.FINISHED, app.getApplicationState());
  // application is still in NM context.
  assertEquals(1, context.getApplications().size());

  // restart and verify app is still there and marked as finished.
  cm.stop();
  context = createContext(conf, stateStore);
  cm = createContainerManager(context);
  cm.init(conf);
  cm.start();
  assertEquals(1, context.getApplications().size());
  app = context.getApplications().get(appId);
  assertNotNull(app);

  // no longer saving FINISH_APP event in NM stateStore,
  // simulate by resending FINISH_APP event
  app.handle(new ApplicationFinishEvent(appId, "Application killed by ResourceManager"));
  waitForAppState(app, ApplicationState.APPLICATION_RESOURCES_CLEANINGUP);
  // TODO need to figure out why additional APPLICATION_RESOURCES_CLEANEDUP
  // is needed.
  app.handle(new ApplicationEvent(app.getAppId(), ApplicationEventType.APPLICATION_RESOURCES_CLEANEDUP));
  assertEquals(ApplicationState.FINISHED, app.getApplicationState());

  // simulate log aggregation failed.
  app.handle(new ApplicationEvent(app.getAppId(), ApplicationEventType.APPLICATION_LOG_HANDLING_FAILED));

  // restart and verify app is no longer present after recovery
  cm.stop();
  context = createContext(conf, stateStore);
  cm = createContainerManager(context);
  cm.init(conf);
  cm.start();
  assertTrue(context.getApplications().isEmpty());
  cm.stop();
}
use of org.apache.hadoop.yarn.api.records.LocalResource in project hadoop by apache.
the class TestContainerLaunch method internalKillTest.
/**
 * Launches a container running a long-lived script, stops it through the
 * container manager and checks how it was terminated.
 *
 * <p>The script records its start in {@code pid.txt}; on non-Windows
 * platforms it also traps SIGTERM and appends a marker line to that file.
 * After the stop request the container must report
 * {@code KILLED_BY_APPMASTER}. On Windows, or when {@code delayed} is false,
 * the test checks the container is no longer alive; otherwise it checks the
 * SIGTERM marker was written.
 *
 * @param delayed when true, configures a 1000 ms delay before SIGKILL;
 *     when false the delay is 0
 * @throws Exception if script creation, container start/stop or file
 *     reading fails
 */
private void internalKillTest(boolean delayed) throws Exception {
  conf.setLong(YarnConfiguration.NM_SLEEP_DELAY_BEFORE_SIGKILL_MS, delayed ? 1000 : 0);
  containerManager.start();

  // ////// Construct the Container-id
  ApplicationId appId = ApplicationId.newInstance(1, 1);
  ApplicationAttemptId appAttemptId = ApplicationAttemptId.newInstance(appId, 1);
  ContainerId cId = ContainerId.newContainerId(appAttemptId, 0);
  File processStartFile = new File(tmpDir, "pid.txt").getAbsoluteFile();

  // setup a script that can handle sigterm gracefully
  File scriptFile = Shell.appendScriptExtension(tmpDir, "testscript");
  // try-with-resources so the script file handle is released even if a
  // write fails part-way through
  try (PrintWriter writer = new PrintWriter(new FileOutputStream(scriptFile))) {
    if (Shell.WINDOWS) {
      writer.println("@echo \"Running testscript for delayed kill\"");
      writer.println("@echo \"Writing pid to start file\"");
      writer.println("@echo " + cId + "> " + processStartFile);
      writer.println("@ping -n 100 127.0.0.1 >nul");
    } else {
      writer.println("#!/bin/bash\n\n");
      writer.println("echo \"Running testscript for delayed kill\"");
      writer.println("hello=\"Got SIGTERM\"");
      writer.println("umask 0");
      writer.println("trap \"echo $hello >> " + processStartFile + "\" SIGTERM");
      writer.println("echo \"Writing pid to start file\"");
      writer.println("echo $$ >> " + processStartFile);
      writer.println("while true; do\nsleep 1s;\ndone");
    }
  }
  FileUtil.setExecutable(scriptFile, true);

  ContainerLaunchContext containerLaunchContext = recordFactory.newRecordInstance(ContainerLaunchContext.class);
  // upload the script file so that the container can run it
  URL resource_alpha = URL.fromPath(localFS.makeQualified(new Path(scriptFile.getAbsolutePath())));
  LocalResource rsrc_alpha = recordFactory.newRecordInstance(LocalResource.class);
  rsrc_alpha.setResource(resource_alpha);
  rsrc_alpha.setSize(-1);
  rsrc_alpha.setVisibility(LocalResourceVisibility.APPLICATION);
  rsrc_alpha.setType(LocalResourceType.FILE);
  rsrc_alpha.setTimestamp(scriptFile.lastModified());
  String destinationFile = "dest_file.sh";
  Map<String, LocalResource> localResources = new HashMap<String, LocalResource>();
  localResources.put(destinationFile, rsrc_alpha);
  containerLaunchContext.setLocalResources(localResources);

  // set up the rest of the container
  List<String> commands = Arrays.asList(Shell.getRunScriptCommand(scriptFile));
  containerLaunchContext.setCommands(commands);
  Priority priority = Priority.newInstance(10);
  long createTime = 1234;
  Token containerToken = createContainerToken(cId, priority, createTime);
  StartContainerRequest scRequest = StartContainerRequest.newInstance(containerLaunchContext, containerToken);
  List<StartContainerRequest> list = new ArrayList<StartContainerRequest>();
  list.add(scRequest);
  StartContainersRequest allRequests = StartContainersRequest.newInstance(list);
  containerManager.startContainers(allRequests);

  // poll for up to ~20 seconds for the script to announce itself
  int timeoutSecs = 0;
  while (!processStartFile.exists() && timeoutSecs++ < 20) {
    Thread.sleep(1000);
    LOG.info("Waiting for process start-file to be created");
  }
  Assert.assertTrue("ProcessStartFile doesn't exist!", processStartFile.exists());
  NMContainerStatus nmContainerStatus = containerManager.getContext().getContainers().get(cId).getNMContainerStatus();
  Assert.assertEquals(priority, nmContainerStatus.getPriority());

  // Now test the stop functionality.
  List<ContainerId> containerIds = new ArrayList<ContainerId>();
  containerIds.add(cId);
  StopContainersRequest stopRequest = StopContainersRequest.newInstance(containerIds);
  containerManager.stopContainers(stopRequest);
  BaseContainerManagerTest.waitForContainerState(containerManager, cId, ContainerState.COMPLETE);

  // if delayed container stop sends a sigterm followed by a sigkill
  // otherwise sigkill is sent immediately
  GetContainerStatusesRequest gcsRequest = GetContainerStatusesRequest.newInstance(containerIds);
  ContainerStatus containerStatus = containerManager.getContainerStatuses(gcsRequest).getContainerStatuses().get(0);
  Assert.assertEquals(ContainerExitStatus.KILLED_BY_APPMASTER, containerStatus.getExitStatus());

  // verify that the job object with ID matching container ID no longer exists.
  if (Shell.WINDOWS || !delayed) {
    Assert.assertFalse("Process is still alive!", DefaultContainerExecutor.containerIsAlive(cId.toString()));
  } else {
    boolean foundSigTermMessage = false;
    // the reader is closed before asserting, so a failed assertion cannot
    // leak the file handle
    try (BufferedReader reader = new BufferedReader(new FileReader(processStartFile))) {
      String line;
      while ((line = reader.readLine()) != null) {
        if (line.contains("SIGTERM")) {
          foundSigTermMessage = true;
          break;
        }
      }
    }
    Assert.assertTrue("Did not find sigterm message", foundSigTermMessage);
  }
}
use of org.apache.hadoop.yarn.api.records.LocalResource in project hadoop by apache.
the class TestContainerLaunch method testKillProcessGroup.
/**
 * Launches a container whose script forks a background child, kills the
 * parent pid with {@code kill -9} from outside the container manager, and
 * checks that the container reaches {@code COMPLETE}, that the parent is no
 * longer reported alive, and that the exit status is
 * {@code ExitCode.FORCE_KILLED}.
 *
 * <p>Skipped when {@code Shell.isSetsidAvailable} is false.
 *
 * @throws Exception if script creation, container start or pid-file
 *     reading fails
 */
@Test
public void testKillProcessGroup() throws Exception {
  Assume.assumeTrue(Shell.isSetsidAvailable);
  containerManager.start();

  // Construct the Container-id
  ApplicationId appId = ApplicationId.newInstance(2, 2);
  ApplicationAttemptId appAttemptId = ApplicationAttemptId.newInstance(appId, 1);
  ContainerId cId = ContainerId.newContainerId(appAttemptId, 0);
  File processStartFile = new File(tmpDir, "pid.txt").getAbsoluteFile();
  File childProcessStartFile = new File(tmpDir, "child_pid.txt").getAbsoluteFile();

  // setup a script that can handle sigterm gracefully
  File scriptFile = Shell.appendScriptExtension(tmpDir, "testscript");
  // try-with-resources so the script file handle is released even if a
  // write fails part-way through
  try (PrintWriter writer = new PrintWriter(new FileOutputStream(scriptFile))) {
    writer.println("#!/bin/bash\n\n");
    writer.println("echo \"Running testscript for forked process\"");
    writer.println("umask 0");
    writer.println("echo $$ >> " + processStartFile);
    writer.println("while true;\ndo sleep 1s;\ndone > /dev/null 2>&1 &");
    writer.println("echo $! >> " + childProcessStartFile);
    writer.println("while true;\ndo sleep 1s;\ndone");
  }
  FileUtil.setExecutable(scriptFile, true);

  ContainerLaunchContext containerLaunchContext = recordFactory.newRecordInstance(ContainerLaunchContext.class);
  // upload the script file so that the container can run it
  URL resource_alpha = URL.fromPath(localFS.makeQualified(new Path(scriptFile.getAbsolutePath())));
  LocalResource rsrc_alpha = recordFactory.newRecordInstance(LocalResource.class);
  rsrc_alpha.setResource(resource_alpha);
  rsrc_alpha.setSize(-1);
  rsrc_alpha.setVisibility(LocalResourceVisibility.APPLICATION);
  rsrc_alpha.setType(LocalResourceType.FILE);
  rsrc_alpha.setTimestamp(scriptFile.lastModified());
  String destinationFile = "dest_file.sh";
  Map<String, LocalResource> localResources = new HashMap<String, LocalResource>();
  localResources.put(destinationFile, rsrc_alpha);
  containerLaunchContext.setLocalResources(localResources);

  // set up the rest of the container
  List<String> commands = Arrays.asList(Shell.getRunScriptCommand(scriptFile));
  containerLaunchContext.setCommands(commands);
  Priority priority = Priority.newInstance(10);
  long createTime = 1234;
  Token containerToken = createContainerToken(cId, priority, createTime);
  StartContainerRequest scRequest = StartContainerRequest.newInstance(containerLaunchContext, containerToken);
  List<StartContainerRequest> list = new ArrayList<StartContainerRequest>();
  list.add(scRequest);
  StartContainersRequest allRequests = StartContainersRequest.newInstance(list);
  containerManager.startContainers(allRequests);

  // The script writes child_pid.txt after pid.txt, so wait for both files
  // before reading either; polling only pid.txt leaves a window in which
  // the child file does not exist yet.
  int timeoutSecs = 0;
  while ((!processStartFile.exists() || !childProcessStartFile.exists()) && timeoutSecs++ < 20) {
    Thread.sleep(1000);
    LOG.info("Waiting for process start-file to be created");
  }
  Assert.assertTrue("ProcessStartFile doesn't exist!", processStartFile.exists());
  Assert.assertTrue("ChildProcessStartFile doesn't exist!", childProcessStartFile.exists());

  // Get the pid of the process
  String pid;
  try (BufferedReader reader = new BufferedReader(new FileReader(processStartFile))) {
    String firstLine = reader.readLine();
    Assert.assertNotNull("ProcessStartFile is empty!", firstLine);
    pid = firstLine.trim();
    // No more lines
    Assert.assertNull(reader.readLine());
  }

  // Get the pid of the child process
  String child;
  try (BufferedReader reader = new BufferedReader(new FileReader(childProcessStartFile))) {
    String firstLine = reader.readLine();
    Assert.assertNotNull("ChildProcessStartFile is empty!", firstLine);
    child = firstLine.trim();
    // No more lines
    Assert.assertNull(reader.readLine());
  }

  LOG.info("Manually killing pid " + pid + ", but not child pid " + child);
  Shell.execCommand(new String[] { "kill", "-9", pid });
  BaseContainerManagerTest.waitForContainerState(containerManager, cId, ContainerState.COMPLETE);
  Assert.assertFalse("Process is still alive!", DefaultContainerExecutor.containerIsAlive(pid));

  List<ContainerId> containerIds = new ArrayList<ContainerId>();
  containerIds.add(cId);
  GetContainerStatusesRequest gcsRequest = GetContainerStatusesRequest.newInstance(containerIds);
  ContainerStatus containerStatus = containerManager.getContainerStatuses(gcsRequest).getContainerStatuses().get(0);
  Assert.assertEquals(ExitCode.FORCE_KILLED.getExitCode(), containerStatus.getExitStatus());
}
use of org.apache.hadoop.yarn.api.records.LocalResource in project hadoop by apache.
the class TestContainerManager method testContainerSetup.
/**
 * Starts a container with a single FILE local resource and checks that the
 * expected directory layout exists under {@code localDir} and that the
 * resource is present in the container directory as {@code dest_file}
 * with the original contents.
 *
 * @throws Exception if resource creation, container start or file
 *     verification fails
 */
@Test
public void testContainerSetup() throws Exception {
  containerManager.start();

  // ////// Create the resources for the container
  File dir = new File(tmpDir, "dir");
  dir.mkdirs();
  File file = new File(dir, "file");
  // try-with-resources so the handle is released even if the write fails
  try (PrintWriter fileWriter = new PrintWriter(file)) {
    fileWriter.write("Hello World!");
  }

  // ////// Construct the Container-id
  ContainerId cId = createContainerId(0);

  // ////// Construct the container-spec.
  ContainerLaunchContext containerLaunchContext = recordFactory.newRecordInstance(ContainerLaunchContext.class);
  URL resource_alpha = URL.fromPath(localFS.makeQualified(new Path(file.getAbsolutePath())));
  LocalResource rsrc_alpha = recordFactory.newRecordInstance(LocalResource.class);
  rsrc_alpha.setResource(resource_alpha);
  rsrc_alpha.setSize(-1);
  rsrc_alpha.setVisibility(LocalResourceVisibility.APPLICATION);
  rsrc_alpha.setType(LocalResourceType.FILE);
  rsrc_alpha.setTimestamp(file.lastModified());
  String destinationFile = "dest_file";
  Map<String, LocalResource> localResources = new HashMap<String, LocalResource>();
  localResources.put(destinationFile, rsrc_alpha);
  containerLaunchContext.setLocalResources(localResources);
  StartContainerRequest scRequest = StartContainerRequest.newInstance(containerLaunchContext, createContainerToken(cId, DUMMY_RM_IDENTIFIER, context.getNodeId(), user, context.getContainerTokenSecretManager()));
  List<StartContainerRequest> list = new ArrayList<>();
  list.add(scRequest);
  StartContainersRequest allRequests = StartContainersRequest.newInstance(list);
  containerManager.startContainers(allRequests);
  BaseContainerManagerTest.waitForContainerState(containerManager, cId, ContainerState.COMPLETE, 40);

  // Now ascertain that the resources are localised correctly.
  ApplicationId appId = cId.getApplicationAttemptId().getApplicationId();
  String appIDStr = appId.toString();
  String containerIDStr = cId.toString();
  File userCacheDir = new File(localDir, ContainerLocalizer.USERCACHE);
  File userDir = new File(userCacheDir, user);
  File appCache = new File(userDir, ContainerLocalizer.APPCACHE);
  File appDir = new File(appCache, appIDStr);
  File containerDir = new File(appDir, containerIDStr);
  File targetFile = new File(containerDir, destinationFile);
  File sysDir = new File(localDir, ResourceLocalizationService.NM_PRIVATE_DIR);
  File appSysDir = new File(sysDir, appIDStr);
  File containerSysDir = new File(appSysDir, containerIDStr);
  for (File f : new File[] { localDir, sysDir, userCacheDir, appDir, appSysDir, containerDir, containerSysDir }) {
    Assert.assertTrue(f.getAbsolutePath() + " doesn't exist!!", f.exists());
    Assert.assertTrue(f.getAbsolutePath() + " is not a directory!!", f.isDirectory());
  }
  Assert.assertTrue(targetFile.getAbsolutePath() + " doesn't exist!!", targetFile.exists());

  // Now verify the contents of the file; the reader is closed even when an
  // assertion fails
  try (BufferedReader reader = new BufferedReader(new FileReader(targetFile))) {
    Assert.assertEquals("Hello World!", reader.readLine());
    Assert.assertNull(reader.readLine());
  }
}
use of org.apache.hadoop.yarn.api.records.LocalResource in project hadoop by apache.
the class TestContainerManager method testContainerLaunchFromPreviousRM.
/**
 * Checks how the container manager treats the RM identifier carried in a
 * container token.
 *
 * <p>A start request whose token carries
 * {@code ResourceManagerConstants.RM_INVALID_IDENTIFIER} must be reported
 * as a failed request that deserializes to an
 * {@code InvalidContainerException} with the "allocated by a previous RM"
 * message. A start request whose token carries {@code DUMMY_RM_IDENTIFIER}
 * must not cause {@code startContainers} to throw a {@code YarnException}.
 *
 * @throws IOException declared by the container manager calls
 * @throws InterruptedException declared by the test fixture calls
 * @throws YarnException if a start call outside the guarded sections fails
 */
@Test
public void testContainerLaunchFromPreviousRM() throws IOException, InterruptedException, YarnException {
  containerManager.start();
  ContainerLaunchContext containerLaunchContext = recordFactory.newRecordInstance(ContainerLaunchContext.class);
  ContainerId cId1 = createContainerId(0);
  ContainerId cId2 = createContainerId(0);
  containerLaunchContext.setLocalResources(new HashMap<String, LocalResource>());

  // Construct the Container with Invalid RMIdentifier
  StartContainerRequest startRequest1 = StartContainerRequest.newInstance(containerLaunchContext, createContainerToken(cId1, ResourceManagerConstants.RM_INVALID_IDENTIFIER, context.getNodeId(), user, context.getContainerTokenSecretManager()));
  List<StartContainerRequest> list = new ArrayList<>();
  list.add(startRequest1);
  StartContainersRequest allRequests = StartContainersRequest.newInstance(list);
  containerManager.startContainers(allRequests);
  boolean catchException = false;
  try {
    StartContainersResponse response = containerManager.startContainers(allRequests);
    if (response.getFailedRequests().containsKey(cId1)) {
      // surface the per-request failure as a real exception so it can be
      // inspected below
      throw response.getFailedRequests().get(cId1).deSerialize();
    }
  } catch (Throwable e) {
    e.printStackTrace();
    catchException = true;
    Assert.assertTrue(e.getMessage().contains("Container " + cId1 + " rejected as it is allocated by a previous RM"));
    Assert.assertTrue(e.getClass().getName().equalsIgnoreCase(InvalidContainerException.class.getName()));
  }
  // Verify that startContainer fail because of invalid container request
  Assert.assertTrue(catchException);

  // Construct the Container with a RMIdentifier within current RM
  StartContainerRequest startRequest2 = StartContainerRequest.newInstance(containerLaunchContext, createContainerToken(cId2, DUMMY_RM_IDENTIFIER, context.getNodeId(), user, context.getContainerTokenSecretManager()));
  List<StartContainerRequest> list2 = new ArrayList<>();
  // The request must go into list2; adding it to the first list left
  // allRequests2 empty, so the valid-identifier path was never exercised.
  list2.add(startRequest2);
  StartContainersRequest allRequests2 = StartContainersRequest.newInstance(list2);
  containerManager.startContainers(allRequests2);
  boolean noException = true;
  try {
    containerManager.startContainers(allRequests2);
  } catch (YarnException e) {
    noException = false;
  }
  // Verify that startContainer get no YarnException
  Assert.assertTrue(noException);
}
Aggregations