use of org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerDiagnosticsUpdateEvent in project hadoop by apache.
the class TestDefaultContainerExecutor method testContainerLaunchError.
@Test
public void testContainerLaunchError() throws IOException, InterruptedException {
if (Shell.WINDOWS) {
BASE_TMP_PATH = new Path(new File("target").getAbsolutePath(), TestDefaultContainerExecutor.class.getSimpleName());
}
Path localDir = new Path(BASE_TMP_PATH, "localDir");
List<String> localDirs = new ArrayList<String>();
localDirs.add(localDir.toString());
List<String> logDirs = new ArrayList<String>();
Path logDir = new Path(BASE_TMP_PATH, "logDir");
logDirs.add(logDir.toString());
Configuration conf = new Configuration();
conf.set(CommonConfigurationKeys.FS_PERMISSIONS_UMASK_KEY, "077");
conf.set(YarnConfiguration.NM_LOCAL_DIRS, localDir.toString());
conf.set(YarnConfiguration.NM_LOG_DIRS, logDir.toString());
FileContext lfs = FileContext.getLocalFSFileContext(conf);
DefaultContainerExecutor mockExec = spy(new DefaultContainerExecutor(lfs));
mockExec.setConf(conf);
doAnswer(new Answer() {
@Override
public Object answer(InvocationOnMock invocationOnMock) throws Throwable {
String diagnostics = (String) invocationOnMock.getArguments()[0];
assertTrue("Invalid Diagnostics message: " + diagnostics, diagnostics.contains("No such file or directory"));
return null;
}
}).when(mockExec).logOutput(any(String.class));
String appSubmitter = "nobody";
String appId = "APP_ID";
String containerId = "CONTAINER_ID";
Container container = mock(Container.class);
ContainerId cId = mock(ContainerId.class);
ContainerLaunchContext context = mock(ContainerLaunchContext.class);
HashMap<String, String> env = new HashMap<String, String>();
env.put("LANG", "C");
when(container.getContainerId()).thenReturn(cId);
when(container.getLaunchContext()).thenReturn(context);
try {
doAnswer(new Answer() {
@Override
public Object answer(InvocationOnMock invocationOnMock) throws Throwable {
ContainerDiagnosticsUpdateEvent event = (ContainerDiagnosticsUpdateEvent) invocationOnMock.getArguments()[0];
assertTrue("Invalid Diagnostics message: " + event.getDiagnosticsUpdate(), event.getDiagnosticsUpdate().contains("No such file or directory"));
return null;
}
}).when(container).handle(any(ContainerDiagnosticsUpdateEvent.class));
when(cId.toString()).thenReturn(containerId);
when(cId.getApplicationAttemptId()).thenReturn(ApplicationAttemptId.newInstance(ApplicationId.newInstance(0, 1), 0));
when(context.getEnvironment()).thenReturn(env);
mockExec.createUserLocalDirs(localDirs, appSubmitter);
mockExec.createUserCacheDirs(localDirs, appSubmitter);
mockExec.createAppDirs(localDirs, appSubmitter, appId);
mockExec.createAppLogDirs(appId, logDirs, appSubmitter);
Path scriptPath = new Path("file:///bin/echo");
Path tokensPath = new Path("file:///dev/null");
if (Shell.WINDOWS) {
File tmp = new File(BASE_TMP_PATH.toString(), "test_echo.cmd");
BufferedWriter output = new BufferedWriter(new FileWriter(tmp));
output.write("Exit 1");
output.write("Echo No such file or directory 1>&2");
output.close();
scriptPath = new Path(tmp.getAbsolutePath());
tmp = new File(BASE_TMP_PATH.toString(), "tokens");
tmp.createNewFile();
tokensPath = new Path(tmp.getAbsolutePath());
}
Path workDir = localDir;
Path pidFile = new Path(workDir, "pid.txt");
mockExec.init();
mockExec.activateContainer(cId, pidFile);
int ret = mockExec.launchContainer(new ContainerStartContext.Builder().setContainer(container).setNmPrivateContainerScriptPath(scriptPath).setNmPrivateTokensPath(tokensPath).setUser(appSubmitter).setAppId(appId).setContainerWorkDir(workDir).setLocalDirs(localDirs).setLogDirs(logDirs).build());
Assert.assertNotSame(0, ret);
} finally {
mockExec.deleteAsUser(new DeletionAsUserContext.Builder().setUser(appSubmitter).setSubDir(localDir).build());
mockExec.deleteAsUser(new DeletionAsUserContext.Builder().setUser(appSubmitter).setSubDir(logDir).build());
}
}
use of org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerDiagnosticsUpdateEvent in project hadoop by apache.
the class LinuxContainerExecutor method launchContainer.
@Override
public int launchContainer(ContainerStartContext ctx) throws IOException {
Container container = ctx.getContainer();
Path nmPrivateContainerScriptPath = ctx.getNmPrivateContainerScriptPath();
Path nmPrivateTokensPath = ctx.getNmPrivateTokensPath();
String user = ctx.getUser();
String appId = ctx.getAppId();
Path containerWorkDir = ctx.getContainerWorkDir();
List<String> localDirs = ctx.getLocalDirs();
List<String> logDirs = ctx.getLogDirs();
List<String> filecacheDirs = ctx.getFilecacheDirs();
List<String> userLocalDirs = ctx.getUserLocalDirs();
List<String> containerLocalDirs = ctx.getContainerLocalDirs();
List<String> containerLogDirs = ctx.getContainerLogDirs();
Map<Path, List<String>> localizedResources = ctx.getLocalizedResources();
verifyUsernamePattern(user);
String runAsUser = getRunAsUser(user);
ContainerId containerId = container.getContainerId();
String containerIdStr = containerId.toString();
resourcesHandler.preExecute(containerId, container.getResource());
String resourcesOptions = resourcesHandler.getResourcesOption(containerId);
String tcCommandFile = null;
try {
if (resourceHandlerChain != null) {
List<PrivilegedOperation> ops = resourceHandlerChain.preStart(container);
if (ops != null) {
List<PrivilegedOperation> resourceOps = new ArrayList<>();
resourceOps.add(new PrivilegedOperation(PrivilegedOperation.OperationType.ADD_PID_TO_CGROUP, resourcesOptions));
for (PrivilegedOperation op : ops) {
switch(op.getOperationType()) {
case ADD_PID_TO_CGROUP:
resourceOps.add(op);
break;
case TC_MODIFY_STATE:
tcCommandFile = op.getArguments().get(0);
break;
default:
LOG.warn("PrivilegedOperation type unsupported in launch: " + op.getOperationType());
}
}
if (resourceOps.size() > 1) {
//squash resource operations
try {
PrivilegedOperation operation = PrivilegedOperationExecutor.squashCGroupOperations(resourceOps);
resourcesOptions = operation.getArguments().get(0);
} catch (PrivilegedOperationException e) {
LOG.error("Failed to squash cgroup operations!", e);
throw new ResourceHandlerException("Failed to squash cgroup operations!");
}
}
}
}
} catch (ResourceHandlerException e) {
LOG.error("ResourceHandlerChain.preStart() failed!", e);
throw new IOException("ResourceHandlerChain.preStart() failed!", e);
}
try {
Path pidFilePath = getPidFilePath(containerId);
if (pidFilePath != null) {
List<String> prefixCommands = new ArrayList<>();
ContainerRuntimeContext.Builder builder = new ContainerRuntimeContext.Builder(container);
addSchedPriorityCommand(prefixCommands);
if (prefixCommands.size() > 0) {
builder.setExecutionAttribute(CONTAINER_LAUNCH_PREFIX_COMMANDS, prefixCommands);
}
builder.setExecutionAttribute(LOCALIZED_RESOURCES, localizedResources).setExecutionAttribute(RUN_AS_USER, runAsUser).setExecutionAttribute(USER, user).setExecutionAttribute(APPID, appId).setExecutionAttribute(CONTAINER_ID_STR, containerIdStr).setExecutionAttribute(CONTAINER_WORK_DIR, containerWorkDir).setExecutionAttribute(NM_PRIVATE_CONTAINER_SCRIPT_PATH, nmPrivateContainerScriptPath).setExecutionAttribute(NM_PRIVATE_TOKENS_PATH, nmPrivateTokensPath).setExecutionAttribute(PID_FILE_PATH, pidFilePath).setExecutionAttribute(LOCAL_DIRS, localDirs).setExecutionAttribute(LOG_DIRS, logDirs).setExecutionAttribute(FILECACHE_DIRS, filecacheDirs).setExecutionAttribute(USER_LOCAL_DIRS, userLocalDirs).setExecutionAttribute(CONTAINER_LOCAL_DIRS, containerLocalDirs).setExecutionAttribute(CONTAINER_LOG_DIRS, containerLogDirs).setExecutionAttribute(RESOURCES_OPTIONS, resourcesOptions);
if (tcCommandFile != null) {
builder.setExecutionAttribute(TC_COMMAND_FILE, tcCommandFile);
}
linuxContainerRuntime.launchContainer(builder.build());
} else {
LOG.info("Container was marked as inactive. Returning terminated error");
return ExitCode.TERMINATED.getExitCode();
}
} catch (ContainerExecutionException e) {
int exitCode = e.getExitCode();
LOG.warn("Exit code from container " + containerId + " is : " + exitCode);
// output
if (exitCode != ExitCode.FORCE_KILLED.getExitCode() && exitCode != ExitCode.TERMINATED.getExitCode()) {
LOG.warn("Exception from container-launch with container ID: " + containerId + " and exit code: " + exitCode, e);
StringBuilder builder = new StringBuilder();
builder.append("Exception from container-launch.\n");
builder.append("Container id: " + containerId + "\n");
builder.append("Exit code: " + exitCode + "\n");
if (!Optional.fromNullable(e.getErrorOutput()).or("").isEmpty()) {
builder.append("Exception message: " + e.getErrorOutput() + "\n");
}
builder.append("Stack trace: " + StringUtils.stringifyException(e) + "\n");
if (!e.getOutput().isEmpty()) {
builder.append("Shell output: " + e.getOutput() + "\n");
}
String diagnostics = builder.toString();
logOutput(diagnostics);
container.handle(new ContainerDiagnosticsUpdateEvent(containerId, diagnostics));
} else {
container.handle(new ContainerDiagnosticsUpdateEvent(containerId, "Container killed on request. Exit code is " + exitCode));
}
return exitCode;
} finally {
resourcesHandler.postExecute(containerId);
try {
if (resourceHandlerChain != null) {
resourceHandlerChain.postComplete(containerId);
}
} catch (ResourceHandlerException e) {
LOG.warn("ResourceHandlerChain.postComplete failed for " + "containerId: " + containerId + ". Exception: " + e);
}
}
return 0;
}
use of org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerDiagnosticsUpdateEvent in project hadoop by apache.
the class ContainerLaunch method signalContainer.
/**
* Send a signal to the container.
*
*
* @throws IOException
*/
// dispatcher not typed
@SuppressWarnings("unchecked")
public void signalContainer(SignalContainerCommand command) throws IOException {
ContainerId containerId = container.getContainerTokenIdentifier().getContainerID();
String containerIdStr = containerId.toString();
String user = container.getUser();
Signal signal = translateCommandToSignal(command);
if (signal.equals(Signal.NULL)) {
LOG.info("ignore signal command " + command);
return;
}
LOG.info("Sending signal " + command + " to container " + containerIdStr);
boolean alreadyLaunched = !containerAlreadyLaunched.compareAndSet(false, true);
if (!alreadyLaunched) {
LOG.info("Container " + containerIdStr + " not launched." + " Not sending the signal");
return;
}
if (LOG.isDebugEnabled()) {
LOG.debug("Getting pid for container " + containerIdStr + " to send signal to from pid file " + (pidFilePath != null ? pidFilePath.toString() : "null"));
}
try {
// get process id from pid file if available
// else if shell is still active, get it from the shell
String processId = null;
if (pidFilePath != null) {
processId = getContainerPid(pidFilePath);
}
if (processId != null) {
if (LOG.isDebugEnabled()) {
LOG.debug("Sending signal to pid " + processId + " as user " + user + " for container " + containerIdStr);
}
boolean result = exec.signalContainer(new ContainerSignalContext.Builder().setContainer(container).setUser(user).setPid(processId).setSignal(signal).build());
String diagnostics = "Sent signal " + command + " (" + signal + ") to pid " + processId + " as user " + user + " for container " + containerIdStr + ", result=" + (result ? "success" : "failed");
LOG.info(diagnostics);
dispatcher.getEventHandler().handle(new ContainerDiagnosticsUpdateEvent(containerId, diagnostics));
}
} catch (Exception e) {
String message = "Exception when sending signal to container " + containerIdStr + ": " + StringUtils.stringifyException(e);
LOG.warn(message);
}
}
use of org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerDiagnosticsUpdateEvent in project hadoop by apache.
the class DefaultContainerExecutor method launchContainer.
@Override
public int launchContainer(ContainerStartContext ctx) throws IOException {
Container container = ctx.getContainer();
Path nmPrivateContainerScriptPath = ctx.getNmPrivateContainerScriptPath();
Path nmPrivateTokensPath = ctx.getNmPrivateTokensPath();
String user = ctx.getUser();
Path containerWorkDir = ctx.getContainerWorkDir();
List<String> localDirs = ctx.getLocalDirs();
List<String> logDirs = ctx.getLogDirs();
FsPermission dirPerm = new FsPermission(APPDIR_PERM);
ContainerId containerId = container.getContainerId();
// create container dirs on all disks
String containerIdStr = containerId.toString();
String appIdStr = containerId.getApplicationAttemptId().getApplicationId().toString();
for (String sLocalDir : localDirs) {
Path usersdir = new Path(sLocalDir, ContainerLocalizer.USERCACHE);
Path userdir = new Path(usersdir, user);
Path appCacheDir = new Path(userdir, ContainerLocalizer.APPCACHE);
Path appDir = new Path(appCacheDir, appIdStr);
Path containerDir = new Path(appDir, containerIdStr);
createDir(containerDir, dirPerm, true, user);
}
// Create the container log-dirs on all disks
createContainerLogDirs(appIdStr, containerIdStr, logDirs, user);
Path tmpDir = new Path(containerWorkDir, YarnConfiguration.DEFAULT_CONTAINER_TEMP_DIR);
createDir(tmpDir, dirPerm, false, user);
// copy container tokens to work dir
Path tokenDst = new Path(containerWorkDir, ContainerLaunch.FINAL_CONTAINER_TOKENS_FILE);
copyFile(nmPrivateTokensPath, tokenDst, user);
// copy launch script to work dir
Path launchDst = new Path(containerWorkDir, ContainerLaunch.CONTAINER_SCRIPT);
copyFile(nmPrivateContainerScriptPath, launchDst, user);
// Create new local launch wrapper script
LocalWrapperScriptBuilder sb = getLocalWrapperScriptBuilder(containerIdStr, containerWorkDir);
// Windows path length limitation.
if (Shell.WINDOWS && sb.getWrapperScriptPath().toString().length() > WIN_MAX_PATH) {
throw new IOException(String.format("Cannot launch container using script at path %s, because it exceeds " + "the maximum supported path length of %d characters. Consider " + "configuring shorter directories in %s.", sb.getWrapperScriptPath(), WIN_MAX_PATH, YarnConfiguration.NM_LOCAL_DIRS));
}
Path pidFile = getPidFilePath(containerId);
if (pidFile != null) {
sb.writeLocalWrapperScript(launchDst, pidFile);
} else {
LOG.info("Container " + containerIdStr + " was marked as inactive. Returning terminated error");
return ExitCode.TERMINATED.getExitCode();
}
// create log dir under app
// fork script
Shell.CommandExecutor shExec = null;
try {
setScriptExecutable(launchDst, user);
setScriptExecutable(sb.getWrapperScriptPath(), user);
shExec = buildCommandExecutor(sb.getWrapperScriptPath().toString(), containerIdStr, user, pidFile, container.getResource(), new File(containerWorkDir.toUri().getPath()), container.getLaunchContext().getEnvironment());
if (isContainerActive(containerId)) {
shExec.execute();
} else {
LOG.info("Container " + containerIdStr + " was marked as inactive. Returning terminated error");
return ExitCode.TERMINATED.getExitCode();
}
} catch (IOException e) {
if (null == shExec) {
return -1;
}
int exitCode = shExec.getExitCode();
LOG.warn("Exit code from container " + containerId + " is : " + exitCode);
// container-executor's output
if (exitCode != ExitCode.FORCE_KILLED.getExitCode() && exitCode != ExitCode.TERMINATED.getExitCode()) {
LOG.warn("Exception from container-launch with container ID: " + containerId + " and exit code: " + exitCode, e);
StringBuilder builder = new StringBuilder();
builder.append("Exception from container-launch.\n");
builder.append("Container id: ").append(containerId).append("\n");
builder.append("Exit code: ").append(exitCode).append("\n");
if (!Optional.fromNullable(e.getMessage()).or("").isEmpty()) {
builder.append("Exception message: ");
builder.append(e.getMessage()).append("\n");
}
builder.append("Stack trace: ");
builder.append(StringUtils.stringifyException(e)).append("\n");
if (!shExec.getOutput().isEmpty()) {
builder.append("Shell output: ");
builder.append(shExec.getOutput()).append("\n");
}
String diagnostics = builder.toString();
logOutput(diagnostics);
container.handle(new ContainerDiagnosticsUpdateEvent(containerId, diagnostics));
} else {
container.handle(new ContainerDiagnosticsUpdateEvent(containerId, "Container killed on request. Exit code is " + exitCode));
}
return exitCode;
} finally {
if (shExec != null)
shExec.close();
}
return 0;
}
use of org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerDiagnosticsUpdateEvent in project hadoop by apache.
the class ContainerLaunch method cleanupContainer.
/**
* Cleanup the container.
* Cancels the launch if launch has not started yet or signals
* the executor to not execute the process if not already done so.
* Also, sends a SIGTERM followed by a SIGKILL to the process if
* the process id is available.
* @throws IOException
*/
// dispatcher not typed
@SuppressWarnings("unchecked")
public void cleanupContainer() throws IOException {
ContainerId containerId = container.getContainerId();
String containerIdStr = containerId.toString();
LOG.info("Cleaning up container " + containerIdStr);
try {
context.getNMStateStore().storeContainerKilled(containerId);
} catch (IOException e) {
LOG.error("Unable to mark container " + containerId + " killed in store", e);
}
// launch flag will be set to true if process already launched
boolean alreadyLaunched = !containerAlreadyLaunched.compareAndSet(false, true);
if (!alreadyLaunched) {
LOG.info("Container " + containerIdStr + " not launched." + " No cleanup needed to be done");
return;
}
if (LOG.isDebugEnabled()) {
LOG.debug("Marking container " + containerIdStr + " as inactive");
}
// this should ensure that if the container process has not launched
// by this time, it will never be launched
exec.deactivateContainer(containerId);
if (LOG.isDebugEnabled()) {
LOG.debug("Getting pid for container " + containerIdStr + " to kill" + " from pid file " + (pidFilePath != null ? pidFilePath.toString() : "null"));
}
// however the container process may have already started
try {
// get process id from pid file if available
// else if shell is still active, get it from the shell
String processId = null;
if (pidFilePath != null) {
processId = getContainerPid(pidFilePath);
}
// kill process
if (processId != null) {
String user = container.getUser();
if (LOG.isDebugEnabled()) {
LOG.debug("Sending signal to pid " + processId + " as user " + user + " for container " + containerIdStr);
}
final Signal signal = sleepDelayBeforeSigKill > 0 ? Signal.TERM : Signal.KILL;
boolean result = exec.signalContainer(new ContainerSignalContext.Builder().setContainer(container).setUser(user).setPid(processId).setSignal(signal).build());
if (LOG.isDebugEnabled()) {
LOG.debug("Sent signal " + signal + " to pid " + processId + " as user " + user + " for container " + containerIdStr + ", result=" + (result ? "success" : "failed"));
}
if (sleepDelayBeforeSigKill > 0) {
new DelayedProcessKiller(container, user, processId, sleepDelayBeforeSigKill, Signal.KILL, exec).start();
}
}
} catch (Exception e) {
String message = "Exception when trying to cleanup container " + containerIdStr + ": " + StringUtils.stringifyException(e);
LOG.warn(message);
dispatcher.getEventHandler().handle(new ContainerDiagnosticsUpdateEvent(containerId, message));
} finally {
// cleanup pid file if present
if (pidFilePath != null) {
FileContext lfs = FileContext.getLocalFSFileContext();
lfs.delete(pidFilePath, false);
lfs.delete(pidFilePath.suffix(EXIT_CODE_FILE_SUFFIX), false);
}
}
}
Aggregations