Search in sources :

Example 1 with ContainerExitEvent

use of org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerExitEvent in project hadoop by apache.

the class RecoveredContainerLaunch method call.

/**
 * Waits on the process specified in the container's pid file and returns its
 * exit code.
 *
 * <p>A CONTAINER_LAUNCHED event is dispatched first. If a pid file can be
 * located, the container is activated with the executor and this call blocks
 * in {@code exec.reacquireContainer(...)} until an exit code is available.
 * Unless the wait was interrupted, the container is then marked completed,
 * deactivated, and its exit code is written to the NM state store. Finally a
 * success or failure event is dispatched depending on the exit code.
 *
 * @return the exit code obtained from the executor; the initial
 *         {@code ExitCode.LOST} code is returned when no pid file is found,
 *         when recovery fails with an {@link IOException}, or when the wait
 *         is interrupted before an exit code is assigned
 */
@SuppressWarnings("unchecked")
@Override
public Integer call() {
    // Default to LOST; only overwritten if reacquireContainer returns normally.
    int retCode = ExitCode.LOST.getExitCode();
    ContainerId containerId = container.getContainerId();
    String appIdStr = containerId.getApplicationAttemptId().getApplicationId().toString();
    String containerIdStr = containerId.toString();
    dispatcher.getEventHandler().handle(new ContainerEvent(containerId, ContainerEventType.CONTAINER_LAUNCHED));
    boolean notInterrupted = true;
    try {
        File pidFile = locatePidFile(appIdStr, containerIdStr);
        if (pidFile != null) {
            String pidPathStr = pidFile.getPath();
            pidFilePath = new Path(pidPathStr);
            exec.activateContainer(containerId, pidFilePath);
            retCode = exec.reacquireContainer(new ContainerReacquisitionContext.Builder().setContainer(container).setUser(container.getUser()).setContainerId(containerId).build());
        } else {
            LOG.warn("Unable to locate pid file for container " + containerIdStr);
        }
    } catch (InterruptedException | InterruptedIOException e) {
        LOG.warn("Interrupted while waiting for exit code from " + containerId);
        // Skip the completion bookkeeping below when the wait was interrupted.
        notInterrupted = false;
    } catch (IOException e) {
        LOG.error("Unable to recover container " + containerIdStr, e);
    } finally {
        if (notInterrupted) {
            this.completed.set(true);
            exec.deactivateContainer(containerId);
            try {
                getContext().getNMStateStore().storeContainerCompleted(containerId, retCode);
            } catch (IOException e) {
                // Best effort: keep going, but log the cause so the state-store
                // failure can be diagnosed.
                LOG.error("Unable to set exit code for container " + containerId, e);
            }
        }
    }
    if (retCode != 0) {
        LOG.warn("Recovered container exited with a non-zero exit code " + retCode);
        this.dispatcher.getEventHandler().handle(new ContainerExitEvent(containerId, ContainerEventType.CONTAINER_EXITED_WITH_FAILURE, retCode, "Container exited with a non-zero exit code " + retCode));
        return retCode;
    }
    LOG.info("Recovered container " + containerId + " succeeded");
    dispatcher.getEventHandler().handle(new ContainerEvent(containerId, ContainerEventType.CONTAINER_EXITED_WITH_SUCCESS));
    return 0;
}
Also used : ContainerEvent(org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerEvent) Path(org.apache.hadoop.fs.Path) InterruptedIOException(java.io.InterruptedIOException) ContainerExitEvent(org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerExitEvent) IOException(java.io.IOException) InterruptedIOException(java.io.InterruptedIOException) ContainerId(org.apache.hadoop.yarn.api.records.ContainerId) File(java.io.File)

Example 2 with ContainerExitEvent

use of org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerExitEvent in project hadoop by apache.

the class ContainerLaunch method handleContainerExitWithFailure.

/**
 * Tries to tail and fetch TAIL_SIZE_IN_BYTES of data from the error log and
 * then reports the container as exited with failure.
 * ErrorLog filename is not fixed and depends upon app, hence file name
 * pattern is used. When several files match the pattern, the most recently
 * modified one is tailed and all matching names are listed in the diagnostics.
 * Failing to read the log is logged and does not prevent the exit event from
 * being dispatched.
 *
 * @param containerID id of the container placed on the dispatched event
 * @param ret exit code placed on the dispatched event
 * @param containerLogDir directory that is globbed for the error file pattern
 * @param diagnosticInfo diagnostics collected so far; the error file names and
 *        the log tail are appended to it and its final contents are sent with
 *        the event
 */
@SuppressWarnings("unchecked")
protected void handleContainerExitWithFailure(ContainerId containerID, int ret, Path containerLogDir, StringBuilder diagnosticInfo) {
    LOG.warn(diagnosticInfo);
    String errorFileNamePattern = conf.get(YarnConfiguration.NM_CONTAINER_STDERR_PATTERN, YarnConfiguration.DEFAULT_NM_CONTAINER_STDERR_PATTERN);
    FSDataInputStream errorFileIS = null;
    try {
        FileSystem fileSystem = FileSystem.getLocal(conf).getRaw();
        FileStatus[] errorFileStatuses = fileSystem.globStatus(new Path(containerLogDir, errorFileNamePattern));
        if (errorFileStatuses != null && errorFileStatuses.length != 0) {
            long configuredTailSize = conf.getLong(YarnConfiguration.NM_CONTAINER_STDERR_BYTES, YarnConfiguration.DEFAULT_NM_CONTAINER_STDERR_BYTES);
            // Clamp to [0, Integer.MAX_VALUE]. The tail is read into a byte[],
            // so a negative or oversized setting would otherwise yield a
            // negative/wrapped array length and an unchecked exception that
            // skips the exit event below.
            long tailSizeInBytes = Math.max(0L, Math.min(configuredTailSize, (long) Integer.MAX_VALUE));
            Path errorFile = errorFileStatuses[0].getPath();
            long fileSize = errorFileStatuses[0].getLen();
            // If more than one file matches the pattern, tail the most recently
            // modified file, and also append the file names in the diagnosticInfo
            if (errorFileStatuses.length > 1) {
                String[] errorFileNames = new String[errorFileStatuses.length];
                long latestModifiedTime = errorFileStatuses[0].getModificationTime();
                errorFileNames[0] = errorFileStatuses[0].getPath().getName();
                for (int i = 1; i < errorFileStatuses.length; i++) {
                    errorFileNames[i] = errorFileStatuses[i].getPath().getName();
                    if (errorFileStatuses[i].getModificationTime() > latestModifiedTime) {
                        latestModifiedTime = errorFileStatuses[i].getModificationTime();
                        errorFile = errorFileStatuses[i].getPath();
                        fileSize = errorFileStatuses[i].getLen();
                    }
                }
                diagnosticInfo.append("Error files: ").append(StringUtils.join(", ", errorFileNames)).append(".\n");
            }
            // Read the whole file when it is shorter than the requested tail.
            long startPosition = (fileSize < tailSizeInBytes) ? 0 : fileSize - tailSizeInBytes;
            int bufferSize = (int) Math.min(fileSize, tailSizeInBytes);
            byte[] tailBuffer = new byte[bufferSize];
            errorFileIS = fileSystem.open(errorFile);
            errorFileIS.readFully(startPosition, tailBuffer);
            diagnosticInfo.append("Last ").append(tailSizeInBytes).append(" bytes of ").append(errorFile.getName()).append(" :\n").append(new String(tailBuffer, StandardCharsets.UTF_8));
        }
    } catch (IOException e) {
        LOG.error("Failed to get tail of the container's error log file", e);
    } finally {
        IOUtils.cleanup(LOG, errorFileIS);
    }
    this.dispatcher.getEventHandler().handle(new ContainerExitEvent(containerID, ContainerEventType.CONTAINER_EXITED_WITH_FAILURE, ret, diagnosticInfo.toString()));
}
Also used : Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) FileSystem(org.apache.hadoop.fs.FileSystem) ContainerExitEvent(org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerExitEvent) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) IOException(java.io.IOException)

Example 3 with ContainerExitEvent

use of org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerExitEvent in project hadoop by apache.

the class ContainerLaunch method handleContainerExitCode.

/**
 * Translates a container exit code into the matching container event.
 *
 * <ul>
 *   <li>FORCE_KILLED or TERMINATED: dispatches CONTAINER_KILLED_ON_REQUEST,
 *       unless the container was killed before it started.</li>
 *   <li>Any other non-zero code: delegates to
 *       {@code handleContainerExitWithFailure}.</li>
 *   <li>Zero: dispatches CONTAINER_EXITED_WITH_SUCCESS.</li>
 * </ul>
 *
 * @param exitCode exit code of the container
 * @param containerLogDir log directory forwarded to the failure handler
 */
@SuppressWarnings("unchecked")
protected void handleContainerExitCode(int exitCode, Path containerLogDir) {
    final ContainerId id = container.getContainerId();
    if (LOG.isDebugEnabled()) {
        LOG.debug("Container " + id + " completed with exit code " + exitCode);
    }

    final boolean killed = exitCode == ExitCode.FORCE_KILLED.getExitCode()
        || exitCode == ExitCode.TERMINATED.getExitCode();

    // Success path: not a kill code and exit code is zero.
    if (!killed && exitCode == 0) {
        LOG.info("Container " + id + " succeeded ");
        dispatcher.getEventHandler().handle(
            new ContainerEvent(id, ContainerEventType.CONTAINER_EXITED_WITH_SUCCESS));
        return;
    }

    final StringBuilder diagnostics =
        new StringBuilder("Container exited with a non-zero exit code ")
            .append(exitCode)
            .append(". ");

    if (killed) {
        // If Container was killed before starting... NO need to do this.
        if (killedBeforeStart) {
            return;
        }
        dispatcher.getEventHandler().handle(
            new ContainerExitEvent(id, ContainerEventType.CONTAINER_KILLED_ON_REQUEST,
                exitCode, diagnostics.toString()));
        return;
    }

    handleContainerExitWithFailure(id, exitCode, containerLogDir, diagnostics);
}
Also used : ContainerEvent(org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerEvent) ContainerId(org.apache.hadoop.yarn.api.records.ContainerId) ContainerExitEvent(org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerExitEvent)

Example 4 with ContainerExitEvent

use of org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerExitEvent in project hadoop by apache.

the class DummyContainerManager method createContainersLauncher.

/**
 * Creates a stand-in {@link ContainersLauncher} whose {@code handle} method
 * only dispatches container events instead of delegating to the real launcher:
 * LAUNCH_CONTAINER produces CONTAINER_LAUNCHED, CLEANUP_CONTAINER produces
 * CONTAINER_KILLED_ON_REQUEST with exit code 0. Other event types are ignored.
 *
 * @param context context handed to the launcher constructor
 * @param exec executor handed to the launcher constructor
 * @return the stubbed launcher
 */
@Override
@SuppressWarnings("unchecked")
protected ContainersLauncher createContainersLauncher(Context context, ContainerExecutor exec) {
    return new ContainersLauncher(context, super.dispatcher, exec, super.dirsHandler, this) {

        @Override
        public void handle(ContainersLauncherEvent event) {
            final ContainerId id = event.getContainer().getContainerId();
            switch (event.getType()) {
                case LAUNCH_CONTAINER:
                    // Report the container as launched right away.
                    dispatcher.getEventHandler().handle(
                        new ContainerEvent(id, ContainerEventType.CONTAINER_LAUNCHED));
                    break;
                case CLEANUP_CONTAINER:
                    // Report the container as killed on request with exit code 0.
                    dispatcher.getEventHandler().handle(
                        new ContainerExitEvent(id, ContainerEventType.CONTAINER_KILLED_ON_REQUEST,
                            0, "Container exited with exit code 0."));
                    break;
                default:
                    // Every other event type is deliberately a no-op.
                    break;
            }
        }
    };
}
Also used : ContainerEvent(org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerEvent) Container(org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container) ContainersLauncher(org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainersLauncher) ContainerId(org.apache.hadoop.yarn.api.records.ContainerId) ContainerExitEvent(org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerExitEvent) ContainersLauncherEvent(org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainersLauncherEvent)

Example 5 with ContainerExitEvent

use of org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerExitEvent in project hadoop by apache.

the class ContainerLaunch method call.

/**
 * Prepares and launches the container, then reports how it exited.
 *
 * <p>Steps, in order: expand variables in the launch commands and environment
 * values, resolve the NM-private script/token paths and the container work
 * directory, write the launch script and the credentials file, and call
 * {@code launchContainer}. On normal completion the exit code is passed to
 * {@code handleContainerExitCode}. If anything throws, a
 * CONTAINER_EXITED_WITH_FAILURE event carrying the current {@code ret} and the
 * exception message is dispatched instead. {@code setContainerCompletedStatus}
 * is invoked with {@code ret} on both paths.
 *
 * @return 0 if {@code validateContainerState()} returns false; otherwise the
 *         value of {@code ret}: the result of {@code launchContainer}, or
 *         {@code ContainerExitStatus.DISKS_FAILED} if the disk health check
 *         failed, or -1 if another failure occurred before
 *         {@code launchContainer} returned
 */
@Override
// dispatcher not typed
@SuppressWarnings("unchecked")
public Integer call() {
    // Bail out with 0 when validateContainerState() rejects the container.
    if (!validateContainerState()) {
        return 0;
    }
    final ContainerLaunchContext launchContext = container.getLaunchContext();
    ContainerId containerID = container.getContainerId();
    String containerIdStr = containerID.toString();
    final List<String> command = launchContext.getCommands();
    // Exit code reported if a failure occurs before launchContainer returns.
    int ret = -1;
    Path containerLogDir;
    try {
        Map<Path, List<String>> localResources = getLocalizedResources();
        final String user = container.getUser();
        // /////////////////////////// Variable expansion
        // Before the container script gets written out.
        List<String> newCmds = new ArrayList<String>(command.size());
        String appIdStr = app.getAppId().toString();
        String relativeContainerLogDir = ContainerLaunch.getRelativeContainerLogDir(appIdStr, containerIdStr);
        containerLogDir = dirsHandler.getLogPathForWrite(relativeContainerLogDir, false);
        recordContainerLogDir(containerID, containerLogDir.toString());
        for (String str : command) {
            // TODO: Should we instead work via symlinks without this grammar?
            newCmds.add(expandEnvironment(str, containerLogDir));
        }
        launchContext.setCommands(newCmds);
        Map<String, String> environment = launchContext.getEnvironment();
        // Expand variables in each environment value and write the result back
        // through the entry (no copy of the map is made here).
        for (Entry<String, String> entry : environment.entrySet()) {
            String value = entry.getValue();
            value = expandEnvironment(value, containerLogDir);
            entry.setValue(value);
        }
        // /////////////////////////// End of variable expansion
        FileContext lfs = FileContext.getLocalFSFileContext();
        Path nmPrivateContainerScriptPath = dirsHandler.getLocalPathForWrite(getContainerPrivateDir(appIdStr, containerIdStr) + Path.SEPARATOR + CONTAINER_SCRIPT);
        Path nmPrivateTokensPath = dirsHandler.getLocalPathForWrite(getContainerPrivateDir(appIdStr, containerIdStr) + Path.SEPARATOR + String.format(ContainerLocalizer.TOKEN_FILE_NAME_FMT, containerIdStr));
        Path nmPrivateClasspathJarDir = dirsHandler.getLocalPathForWrite(getContainerPrivateDir(appIdStr, containerIdStr));
        DataOutputStream containerScriptOutStream = null;
        DataOutputStream tokensOutStream = null;
        // Select the working directory for the container
        Path containerWorkDir = dirsHandler.getLocalPathForWrite(ContainerLocalizer.USERCACHE + Path.SEPARATOR + user + Path.SEPARATOR + ContainerLocalizer.APPCACHE + Path.SEPARATOR + appIdStr + Path.SEPARATOR + containerIdStr, LocalDirAllocator.SIZE_UNKNOWN, false);
        recordContainerWorkDir(containerID, containerWorkDir.toString());
        String pidFileSubpath = getPidFileSubpath(appIdStr, containerIdStr);
        // pid file should be in nm private dir so that it is not
        // accessible by users
        pidFilePath = dirsHandler.getLocalPathForWrite(pidFileSubpath);
        List<String> localDirs = dirsHandler.getLocalDirs();
        List<String> logDirs = dirsHandler.getLogDirs();
        List<String> filecacheDirs = getNMFilecacheDirs(localDirs);
        List<String> userLocalDirs = getUserLocalDirs(localDirs);
        List<String> containerLocalDirs = getContainerLocalDirs(localDirs);
        List<String> containerLogDirs = getContainerLogDirs(logDirs);
        // Record DISKS_FAILED as the exit code before aborting, so the catch
        // block below reports it instead of the default -1.
        if (!dirsHandler.areDisksHealthy()) {
            ret = ContainerExitStatus.DISKS_FAILED;
            throw new IOException("Most of the disks failed. " + dirsHandler.getDisksHealthReport(false));
        }
        try {
            // /////////// Write out the container-script in the nmPrivate space.
            // One application directory per local dir:
            // <localDir>/USERCACHE/<user>/APPCACHE/<appId>
            List<Path> appDirs = new ArrayList<Path>(localDirs.size());
            for (String localDir : localDirs) {
                Path usersdir = new Path(localDir, ContainerLocalizer.USERCACHE);
                Path userdir = new Path(usersdir, user);
                Path appsdir = new Path(userdir, ContainerLocalizer.APPCACHE);
                appDirs.add(new Path(appsdir, appIdStr));
            }
            containerScriptOutStream = lfs.create(nmPrivateContainerScriptPath, EnumSet.of(CREATE, OVERWRITE));
            // Set the token location too.
            environment.put(ApplicationConstants.CONTAINER_TOKEN_FILE_ENV_NAME, new Path(containerWorkDir, FINAL_CONTAINER_TOKENS_FILE).toUri().getPath());
            // Sanitize the container's environment
            sanitizeEnv(environment, containerWorkDir, appDirs, userLocalDirs, containerLogDirs, localResources, nmPrivateClasspathJarDir);
            exec.prepareContainer(new ContainerPrepareContext.Builder().setContainer(container).setLocalizedResources(localResources).setUser(user).setContainerLocalDirs(containerLocalDirs).setCommands(launchContext.getCommands()).build());
            // Write out the environment
            exec.writeLaunchEnv(containerScriptOutStream, environment, localResources, launchContext.getCommands(), new Path(containerLogDirs.get(0)), user);
            // /////////// End of writing out container-script
            // /////////// Write out the container-tokens in the nmPrivate space.
            tokensOutStream = lfs.create(nmPrivateTokensPath, EnumSet.of(CREATE, OVERWRITE));
            Credentials creds = container.getCredentials();
            creds.writeTokenStorageToStream(tokensOutStream);
        // /////////// End of writing out container-tokens
        } finally {
            // Hand both streams to IOUtils.cleanup whether or not the writes
            // above succeeded.
            IOUtils.cleanup(LOG, containerScriptOutStream, tokensOutStream);
        }
        ret = launchContainer(new ContainerStartContext.Builder().setContainer(container).setLocalizedResources(localResources).setNmPrivateContainerScriptPath(nmPrivateContainerScriptPath).setNmPrivateTokensPath(nmPrivateTokensPath).setUser(user).setAppId(appIdStr).setContainerWorkDir(containerWorkDir).setLocalDirs(localDirs).setLogDirs(logDirs).setFilecacheDirs(filecacheDirs).setUserLocalDirs(userLocalDirs).setContainerLocalDirs(containerLocalDirs).setContainerLogDirs(containerLogDirs).build());
    } catch (Throwable e) {
        // Any failure during preparation or launch is reported as
        // CONTAINER_EXITED_WITH_FAILURE with the current ret and the
        // exception message as diagnostics.
        LOG.warn("Failed to launch container.", e);
        dispatcher.getEventHandler().handle(new ContainerExitEvent(containerID, ContainerEventType.CONTAINER_EXITED_WITH_FAILURE, ret, e.getMessage()));
        return ret;
    } finally {
        // Runs on both the normal and the failure path.
        setContainerCompletedStatus(ret);
    }
    // Normal completion: hand the exit code to handleContainerExitCode.
    handleContainerExitCode(ret, containerLogDir);
    return ret;
}
Also used : Path(org.apache.hadoop.fs.Path) ContainerPrepareContext(org.apache.hadoop.yarn.server.nodemanager.executor.ContainerPrepareContext) DataOutputStream(java.io.DataOutputStream) ContainerExitEvent(org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerExitEvent) ArrayList(java.util.ArrayList) ContainerLaunchContext(org.apache.hadoop.yarn.api.records.ContainerLaunchContext) IOException(java.io.IOException) ContainerId(org.apache.hadoop.yarn.api.records.ContainerId) List(java.util.List) ArrayList(java.util.ArrayList) FileContext(org.apache.hadoop.fs.FileContext) Credentials(org.apache.hadoop.security.Credentials)

Aggregations

ContainerExitEvent (org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerExitEvent)8 ContainerId (org.apache.hadoop.yarn.api.records.ContainerId)6 Path (org.apache.hadoop.fs.Path)5 ContainerEvent (org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerEvent)5 IOException (java.io.IOException)4 List (java.util.List)3 ContainerLaunchContext (org.apache.hadoop.yarn.api.records.ContainerLaunchContext)3 Container (org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container)3 ArrayList (java.util.ArrayList)2 Configuration (org.apache.hadoop.conf.Configuration)2 YarnConfiguration (org.apache.hadoop.yarn.conf.YarnConfiguration)2 Dispatcher (org.apache.hadoop.yarn.event.Dispatcher)2 Event (org.apache.hadoop.yarn.event.Event)2 EventHandler (org.apache.hadoop.yarn.event.EventHandler)2 BaseContainerManagerTest (org.apache.hadoop.yarn.server.nodemanager.containermanager.BaseContainerManagerTest)2 Test (org.junit.Test)2 DataOutputStream (java.io.DataOutputStream)1 File (java.io.File)1 InterruptedIOException (java.io.InterruptedIOException)1 HashMap (java.util.HashMap)1