Search in sources :

Example 1 with CommandLineBuilder

use of org.ow2.proactive.resourcemanager.utils.CommandLineBuilder in project scheduling by ow2-proactive.

the class BatchJobInfrastructure method handleFailedDeployment.

/**
 * Records a failed deployment: registers a deploying node carrying the
 * error's stack trace, immediately declares that node lost, and then
 * reports the failure to the caller.
 *
 * @param clb
 *            the builder of the node that could not be deployed; it supplies
 *            the node name and, when it can be built, the command line
 *            attached to the lost node
 * @param e
 *            the error that caused the deployment to fail
 * @throws RMException
 *             thrown once the lost node has been declared, with {@code e}
 *             as its cause
 */
private void handleFailedDeployment(CommandLineBuilder clb, Throwable e) throws RMException {
    final String stackTrace = getStackTraceAsString(e);
    final String description = "Cannot deploy the node because of an error:" + System.lineSeparator() + stackTrace;
    String commandLine;
    try {
        commandLine = clb.buildCommandLine(false);
    } catch (Exception ex) {
        // best effort only: the command line is purely informative here
        commandLine = "Cannot determine the command used to start the node.";
    }
    final String lostNodeUrl = super.addDeployingNode(clb.getNodeName(), commandLine, description, 60000);
    super.declareDeployingNodeLost(lostNodeUrl, null);
    throw new RMException("The deployment failed because of an error", e);
}
Also used : Throwables.getStackTraceAsString(com.google.common.base.Throwables.getStackTraceAsString) KeyException(java.security.KeyException) IOException(java.io.IOException) UnknownHostException(java.net.UnknownHostException) RMException(org.ow2.proactive.resourcemanager.exception.RMException) RMException(org.ow2.proactive.resourcemanager.exception.RMException)

Example 2 with CommandLineBuilder

use of org.ow2.proactive.resourcemanager.utils.CommandLineBuilder in project scheduling by ow2-proactive.

the class InfrastructureManager method getDefaultCommandLineBuilder.

// **********************************************************************************************\\
// **************************************** API methods
// *****************************************\\
// **********************************************************************************************\\
/**
 * Creates a
 * {@link org.ow2.proactive.resourcemanager.utils.CommandLineBuilder}
 * pre-filled with default settings:
 * <ul>
 * <li>the Java path is derived from the {@code java.home} system property
 * of the current JVM, using the file separator of {@code targetOS};</li>
 * <li>the target operating system is set to {@code targetOS};</li>
 * <li>if a ProActive configuration file is set, it is passed to the builder
 * as its ProActive properties; an {@link IOException} raised while doing so
 * is logged at debug level and otherwise ignored;</li>
 * <li>the rm's URL is set and, when a nodesource is attached, so are the
 * nodesource's name and a node name made of that name followed by
 * {@code _DefaultNodeName}.</li>
 * </ul>
 *
 * @param targetOS
 *            the operating system on which the node will be deployed
 * @return the pre-filled builder
 */
protected final CommandLineBuilder getDefaultCommandLineBuilder(OperatingSystem targetOS) {
    final CommandLineBuilder builder = new CommandLineBuilder();
    final String javaHome = System.getProperty("java.home");
    builder.setJavaPath(javaHome + targetOS.fs + "bin" + targetOS.fs + "java");
    builder.setTargetOS(targetOS);
    if (CentralPAPropertyRepository.PA_CONFIGURATION_FILE.isSet()) {
        try {
            final File paConfiguration = new File(CentralPAPropertyRepository.PA_CONFIGURATION_FILE.getValue());
            builder.setPaProperties(paConfiguration);
        } catch (IOException e) {
            logger.debug("Cannot set default pa configuration file for " + CommandLineBuilder.class.getSimpleName(), e);
        }
    }
    builder.setRmURL(getRmUrl());
    if (this.nodeSource == null) {
        // no nodesource attached: source and node names are left unset
        return builder;
    }
    final String sourceName = this.nodeSource.getName();
    builder.setSourceName(sourceName);
    builder.setNodeName(sourceName + "_DefaultNodeName");
    return builder;
}
Also used : CommandLineBuilder(org.ow2.proactive.resourcemanager.utils.CommandLineBuilder) IOException(java.io.IOException) File(java.io.File)

Example 3 with CommandLineBuilder

use of org.ow2.proactive.resourcemanager.utils.CommandLineBuilder in project scheduling by ow2-proactive.

the class BatchJobInfrastructure method startNode.

/**
 * Builds the command line to execute on the batch job system frontend,
 * submits it through SSH and waits for the launched node to register. If
 * the node doesn't register (ie.
 * {@link #internalRegisterAcquiredNode(Node)} isn't called) before the
 * timeout (configurable) value, an exception is raised. If the submit
 * command sent to the frontend fails, the node supposed to be launched is
 * not expected anymore and will be discarded at registration time.
 * <p>
 * Only the obfuscated command line (built without the credentials) is
 * attached to the deploying node and to error messages; the command line
 * carrying the credentials is used solely for the SSH execution.
 *
 * @throws RMException
 *             if the command line cannot be built, the SSH command cannot
 *             be executed, or the node did not register
 */
private void startNode() throws RMException {
    CommandLineBuilder clb = new CommandLineBuilder();
    // generate the node name
    // current rmcore shortID should be added to ensure uniqueness
    String nodeName = getBatchinJobSystemName() + "-" + nodeSource.getName() + "-" + ProActiveCounter.getUniqID();
    clb.setNodeName(nodeName);
    clb.setJavaPath(this.javaPath);
    clb.setRmURL(getRmUrl());
    clb.setRmHome(this.schedulingPath);
    clb.setSourceName(this.nodeSource.getName());
    clb.setPaProperties(this.javaOptions);
    try {
        clb.setCredentialsValueAndNullOthers(new String(getCredentials().getBase64()));
    } catch (KeyException e) {
        // handleFailedDeployment always ends by throwing an RMException
        this.handleFailedDeployment(clb, e);
    }
    InetAddress host = null;
    try {
        host = InetAddress.getByName(this.serverName);
    } catch (UnknownHostException e) {
        this.handleFailedDeployment(clb, e);
    }
    String deleteCmd = getDeleteJobCommand();
    String submitCmd = getSubmitJobCommand();
    // build the command: echo "script.sh params"|qsub params
    // cmd carries the credentials and is executed; obfuscatedCmd is the
    // only variant that may be displayed or reported
    String cmd = null;
    String obfuscatedCmd = null;
    try {
        cmd = "echo \\\"" + clb.buildCommandLine(true).replace("\"", "\\\"") + "\\\" | " + submitCmd + " " + this.submitJobOpt;
        obfuscatedCmd = "echo \\\"" + clb.buildCommandLine(false).replace("\"", "\\\"") + "\\\" | " + submitCmd + " " + this.submitJobOpt;
    } catch (IOException e) {
        this.handleFailedDeployment(clb, e);
    }
    // add an deploying node.
    final String dnURL = super.addDeployingNode(nodeName, obfuscatedCmd, "Deploying node on " + getBatchinJobSystemName() + " scheduler", this.nodeTimeOut);
    putPnTimeout(dnURL, Boolean.FALSE);
    // executing the command
    Process p;
    try {
        p = Utils.runSSHCommand(host, cmd, this.sshOptions);
    } catch (IOException e1) {
        // report the obfuscated command: cmd embeds the credentials and
        // must not end up in exception messages or logs
        throw new RMException("Cannot execute ssh command: " + obfuscatedCmd + " on host: " + this.serverName, e1);
    }
    // recover the Job ID through stdout
    StringBuilder submitOutput = new StringBuilder();
    InputStream in = p.getInputStream();
    try {
        int b;
        while ((b = in.read()) > -1) {
            submitOutput.append((char) b);
        }
    } catch (IOException e) {
        // best effort: keep whatever was read so far, but leave a trace
        logger.warn("Cannot read the whole output of the submit command for node " + nodeName, e);
    }
    final String id = submitOutput.toString();
    // check for registration
    // at this point, the ssh process should have already exited because it
    // only handle the job submission, not the execution... furthermore
    // the "id" is defined
    String lf = System.lineSeparator();
    final long timeout = nodeTimeOut;
    long t1 = System.currentTimeMillis();
    // Hack. SSHClient fails but qSub succeeds.
    boolean isJobIDValid = false;
    // Tries to wait for this node
    // registration...
    int circuitBreakerThreshold = 5;
    while (!getPnTimeout(dnURL) && circuitBreakerThreshold > 0) {
        try {
            int exitCode = p.exitValue();
            if (exitCode != 0 && !isJobIDValid) {
                logger.warn("SSH subprocess at " + host.getHostName() + " exit code != 0 but IM tries to recover from this error...Current submit command's output: " + id + " and associated node's name: " + nodeName);
                String extractedID = this.extractSubmitOutput(id);
                String errput = this.extractProcessErrput(p);
                final String description = "SSH command failed to launch node on " + getBatchinJobSystemName() + " scheduler" + lf + "   >Error code: " + exitCode + lf + "   >Errput: " + errput + "   >Output: " + id;
                // a non-empty extracted id means the submission itself
                // went through despite the non-zero exit code
                if (extractedID != null && !extractedID.equals("")) {
                    isJobIDValid = true;
                }
                // defines how to recover from this state
                // throws a RMException if we can't
                handleWrongJobTermination(isJobIDValid, nodeName, dnURL, host, id, description, exitCode, submitCmd, deleteCmd);
            }
        } catch (IllegalThreadStateException e) {
            // process has not returned yet
            logger.trace("Waiting for ssh process to exit in BatchJobInfrastructure");
        }
        if (super.checkNodeIsAcquiredAndDo(nodeName, null, null)) {
            // registration is ok
            p.destroy();
            addNodeAndDecrementDeployingNode(nodeName, this.extractSubmitOutput(id));
            return;
        }
        try {
            logger.debug("Waiting for node " + nodeName + " registration... time to timeout: " + (timeout - (System.currentTimeMillis() - t1)));
            Thread.sleep(BatchJobInfrastructure.NODE_ACQUISITION_CHECK_RATE);
        } catch (Exception e) {
            circuitBreakerThreshold--;
            logger.error("While monitoring ssh subprocess.", e);
        }
    }
    // end of while loop, either deploying node timeout/removed of
    // threshold reached
    // the node is not expected anymore
    atomicRemovePnTimeoutAndJob(nodeName, dnURL, p, id);
    if (circuitBreakerThreshold <= 0) {
        logger.error("Circuit breaker threshold reached while monitoring ssh subprocess.");
        throw new RMException("Several exceptions occurred while monitoring ssh subprocess.");
    }
    // if we are here we reached an invalid state
    throw new RMException("Invalid state, exit from a control loop with threshold > 0 and expected deploying node");
}
Also used : UnknownHostException(java.net.UnknownHostException) InputStream(java.io.InputStream) CommandLineBuilder(org.ow2.proactive.resourcemanager.utils.CommandLineBuilder) Throwables.getStackTraceAsString(com.google.common.base.Throwables.getStackTraceAsString) IOException(java.io.IOException) KeyException(java.security.KeyException) RMException(org.ow2.proactive.resourcemanager.exception.RMException) KeyException(java.security.KeyException) IOException(java.io.IOException) UnknownHostException(java.net.UnknownHostException) RMException(org.ow2.proactive.resourcemanager.exception.RMException) InetAddress(java.net.InetAddress)

Example 4 with CommandLineBuilder

use of org.ow2.proactive.resourcemanager.utils.CommandLineBuilder in project scheduling by ow2-proactive.

the class LocalInfrastructure method startNodeProcess.

/**
 * Launches one local process meant to host {@code numberOfNodes} nodes and
 * watches it until every node is reported acquired or lost.
 * <p>
 * The command line is first built with obfuscated credentials so that it
 * can be attached to the deploying nodes and logged; the real credential
 * value is substituted only in the list handed to the process executor.
 * Failures to decrypt the credentials or to build the command line create
 * lost nodes and end the method; a failure to launch the process declares
 * the deploying nodes lost.
 *
 * @param numberOfNodes
 *            number of nodes the launched process is asked to start
 */
private void startNodeProcess(int numberOfNodes) {
    final int index = getIndexAndIncrement();
    final String baseNodeName = "local-" + this.nodeSource.getName() + "-" + index;
    // assuming no cygwin, windows or the "others"...
    final OperatingSystem os = System.getProperty("os.name").contains("Windows") ? OperatingSystem.WINDOWS
                                                                                   : OperatingSystem.UNIX;
    final String configuredRmHome = PAResourceManagerProperties.RM_HOME.getValueAsString();
    // make sure the home directory ends with a file separator
    final String rmHome = configuredRmHome.endsWith(os.fs) ? configuredRmHome : configuredRmHome + os.fs;
    CommandLineBuilder clb = this.getDefaultCommandLineBuilder(os);
    // RM_Home set in bin/unix/env script
    clb.setRmHome(rmHome);

    // default properties are added only when the user supplied ones do not
    // already mention them
    ArrayList<String> paPropList = new ArrayList<>();
    if (!this.paProperties.contains(CentralPAPropertyRepository.JAVA_SECURITY_POLICY.getName())) {
        paPropList.add(CentralPAPropertyRepository.JAVA_SECURITY_POLICY.getCmdLine() + rmHome + "config" + os.fs +
                       "security.java.policy-client");
    }
    if (!this.paProperties.contains(CentralPAPropertyRepository.PA_CONFIGURATION_FILE.getName())) {
        paPropList.add(CentralPAPropertyRepository.PA_CONFIGURATION_FILE.getCmdLine() + rmHome + "config" + os.fs +
                       "network" + os.fs + "node.ini");
    }
    if (!this.paProperties.contains(PAResourceManagerProperties.RM_HOME.getKey())) {
        paPropList.add(PAResourceManagerProperties.RM_HOME.getCmdLine() + rmHome);
    }
    if (!this.paProperties.contains("java.library.path")) {
        paPropList.add("-Djava.library.path=" + System.getProperty("java.library.path"));
    }
    if (!paProperties.isEmpty()) {
        Collections.addAll(paPropList, this.paProperties.split(" "));
    }
    clb.setPaProperties(paPropList);
    clb.setNodeName(baseNodeName);
    clb.setNumberOfNodes(numberOfNodes);

    try {
        clb.setCredentialsValueAndNullOthers(new String(this.credentials.getBase64()));
    } catch (KeyException e) {
        createLostNodes(baseNodeName, numberOfNodes, "Cannot decrypt credentials value", e);
        return;
    }

    final List<String> cmd;
    try {
        cmd = clb.buildCommandLineAsList(false);
    } catch (IOException e) {
        createLostNodes(baseNodeName, numberOfNodes, "Cannot build command line", e);
        return;
    }

    // The printed cmd with obfuscated credentials; must be computed before
    // the list is de-obfuscated below
    final String obfuscatedCmd = Joiner.on(' ').join(cmd);
    final List<String> depNodeURLs = new ArrayList<>(numberOfNodes);
    final List<String> createdNodeNames = RMNodeStarter.getWorkersNodeNames(baseNodeName, numberOfNodes);
    ProcessExecutor processExecutor = null;
    try {
        depNodeURLs.addAll(addMultipleDeployingNodes(createdNodeNames,
                                                     obfuscatedCmd,
                                                     "Node launched locally",
                                                     this.nodeTimeout));
        // Deobfuscate the cred value
        Collections.replaceAll(cmd, CommandLineBuilder.OBFUSC, clb.getCredentialsValue());
        final ProcessExecutor launched = new ProcessExecutor(baseNodeName, cmd, false, true);
        processExecutor = launched;
        launched.start();
        processExecutors.put(launched, depNodeURLs);
        // kill the child process if it is still running at JVM shutdown
        Runtime.getRuntime().addShutdownHook(new Thread(new Runnable() {

            @Override
            public void run() {
                if (!launched.isProcessFinished()) {
                    launched.killProcess();
                }
            }
        }));
        logger.info("Local Nodes command started : " + obfuscatedCmd);
    } catch (IOException e) {
        final String failure = "Cannot launch rm node " + baseNodeName + System.lineSeparator() +
                               Throwables.getStackTraceAsString(e);
        multipleDeclareDeployingNodeLost(depNodeURLs, failure);
        if (processExecutor != null) {
            cleanProcess(processExecutor);
        }
        return;
    }

    // watching process
    int remainingInterruptions = 10;
    while (!allNodesAcquiredOrLost()) {
        if (!processExecutor.isProcessFinished()) {
            logger.debug("Waiting for nodes " + baseNodeName + " acquisition");
        } else {
            final int exit = processExecutor.getExitCode();
            if (exit != 0) {
                final String lf = System.lineSeparator();
                final String out = Joiner.on('\n').join(processExecutor.getOutput());
                final String err = Joiner.on('\n').join(processExecutor.getErrorOutput());
                final String message = "RMNode exit code == " + exit + lf + "Command: " + obfuscatedCmd + lf +
                                       "stdout: " + out + lf + "stderr: " + err;
                multipleDeclareDeployingNodeLost(depNodeURLs, message);
            }
        }
        try {
            Thread.sleep(500);
        } catch (InterruptedException e) {
            logger.warn("Interrupted while waiting for local process status", e);
            // give up watching after ten interruptions
            remainingInterruptions--;
            if (remainingInterruptions <= 0) {
                break;
            }
        }
    }

    logger.debug("Local Infrastructure manager exits watching loop for nodes " + baseNodeName);
    logNodeOutput(baseNodeName + " stdout: ", processExecutor.getOutput());
    logNodeOutput(baseNodeName + " stderr: ", processExecutor.getErrorOutput());
    if (allNodesLost(numberOfNodes)) {
        // clean up the process
        cleanProcess(processExecutor);
    }
}
Also used : OperatingSystem(org.ow2.proactive.resourcemanager.utils.OperatingSystem) ArrayList(java.util.ArrayList) CommandLineBuilder(org.ow2.proactive.resourcemanager.utils.CommandLineBuilder) IOException(java.io.IOException) ProcessExecutor(org.ow2.proactive.process.ProcessExecutor) KeyException(java.security.KeyException)

Example 5 with CommandLineBuilder

use of org.ow2.proactive.resourcemanager.utils.CommandLineBuilder in project scheduling by ow2-proactive.

the class SSHInfrastructure method startNodeImpl.

/**
 * Internal node acquisition method
 * <p>
 * Starts a PA runtime on remote host using SSH, register it manually in the
 * nodesource.
 * <p>
 * Two command lines are built: one carrying the credentials, which is the
 * one executed through SSH, and an obfuscated one, which is the only one
 * attached to deploying nodes and reported in error messages.
 *
 * @param hostTracker The host on which one the node will be started
 * @param nbNodes number of nodes to deploy
 * @param depNodeURLs list of deploying or lost nodes urls created
 * @throws RMException
 *             acquisition failed
 */
protected void startNodeImpl(HostTracker hostTracker, int nbNodes, final List<String> depNodeURLs) throws RMException {
    String fs = getTargetOSObj().fs;
    CommandLineBuilder clb = super.getDefaultCommandLineBuilder(getTargetOSObj());
    // we take care of spaces in java path
    clb.setJavaPath(this.javaPath);
    // we set the rm.home prop
    clb.setRmHome(schedulingPath);
    // we set the java security policy file
    StringBuilder sb = new StringBuilder();
    // paths containing a space are wrapped in double quotes
    final boolean containsSpace = schedulingPath.contains(" ");
    String securitycmd = CentralPAPropertyRepository.JAVA_SECURITY_POLICY.getCmdLine();
    if (!this.javaOptions.contains(securitycmd)) {
        sb.append(securitycmd);
        if (containsSpace) {
            sb.append("\"");
        }
        sb.append(schedulingPath);
        sb.append(fs);
        sb.append("config");
        sb.append(fs);
        sb.append("security.java.policy-client");
        if (containsSpace) {
            sb.append("\"");
        }
        sb.append(" ");
    }
    // we set the log4j configuration file
    String log4jcmd = CentralPAPropertyRepository.LOG4J.getCmdLine();
    if (!this.javaOptions.contains(log4jcmd)) {
        sb.append(log4jcmd);
        if (containsSpace) {
            sb.append("\"");
        }
        // log4j only understands urls
        sb.append("file:");
        if (!schedulingPath.startsWith("/")) {
            sb.append("/" + schedulingPath.replace("\\", "/"));
        } else {
            sb.append(schedulingPath.replace("\\", "/"));
        }
        sb.append("/");
        sb.append("config");
        sb.append("/");
        sb.append("log");
        sb.append("/");
        sb.append("node.properties");
        if (containsSpace) {
            sb.append("\"");
        }
        sb.append(" ");
    }
    // we add extra java/PA configuration
    sb.append(this.javaOptions);
    clb.setPaProperties(sb.toString());
    // afterwards, node's name
    // generate the node name
    // current rmcore shortID should be added to ensure uniqueness
    final String nodeName = nodeNameBuilder.generateNodeName(hostTracker);
    clb.setNodeName(nodeName);
    clb.setNumberOfNodes(nbNodes);
    // finally, the credential's value
    String credString = null;
    try {
        credString = new String(getCredentials().getBase64());
    } catch (KeyException e1) {
        throw new RMException("Could not get base64 credentials", e1);
    }
    clb.setCredentialsValueAndNullOthers(credString);
    // add an expected node. every unexpected node will be discarded
    // cmdLine carries the credentials and is executed; obfuscatedCmdLine is
    // the only variant that may be displayed or reported
    String cmdLine;
    String obfuscatedCmdLine;
    try {
        cmdLine = clb.buildCommandLine(true);
        obfuscatedCmdLine = clb.buildCommandLine(false);
    } catch (IOException e2) {
        throw new RMException("Cannot build the " + RMNodeStarter.class.getSimpleName() + "'s command line.", e2);
    }
    // one escape the command to make it runnable through ssh: every double
    // quote is prefixed with a backslash (literal replacement, no regex)
    cmdLine = cmdLine.replace("\"", "\\\"");
    // we create a new deploying node before ssh command ran
    final List<String> createdNodeNames = RMNodeStarter.getWorkersNodeNames(nodeName, nbNodes);
    depNodeURLs.addAll(addMultipleDeployingNodes(createdNodeNames, obfuscatedCmdLine, "Deploying nodes on host " + hostTracker.getResolvedAddress(), super.nodeTimeOut));
    addTimeouts(depNodeURLs);
    Process p = null;
    try {
        p = Utils.runSSHCommand(hostTracker.getResolvedAddress(), cmdLine, sshOptions);
    } catch (IOException e1) {
        // report the obfuscated command line: cmdLine embeds the
        // credentials and must not leak into node descriptions or exceptions
        multipleDeclareDeployingNodeLost(depNodeURLs, "Cannot run command: " + obfuscatedCmdLine + ", with ssh options: " + sshOptions + " -\n The following exception occutred:\n " + getStackTraceAsString(e1));
        throw new RMException("Cannot run command: " + obfuscatedCmdLine + ", with ssh options: " + sshOptions, e1);
    }
    String lf = System.lineSeparator();
    int circuitBreakerThreshold = 5;
    while (!anyTimedOut(depNodeURLs) && circuitBreakerThreshold > 0) {
        try {
            // exitValue() throws IllegalThreadStateException while the ssh
            // process is still running, which is the expected situation
            int exitCode = p.exitValue();
            if (exitCode != 0) {
                logger.error("SSH subprocess at " + hostTracker.getResolvedAddress().getHostName() + " exited abnormally (" + exitCode + ").");
            } else {
                logger.error("Launching node process has exited normally whereas it shouldn't.");
            }
            String pOutPut = Utils.extractProcessOutput(p);
            String pErrPut = Utils.extractProcessErrput(p);
            final String description = "SSH command failed to launch node on host " + hostTracker.getResolvedAddress().getHostName() + lf + "   >Error code: " + exitCode + lf + "   >Errput: " + pErrPut + "   >Output: " + pOutPut;
            logger.error(description);
            if (super.checkAllNodesAreAcquiredAndDo(createdNodeNames, null, new Runnable() {

                public void run() {
                    SSHInfrastructure.this.multipleDeclareDeployingNodeLost(depNodeURLs, description);
                }
            })) {
                return;
            } else {
                // there isn't any race regarding node registration
                throw new RMException("SSH Node " + nodeName + " is not expected anymore because of an error.");
            }
        } catch (IllegalThreadStateException e) {
            logger.trace("IllegalThreadStateException while waiting for " + nodeName + " registration");
        }
        if (super.checkNodeIsAcquiredAndDo(nodeName, null, null)) {
            // registration is ok, we destroy the process
            p.destroy();
            return;
        }
        try {
            Thread.sleep(1000);
        } catch (Exception e) {
            circuitBreakerThreshold--;
            logger.trace("An exception occurred while monitoring ssh subprocess", e);
        }
    }
    // if we exit because of a timeout
    if (anyTimedOut(depNodeURLs)) {
        // we remove it
        removeTimeouts(depNodeURLs);
        // we destroy the process
        p.destroy();
        throw new RMException("Deploying Node " + nodeName + " not expected any more");
    }
    // NOTE(review): on this exit path the ssh process is neither destroyed
    // nor are the timeouts removed, unlike the timeout path above - confirm
    // whether that is intended
    if (circuitBreakerThreshold <= 0) {
        logger.error("Circuit breaker threshold reached while monitoring ssh subprocess.");
        throw new RMException("Several exceptions occurred while monitoring ssh subprocess.");
    }
}
Also used : Throwables.getStackTraceAsString(com.google.common.base.Throwables.getStackTraceAsString) CommandLineBuilder(org.ow2.proactive.resourcemanager.utils.CommandLineBuilder) IOException(java.io.IOException) KeyException(java.security.KeyException) RMException(org.ow2.proactive.resourcemanager.exception.RMException) KeyException(java.security.KeyException) IOException(java.io.IOException) RMException(org.ow2.proactive.resourcemanager.exception.RMException)

Aggregations

IOException (java.io.IOException)6 KeyException (java.security.KeyException)5 CommandLineBuilder (org.ow2.proactive.resourcemanager.utils.CommandLineBuilder)5 Throwables.getStackTraceAsString (com.google.common.base.Throwables.getStackTraceAsString)4 RMException (org.ow2.proactive.resourcemanager.exception.RMException)4 ArrayList (java.util.ArrayList)3 UnknownHostException (java.net.UnknownHostException)2 ChannelExec (com.jcraft.jsch.ChannelExec)1 JSch (com.jcraft.jsch.JSch)1 JSchException (com.jcraft.jsch.JSchException)1 Session (com.jcraft.jsch.Session)1 ByteArrayOutputStream (java.io.ByteArrayOutputStream)1 File (java.io.File)1 InputStream (java.io.InputStream)1 InetAddress (java.net.InetAddress)1 Properties (java.util.Properties)1 ExecutionException (java.util.concurrent.ExecutionException)1 ScheduledExecutorService (java.util.concurrent.ScheduledExecutorService)1 TimeoutException (java.util.concurrent.TimeoutException)1 ProcessExecutor (org.ow2.proactive.process.ProcessExecutor)1