Search in sources :

Example 31 with Command

use of org.ow2.proactive_grid_cloud_portal.cli.cmd.Command in project scheduling by ow2-proactive.

Method extractJavaPrefixCommandToCommandListFromScriptResult of the class JavaPrefixCommandExtractor.

/**
 * Builds the java fork prefix command carried by a script result.
 *
 * @param scriptResult script result to inspect; may be {@code null}.
 * @return a new list containing the java prefix command when the result wrapped by
 *         {@code scriptResult} is a {@link ForkEnvironmentScriptResult}; an empty list
 *         otherwise.
 */
public List<String> extractJavaPrefixCommandToCommandListFromScriptResult(ScriptResult scriptResult) {
    List<String> prefixCommand = new ArrayList<>();
    // Guard clause: nothing to extract unless a fork environment result is wrapped.
    if (scriptResult == null || !(scriptResult.getResult() instanceof ForkEnvironmentScriptResult)) {
        return prefixCommand;
    }
    ForkEnvironmentScriptResult forkEnvironment = (ForkEnvironmentScriptResult) scriptResult.getResult();
    prefixCommand.addAll(forkEnvironment.getJavaPrefixCommand());
    return prefixCommand;
}
Also used : ArrayList(java.util.ArrayList) ForkEnvironmentScriptResult(org.ow2.proactive.scripting.ForkEnvironmentScriptResult)

Example 32 with Command

use of org.ow2.proactive_grid_cloud_portal.cli.cmd.Command in project scheduling by ow2-proactive.

Method startNodeImpl of the class AutoUpdateInfrastructure.

/**
 * Internal node acquisition method.
 * <p>
 * Fills the configured command with node-specific and system properties, runs it
 * through {@code bash -c}, and then polls once per second until the node is
 * acquired, the launching process exits, a deploying node times out, or the
 * circuit breaker (5 interrupted waits) trips.
 * <p>
 * If the node source credentials cannot be read, the error is logged and the
 * method returns without deploying anything. If the calling thread is interrupted
 * while waiting, polling continues as before but the interrupt status is restored
 * before this method exits.
 *
 * @param hostTracker The host on which the node will be started
 * @param nbNodes number of nodes to deploy
 * @param depNodeURLs list that receives the URLs of the deploying nodes created by this call
 * @throws org.ow2.proactive.resourcemanager.exception.RMException
 *             if the command cannot be run, the launching process exits while the
 *             node is not acquired, a deploying node times out, or the circuit
 *             breaker threshold is reached
 */
protected void startNodeImpl(HostTracker hostTracker, int nbNodes, final List<String> depNodeURLs) throws RMException {
    final String nodeName = this.nodeSource.getName() + "-" + ProActiveCounter.getUniqID();
    String credentials = "";
    try {
        credentials = new String(nodeSource.getAdministrator().getCredentials().getBase64());
    } catch (KeyException e) {
        // keep the cause in the log so the failure can be diagnosed
        logger.error("Invalid credentials", e);
        return;
    }
    // properties substituted into the user-provided command
    Properties localProperties = new Properties();
    localProperties.put(NODE_NAME, nodeName);
    localProperties.put(HOST_NAME, hostTracker.getResolvedAddress().getHostName());
    localProperties.put(NODESOURCE_CREDENTIALS, credentials);
    localProperties.put(NODESOURCE_NAME, nodeSource.getName());
    localProperties.put(NB_NODES, nbNodes);
    String filledCommand = replaceProperties(command, localProperties);
    filledCommand = replaceProperties(filledCommand, System.getProperties());
    final List<String> createdNodeNames = RMNodeStarter.getWorkersNodeNames(nodeName, nbNodes);
    depNodeURLs.addAll(addMultipleDeployingNodes(createdNodeNames, filledCommand, "Deploying node on host " + hostTracker.getResolvedAddress(), this.nodeTimeOut));
    addTimeouts(depNodeURLs);
    Process p;
    try {
        logger.debug("Deploying node: " + nodeName);
        logger.debug("Launching the command: " + filledCommand);
        p = Runtime.getRuntime().exec(new String[] { "bash", "-c", filledCommand });
    } catch (IOException e1) {
        multipleDeclareDeployingNodeLost(depNodeURLs, "Cannot run command: " + filledCommand + " - \n The following exception occurred: " + getStackTraceAsString(e1));
        throw new RMException("Cannot run command: " + filledCommand, e1);
    }
    String lf = System.lineSeparator();
    int circuitBreakerThreshold = 5;
    // Thread.sleep clears the interrupt status when it throws; remember the
    // interruption so it can be restored once monitoring is over.
    boolean interrupted = false;
    try {
        while (!anyTimedOut(depNodeURLs) && circuitBreakerThreshold > 0) {
            try {
                // exitValue() throws IllegalThreadStateException while the process is alive
                int exitCode = p.exitValue();
                if (exitCode != 0) {
                    logger.error("Child process at " + hostTracker.getResolvedAddress().getHostName() + " exited abnormally (" + exitCode + ").");
                } else {
                    logger.error("Launching node script has exited normally whereas it shouldn't.");
                }
                String pOutPut = Utils.extractProcessOutput(p);
                String pErrPut = Utils.extractProcessErrput(p);
                final String description = "Script failed to launch a node on host " + hostTracker.getResolvedAddress().getHostName() + lf + "   >Error code: " + exitCode + lf + "   >Errput: " + pErrPut + "   >Output: " + pOutPut;
                logger.error(description);
                if (super.checkNodeIsAcquiredAndDo(nodeName, null, new Runnable() {

                    @Override
                    public void run() {
                        multipleDeclareDeployingNodeLost(depNodeURLs, description);
                    }
                })) {
                    return;
                } else {
                    // there isn't any race regarding node registration
                    throw new RMException("A node " + nodeName + " is not expected anymore because of an error.");
                }
            } catch (IllegalThreadStateException e) {
                // the launching process is still running: keep waiting
                logger.trace("IllegalThreadStateException while waiting for " + nodeName + " registration");
            }
            if (super.checkNodeIsAcquiredAndDo(nodeName, null, null)) {
                // registration is ok, we destroy the process
                logger.debug("Destroying the process: " + p);
                try {
                    ProcessTree.get().get(p).kill();
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                }
                return;
            }
            try {
                Thread.sleep(1000);
            } catch (InterruptedException e) {
                interrupted = true;
                circuitBreakerThreshold--;
                logger.trace("An exception occurred while monitoring a child process", e);
            }
        }
        // if we exit because of a timeout
        if (anyTimedOut(depNodeURLs)) {
            // we remove it
            removeTimeouts(depNodeURLs);
            // we destroy the process
            p.destroy();
            throw new RMException("Deploying Node " + nodeName + " not expected any more");
        }
        // NOTE(review): on this path the launched process is neither destroyed nor are
        // the timeouts removed - confirm this is intended.
        if (circuitBreakerThreshold <= 0) {
            logger.error("Circuit breaker threshold reached while monitoring a child process.");
            throw new RMException("Several exceptions occurred while monitoring a child process.");
        }
    } finally {
        if (interrupted) {
            // hand the interruption back to the caller instead of losing it
            Thread.currentThread().interrupt();
        }
    }
}
Also used : Throwables.getStackTraceAsString(com.google.common.base.Throwables.getStackTraceAsString) IOException(java.io.IOException) Properties(java.util.Properties) KeyException(java.security.KeyException) RMException(org.ow2.proactive.resourcemanager.exception.RMException) KeyException(java.security.KeyException) IOException(java.io.IOException) RMException(org.ow2.proactive.resourcemanager.exception.RMException)

Example 33 with Command

use of org.ow2.proactive_grid_cloud_portal.cli.cmd.Command in project scheduling by ow2-proactive.

Method deleteJob of the class BatchJobInfrastructure.

/**
 * Runs a {@link #getDeleteJobCommand()} command on the remote host for the
 * given jobID and monitors the exit.
 * <p>
 * The exit code of the ssh process is polled once per second until it is
 * available or until {@code nodeTimeOut} milliseconds have elapsed. On timeout
 * the local ssh process is destroyed before the exception is raised. If the
 * calling thread is interrupted while waiting, polling continues, and the
 * interrupt status is restored before this method exits.
 *
 * @param jobID
 *            the jobID string to delete
 * @throws RMException
 *             if the ssh command cannot be started, or the
 *             {@link #getDeleteJobCommand()} command returned a non-zero exit
 *             code or timed out
 */
private void deleteJob(String jobID) throws RMException {
    String deleteCmd = getDeleteJobCommand();
    String cmd = deleteCmd + " " + jobID;
    Process del;
    try {
        del = Utils.runSSHCommand(InetAddress.getByName(this.serverName), cmd, this.sshOptions);
    } catch (Exception e1) {
        String sshFailure = "Cannot ssh " + this.serverName + " to issue " + deleteCmd + " command. job with jobID: " + jobID + " won't be deleted.";
        logger.warn(sshFailure, e1);
        throw new RMException(sshFailure, e1);
    }
    long timeStamp = System.currentTimeMillis();
    // Thread.sleep clears the interrupt status when it throws; remember the
    // interruption so it can be restored once the wait is over.
    boolean interrupted = false;
    try {
        while (true) {
            try {
                // exitValue() throws IllegalThreadStateException while the process is alive
                int exitCode = del.exitValue();
                if (exitCode != 0) {
                    String deleteFailure = "Cannot delete job " + jobID + ". " + deleteCmd + " command returned != 0 -> " + exitCode;
                    logger.error(deleteFailure);
                    throw new RMException(deleteFailure);
                } else {
                    logger.debug("Job " + jobID + " deleted.");
                    return;
                }
            } catch (IllegalThreadStateException e) {
                // the process hasn't exited yet... don't eat exception, trace
                // it...
                logger.trace("waiting for " + deleteCmd + " exit code.", e);
            }
            try {
                Thread.sleep(1000);
            } catch (InterruptedException e) {
                // the thread was interrupted... keep waiting for the command, but
                // remember the interruption and trace it...
                interrupted = true;
                logger.trace("sleep interrupted while waiting for " + deleteCmd + " to exit.", e);
            }
            if ((System.currentTimeMillis() - timeStamp) >= nodeTimeOut) {
                // do not leave the local ssh process running once we give up on it
                del.destroy();
                String timeoutFailure = "Cannot delete job " + jobID + ". " + deleteCmd + " command timed out.";
                logger.error(timeoutFailure);
                throw new RMException(timeoutFailure);
            }
        }
    } finally {
        if (interrupted) {
            // hand the interruption back to the caller instead of losing it
            Thread.currentThread().interrupt();
        }
    }
}
Also used : Throwables.getStackTraceAsString(com.google.common.base.Throwables.getStackTraceAsString) KeyException(java.security.KeyException) IOException(java.io.IOException) UnknownHostException(java.net.UnknownHostException) RMException(org.ow2.proactive.resourcemanager.exception.RMException) RMException(org.ow2.proactive.resourcemanager.exception.RMException)

Example 34 with Command

use of org.ow2.proactive_grid_cloud_portal.cli.cmd.Command in project scheduling by ow2-proactive.

Method startNode of the class BatchJobInfrastructure.

/**
 * Builds the command line to execute on the PBS frontend and wait for every
 * launched nodes to register. If the node doesn't register (ie.
 * {@link #internalRegisterAcquiredNode(Node)} isn't called) before the
 * timeout (configurable) value, an exception is raised. If the qSub command
 * submitted to the PBS frontend fails, the node supposed to be launched is
 * not expected anymore and will be discarded at registration time.
 * <p>
 * The submit command's standard output is read in full and kept as the raw
 * job id. If the calling thread is interrupted while waiting for the
 * registration, monitoring continues (each failed wait counts against a
 * circuit breaker of 5), and the interrupt status is restored before this
 * method exits.
 *
 * @throws RMException
 *             if the ssh command cannot be executed, the job submission
 *             cannot be recovered from, the circuit breaker threshold is
 *             reached, or the monitoring loop exits without the node being
 *             registered
 */
private void startNode() throws RMException {
    CommandLineBuilder clb = new CommandLineBuilder();
    // generate the node name
    // current rmcore shortID should be added to ensure uniqueness
    String nodeName = getBatchinJobSystemName() + "-" + nodeSource.getName() + "-" + ProActiveCounter.getUniqID();
    clb.setNodeName(nodeName);
    clb.setJavaPath(this.javaPath);
    clb.setRmURL(getRmUrl());
    clb.setRmHome(this.schedulingPath);
    clb.setSourceName(this.nodeSource.getName());
    clb.setPaProperties(this.javaOptions);
    // NOTE(review): the code below presumably relies on handleFailedDeployment always
    // throwing; otherwise host / cmd would still be null further down - confirm.
    try {
        clb.setCredentialsValueAndNullOthers(new String(getCredentials().getBase64()));
    } catch (KeyException e) {
        this.handleFailedDeployment(clb, e);
    }
    InetAddress host = null;
    try {
        host = InetAddress.getByName(this.serverName);
    } catch (UnknownHostException e) {
        this.handleFailedDeployment(clb, e);
    }
    String deleteCmd = getDeleteJobCommand();
    String submitCmd = getSubmitJobCommand();
    // build the command: echo "script.sh params"|qsub params
    String cmd = null;
    String obfuscatedCmd = null;
    try {
        cmd = "echo \\\"" + clb.buildCommandLine(true).replace("\"", "\\\"") + "\\\" | " + submitCmd + " " + this.submitJobOpt;
        obfuscatedCmd = "echo \\\"" + clb.buildCommandLine(false).replace("\"", "\\\"") + "\\\" | " + submitCmd + " " + this.submitJobOpt;
    } catch (IOException e) {
        this.handleFailedDeployment(clb, e);
    }
    // add a deploying node.
    final String dnURL = super.addDeployingNode(nodeName, obfuscatedCmd, "Deploying node on " + getBatchinJobSystemName() + " scheduler", this.nodeTimeOut);
    putPnTimeout(dnURL, Boolean.FALSE);
    // executing the command
    Process p;
    try {
        p = Utils.runSSHCommand(host, cmd, this.sshOptions);
    } catch (IOException e1) {
        throw new RMException("Cannot execute ssh command: " + cmd + " on host: " + this.serverName, e1);
    }
    // recover the Job ID through stdout
    StringBuilder submitOutput = new StringBuilder();
    InputStream in = p.getInputStream();
    int b = -1;
    try {
        while ((b = in.read()) > -1) {
            submitOutput.append((char) b);
        }
    } catch (IOException e) {
        // best effort: keep whatever was read so far, but do not hide the failure
        logger.warn("Cannot fully read the output of the submit command for node " + nodeName, e);
    }
    String id = submitOutput.toString();
    // check for registration
    // at this point, the ssh process should have already exited because it
    // only handle the job submission, not the execution... furthermore
    // the "id" is defined
    String lf = System.lineSeparator();
    final long timeout = nodeTimeOut;
    long t1 = System.currentTimeMillis();
    // Hack. SSHClient fails but qSub succeeds.
    boolean isJobIDValid = false;
    // Tries to wait for this node
    // registration...
    int circuitBreakerThreshold = 5;
    // Thread.sleep clears the interrupt status when it throws; remember the
    // interruption so it can be restored once monitoring and cleanup are over.
    boolean interrupted = false;
    try {
        while (!getPnTimeout(dnURL) && circuitBreakerThreshold > 0) {
            try {
                // exitValue() throws IllegalThreadStateException while the process is alive
                int exitCode = p.exitValue();
                if (exitCode != 0 && !isJobIDValid) {
                    logger.warn("SSH subprocess at " + host.getHostName() + " exit code != 0 but IM tries to recover from this error...Current submit command's output: " + id + " and associated node's name: " + nodeName);
                    String extractedID = this.extractSubmitOutput(id);
                    String errput = this.extractProcessErrput(p);
                    final String description = "SSH command failed to launch node on " + getBatchinJobSystemName() + " scheduler" + lf + "   >Error code: " + exitCode + lf + "   >Errput: " + errput + "   >Output: " + id;
                    // a non-empty extracted id means the submission itself went through
                    if (extractedID != null && !extractedID.isEmpty()) {
                        isJobIDValid = true;
                    }
                    // defines how to recover from this state
                    // throws a RMException if we can't
                    handleWrongJobTermination(isJobIDValid, nodeName, dnURL, host, id, description, exitCode, submitCmd, deleteCmd);
                }
            } catch (IllegalThreadStateException e) {
                // process has not returned yet
                logger.trace("Waiting for ssh process to exit in BatchJobInfrastructure");
            }
            if (super.checkNodeIsAcquiredAndDo(nodeName, null, null)) {
                // registration is ok
                p.destroy();
                addNodeAndDecrementDeployingNode(nodeName, this.extractSubmitOutput(id));
                return;
            }
            try {
                logger.debug("Waiting for node " + nodeName + " registration... time to timeout: " + (timeout - (System.currentTimeMillis() - t1)));
                Thread.sleep(BatchJobInfrastructure.NODE_ACQUISITION_CHECK_RATE);
            } catch (Exception e) {
                if (e instanceof InterruptedException) {
                    interrupted = true;
                }
                circuitBreakerThreshold--;
                logger.error("While monitoring ssh subprocess.", e);
            }
        }
        // end of while loop, either deploying node timeout/removed or
        // threshold reached
        // the node is not expected anymore
        atomicRemovePnTimeoutAndJob(nodeName, dnURL, p, id);
        if (circuitBreakerThreshold <= 0) {
            logger.error("Circuit breaker threshold reached while monitoring ssh subprocess.");
            throw new RMException("Several exceptions occurred while monitoring ssh subprocess.");
        }
        // if we are here we reached an invalid state
        throw new RMException("Invalid state, exit from a control loop with threshold > 0 and expected deploying node");
    } finally {
        if (interrupted) {
            // hand the interruption back to the caller instead of losing it
            Thread.currentThread().interrupt();
        }
    }
}
Also used : UnknownHostException(java.net.UnknownHostException) InputStream(java.io.InputStream) CommandLineBuilder(org.ow2.proactive.resourcemanager.utils.CommandLineBuilder) Throwables.getStackTraceAsString(com.google.common.base.Throwables.getStackTraceAsString) IOException(java.io.IOException) KeyException(java.security.KeyException) RMException(org.ow2.proactive.resourcemanager.exception.RMException) KeyException(java.security.KeyException) IOException(java.io.IOException) UnknownHostException(java.net.UnknownHostException) RMException(org.ow2.proactive.resourcemanager.exception.RMException) InetAddress(java.net.InetAddress)

Example 35 with Command

use of org.ow2.proactive_grid_cloud_portal.cli.cmd.Command in project scheduling by ow2-proactive.

Method killNodeImpl of the class CLIInfrastructure.

/**
 * {@inheritDoc}
 * <p>
 * Runs the removal script asynchronously through the node source: the
 * command line is the interpreter, the removal script's absolute path, the
 * host name and the node URL, separated by spaces. The script's exit code,
 * standard output and error output are logged. The removal thread counter is
 * incremented before the task is submitted and always decremented when the
 * task ends.
 */
@Override
protected void killNodeImpl(Node node, InetAddress h) {
    final Node n = node;
    final InetAddress host = h;
    incrementNbRemovalThread();
    this.nodeSource.executeInParallel(new Runnable() {

        @Override
        public void run() {
            try {
                final String commandLine = interpreter + " " + removalScript.getAbsolutePath() + " " + host.getHostName() + " " + n.getNodeInformation().getURL();
                Process p;
                try {
                    logger.debug("Launching the command: " + commandLine);
                    // NOTE(review): exec(String) splits the command on whitespace, so
                    // paths containing spaces are presumably not supported - confirm.
                    p = Runtime.getRuntime().exec(commandLine);
                    // TODO add timeout behavior
                    int exitCode = p.waitFor();
                    String pOutPut = Utils.extractProcessOutput(p);
                    String pErrPut = Utils.extractProcessErrput(p);
                    String lf = System.lineSeparator();
                    final String description = "Removal script ouput" + lf + "   >Error code: " + exitCode + lf + "   >Errput: " + pErrPut + "   >Output: " + pOutPut;
                    if (exitCode != 0) {
                        logger.error("Child process at " + host.getHostName() + " exited abnormally (" + exitCode + ").");
                        logger.error(description);
                    } else {
                        logger.info("Removal node process has exited normally for " + n.getNodeInformation().getURL());
                        logger.debug(description);
                    }
                } catch (IOException e1) {
                    // log the stack trace together with the command that failed
                    logger.error("An I/O error occurred while running the removal command: " + commandLine, e1);
                }
            } catch (InterruptedException e) {
                // waitFor() was interrupted: keep the interrupt status for the executing thread
                Thread.currentThread().interrupt();
                logger.trace("An exception occurred during node removal", e);
            } catch (Exception e) {
                logger.trace("An exception occurred during node removal", e);
            } finally {
                // always balance the increment done before the task was submitted
                decrementNbRemovalThread();
            }
        }
    });
}
Also used : Node(org.objectweb.proactive.core.node.Node) Throwables.getStackTraceAsString(com.google.common.base.Throwables.getStackTraceAsString) IOException(java.io.IOException) InetAddress(java.net.InetAddress) IOException(java.io.IOException) RMException(org.ow2.proactive.resourcemanager.exception.RMException)

Aggregations

Test (org.junit.Test)12 IOException (java.io.IOException)11 RMDeployingNode (org.ow2.proactive.resourcemanager.rmnode.RMDeployingNode)10 Throwables.getStackTraceAsString (com.google.common.base.Throwables.getStackTraceAsString)9 ArrayList (java.util.ArrayList)9 RMException (org.ow2.proactive.resourcemanager.exception.RMException)9 TaskFlowJob (org.ow2.proactive.scheduler.common.job.TaskFlowJob)9 NativeTask (org.ow2.proactive.scheduler.common.task.NativeTask)9 KeyException (java.security.KeyException)8 File (java.io.File)7 ForkEnvironment (org.ow2.proactive.scheduler.common.task.ForkEnvironment)6 JobCreationException (org.ow2.proactive.scheduler.common.exception.JobCreationException)5 CommandLineBuilder (org.ow2.proactive.resourcemanager.utils.CommandLineBuilder)4 JavaTask (org.ow2.proactive.scheduler.common.task.JavaTask)4 UnknownHostException (java.net.UnknownHostException)3 Client (org.ow2.proactive.resourcemanager.authentication.Client)3 Command (org.ow2.proactive_grid_cloud_portal.cli.cmd.Command)3 InetAddress (java.net.InetAddress)2 Properties (java.util.Properties)2 ExecutionException (java.util.concurrent.ExecutionException)2