use of org.ow2.proactive_grid_cloud_portal.cli.cmd.Command in project scheduling by ow2-proactive.
the class JavaPrefixCommandExtractor method extractJavaPrefixCommandToCommandListFromScriptResult.
/**
 * Builds the java fork prefix command carried by a fork environment script result.
 *
 * @param scriptResult script result to inspect; may be {@code null}.
 * @return a new list holding the java prefix command; the list is empty when
 *         {@code scriptResult} is {@code null} or when its result is not a
 *         {@link ForkEnvironmentScriptResult}.
 */
public List<String> extractJavaPrefixCommandToCommandListFromScriptResult(ScriptResult scriptResult) {
    List<String> prefix = new ArrayList<>();

    boolean carriesForkEnvironment = scriptResult != null &&
                                     scriptResult.getResult() instanceof ForkEnvironmentScriptResult;
    if (!carriesForkEnvironment) {
        // Nothing to extract: hand back the empty list.
        return prefix;
    }

    ForkEnvironmentScriptResult forkEnvironment = (ForkEnvironmentScriptResult) scriptResult.getResult();
    prefix.addAll(forkEnvironment.getJavaPrefixCommand());
    return prefix;
}
use of org.ow2.proactive_grid_cloud_portal.cli.cmd.Command in project scheduling by ow2-proactive.
the class AutoUpdateInfrastructure method startNodeImpl.
/**
 * Internal node acquisition method.
 * <p>
 * Fills the configured command with node-specific properties and then with the JVM system
 * properties, declares the corresponding deploying nodes, runs the command through
 * {@code bash -c} and polls roughly once per second until one of the following happens:
 * <ul>
 * <li>the launching process exits (always treated as an error, whatever the exit code),</li>
 * <li>{@code checkNodeIsAcquiredAndDo} reports the node as acquired (the process tree is then
 * killed and the method returns),</li>
 * <li>one of the deploying nodes times out,</li>
 * <li>five exceptions were raised while sleeping between two polls (circuit breaker).</li>
 * </ul>
 * If the thread is interrupted while waiting, the interruption is counted by the circuit
 * breaker and the thread's interrupted status is restored before this method exits.
 *
 * @param hostTracker the host on which the node will be started
 * @param nbNodes number of nodes to deploy
 * @param depNodeURLs list that receives the URLs of the deploying (or lost) nodes created here
 * @throws org.ow2.proactive.resourcemanager.exception.RMException
 *             if the command cannot be run, if the launching process exits and the node is
 *             not expected anymore, if a deploying node times out, or if the circuit breaker
 *             threshold is reached
 */
protected void startNodeImpl(HostTracker hostTracker, int nbNodes, final List<String> depNodeURLs) throws RMException {
    final String nodeName = this.nodeSource.getName() + "-" + ProActiveCounter.getUniqID();
    String credentials = "";
    try {
        credentials = new String(nodeSource.getAdministrator().getCredentials().getBase64());
    } catch (KeyException e) {
        // Historical behaviour is kept (silent return, no RMException), but the cause is now logged.
        logger.error("Invalid credentials", e);
        return;
    }

    // Node-specific placeholders are substituted first, then the JVM system properties.
    Properties localProperties = new Properties();
    localProperties.put(NODE_NAME, nodeName);
    localProperties.put(HOST_NAME, hostTracker.getResolvedAddress().getHostName());
    localProperties.put(NODESOURCE_CREDENTIALS, credentials);
    localProperties.put(NODESOURCE_NAME, nodeSource.getName());
    localProperties.put(NB_NODES, nbNodes);
    String filledCommand = replaceProperties(command, localProperties);
    filledCommand = replaceProperties(filledCommand, System.getProperties());

    final List<String> createdNodeNames = RMNodeStarter.getWorkersNodeNames(nodeName, nbNodes);
    depNodeURLs.addAll(addMultipleDeployingNodes(createdNodeNames,
                                                 filledCommand,
                                                 "Deploying node on host " + hostTracker.getResolvedAddress(),
                                                 this.nodeTimeOut));
    addTimeouts(depNodeURLs);

    Process p;
    try {
        logger.debug("Deploying node: " + nodeName);
        // NOTE(review): filledCommand may embed the node source credentials - confirm that
        // exposing it at debug level (and in the deploying node description) is acceptable.
        logger.debug("Launching the command: " + filledCommand);
        p = Runtime.getRuntime().exec(new String[] { "bash", "-c", filledCommand });
    } catch (IOException e1) {
        multipleDeclareDeployingNodeLost(depNodeURLs,
                                         "Cannot run command: " + filledCommand +
                                                      " - \n The following exception occurred: " +
                                                      getStackTraceAsString(e1));
        throw new RMException("Cannot run command: " + filledCommand, e1);
    }

    String lf = System.lineSeparator();
    int circuitBreakerThreshold = 5;
    // Remembers an interruption received while sleeping so that it is not lost for the caller.
    boolean interrupted = false;
    try {
        while (!anyTimedOut(depNodeURLs) && circuitBreakerThreshold > 0) {
            try {
                // exitValue() throws IllegalThreadStateException while the process is still running.
                int exitCode = p.exitValue();
                if (exitCode != 0) {
                    logger.error("Child process at " + hostTracker.getResolvedAddress().getHostName() +
                                 " exited abnormally (" + exitCode + ").");
                } else {
                    logger.error("Launching node script has exited normally whereas it shouldn't.");
                }
                String pOutPut = Utils.extractProcessOutput(p);
                String pErrPut = Utils.extractProcessErrput(p);
                final String description = "Script failed to launch a node on host " +
                                           hostTracker.getResolvedAddress().getHostName() + lf +
                                           "   >Error code: " + exitCode + lf + "   >Errput: " + pErrPut +
                                           "   >Output: " + pOutPut;
                logger.error(description);
                if (super.checkNodeIsAcquiredAndDo(nodeName, null, new Runnable() {
                    @Override
                    public void run() {
                        multipleDeclareDeployingNodeLost(depNodeURLs, description);
                    }
                })) {
                    return;
                } else {
                    // there isn't any race regarding node registration
                    throw new RMException("A node " + nodeName + " is not expected anymore because of an error.");
                }
            } catch (IllegalThreadStateException e) {
                // The launching process is still running: keep waiting.
                logger.trace("IllegalThreadStateException while waiting for " + nodeName + " registration");
            }

            if (super.checkNodeIsAcquiredAndDo(nodeName, null, null)) {
                // registration is ok, we destroy the process
                logger.debug("Destroying the process: " + p);
                try {
                    ProcessTree.get().get(p).kill();
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                }
                return;
            }

            try {
                Thread.sleep(1000);
            } catch (InterruptedException e) {
                // Count the interruption like before, but remember it instead of dropping it;
                // re-interrupting right here would make every following sleep fail immediately.
                interrupted = true;
                circuitBreakerThreshold--;
                logger.trace("An exception occurred while monitoring a child process", e);
            }
        }

        // if we exit because of a timeout
        if (anyTimedOut(depNodeURLs)) {
            // we remove it
            removeTimeouts(depNodeURLs);
            // we destroy the process
            p.destroy();
            throw new RMException("Deploying Node " + nodeName + " not expected any more");
        }
        if (circuitBreakerThreshold <= 0) {
            // NOTE(review): the child process is left running on this path - confirm whether
            // it should be destroyed like in the timeout branch above.
            logger.error("Circuit breaker threshold reached while monitoring a child process.");
            throw new RMException("Several exceptions occurred while monitoring a child process.");
        }
    } finally {
        if (interrupted) {
            Thread.currentThread().interrupt();
        }
    }
}
use of org.ow2.proactive_grid_cloud_portal.cli.cmd.Command in project scheduling by ow2-proactive.
the class BatchJobInfrastructure method deleteJob.
/**
 * Runs the {@link #getDeleteJobCommand()} command on the remote host for the
 * given jobID and monitors the exit.
 * <p>
 * The ssh process is polled once per second. When it has not exited after
 * {@code nodeTimeOut} milliseconds it is destroyed and an exception is raised.
 * An interruption received while waiting does not abort the wait; the thread's
 * interrupted status is restored before this method exits.
 *
 * @param jobID
 *            the jobID string to delete
 * @throws RMException
 *             if the ssh command cannot be started, if the
 *             {@link #getDeleteJobCommand()} command returns a non-zero exit
 *             code, or if it does not exit before the timeout
 */
private void deleteJob(String jobID) throws RMException {
    String deleteCmd = getDeleteJobCommand();
    String cmd = deleteCmd + " " + jobID;
    Process del;
    try {
        del = Utils.runSSHCommand(InetAddress.getByName(this.serverName), cmd, this.sshOptions);
    } catch (Exception e1) {
        String message = "Cannot ssh " + this.serverName + " to issue " + deleteCmd +
                         " command. job with jobID: " + jobID + " won't be deleted.";
        logger.warn(message, e1);
        throw new RMException(message, e1);
    }

    long timeStamp = System.currentTimeMillis();
    // Remembers an interruption received while sleeping so that it is not lost for the caller.
    boolean interrupted = false;
    try {
        while (true) {
            try {
                // exitValue() throws IllegalThreadStateException while the process is still running.
                int exitCode = del.exitValue();
                if (exitCode != 0) {
                    String message = "Cannot delete job " + jobID + ". " + deleteCmd +
                                     " command returned != 0 -> " + exitCode;
                    logger.error(message);
                    throw new RMException(message);
                }
                logger.debug("Job " + jobID + " deleted.");
                return;
            } catch (IllegalThreadStateException e) {
                // the process hasn't exited yet... don't eat exception, trace it...
                logger.trace("waiting for " + deleteCmd + " exit code.", e);
            }

            try {
                Thread.sleep(1000);
            } catch (InterruptedException e) {
                // Keep waiting as before, but remember the interruption; re-interrupting here
                // would turn the remaining wait into a busy loop.
                interrupted = true;
                logger.trace("sleep interrupted while waiting for " + deleteCmd + " to exit.", e);
            }

            if ((System.currentTimeMillis() - timeStamp) >= nodeTimeOut) {
                // Do not leave the ssh process running once we give up on it.
                del.destroy();
                String message = "Cannot delete job " + jobID + ". " + deleteCmd + " command timed out.";
                logger.error(message);
                throw new RMException(message);
            }
        }
    } finally {
        if (interrupted) {
            Thread.currentThread().interrupt();
        }
    }
}
use of org.ow2.proactive_grid_cloud_portal.cli.cmd.Command in project scheduling by ow2-proactive.
the class BatchJobInfrastructure method startNode.
/**
 * Builds the command line to execute on the batch system frontend and waits
 * for the launched node to register. If the node doesn't register (ie.
 * {@link #internalRegisterAcquiredNode(Node)} isn't called) before the
 * timeout (configurable) value, an exception is raised. If the submit
 * command sent to the frontend fails, the node supposed to be launched is
 * not expected anymore and will be discarded at registration time.
 * <p>
 * The method only returns normally when the node is reported as acquired;
 * every other way out of the monitoring loop ends with an {@link RMException}.
 * If the thread is interrupted while waiting, the interruption is counted by
 * the circuit breaker and the thread's interrupted status is restored before
 * this method exits.
 *
 * @throws RMException
 *             if the ssh command cannot be executed, if the deploying node
 *             times out or is removed, or if too many exceptions occur
 *             while monitoring the ssh subprocess
 */
private void startNode() throws RMException {
    CommandLineBuilder clb = new CommandLineBuilder();
    // generate the node name
    // current rmcore shortID should be added to ensure uniqueness
    String nodeName = getBatchinJobSystemName() + "-" + nodeSource.getName() + "-" + ProActiveCounter.getUniqID();
    clb.setNodeName(nodeName);
    clb.setJavaPath(this.javaPath);
    clb.setRmURL(getRmUrl());
    clb.setRmHome(this.schedulingPath);
    clb.setSourceName(this.nodeSource.getName());
    clb.setPaProperties(this.javaOptions);
    // NOTE(review): the three handlers below presumably rely on handleFailedDeployment
    // throwing an RMException; otherwise host/cmd would still be null further down - confirm.
    try {
        clb.setCredentialsValueAndNullOthers(new String(getCredentials().getBase64()));
    } catch (KeyException e) {
        this.handleFailedDeployment(clb, e);
    }
    InetAddress host = null;
    try {
        host = InetAddress.getByName(this.serverName);
    } catch (UnknownHostException e) {
        this.handleFailedDeployment(clb, e);
    }
    String deleteCmd = getDeleteJobCommand();
    String submitCmd = getSubmitJobCommand();
    // build the command: echo "script.sh params"|qsub params
    // cmd is the one actually executed, obfuscatedCmd is the one attached to the deploying node
    String cmd = null;
    String obfuscatedCmd = null;
    try {
        cmd = "echo \\\"" + clb.buildCommandLine(true).replace("\"", "\\\"") + "\\\" | " + submitCmd + " " +
              this.submitJobOpt;
        obfuscatedCmd = "echo \\\"" + clb.buildCommandLine(false).replace("\"", "\\\"") + "\\\" | " + submitCmd +
                        " " + this.submitJobOpt;
    } catch (IOException e) {
        this.handleFailedDeployment(clb, e);
    }
    // add an deploying node.
    final String dnURL = super.addDeployingNode(nodeName,
                                                obfuscatedCmd,
                                                "Deploying node on " + getBatchinJobSystemName() + " scheduler",
                                                this.nodeTimeOut);
    putPnTimeout(dnURL, Boolean.FALSE);
    // executing the command
    Process p;
    try {
        p = Utils.runSSHCommand(host, cmd, this.sshOptions);
    } catch (IOException e1) {
        throw new RMException("Cannot execute ssh command: " + cmd + " on host: " + this.serverName, e1);
    }
    // recover the Job ID through stdout (read until end of stream)
    StringBuilder submitOutput = new StringBuilder();
    InputStream in = p.getInputStream();
    try {
        int b;
        while ((b = in.read()) > -1) {
            submitOutput.append((char) b);
        }
    } catch (IOException e) {
        // Best effort: go on with whatever was read, but do not hide the failure.
        logger.warn("Cannot fully read the submit command output for node " + nodeName, e);
    }
    String id = submitOutput.toString();
    // check for registration
    // at this point, the ssh process should have already exited because it
    // only handle the job submission, not the execution... furthermore
    // the "id" is defined
    String lf = System.lineSeparator();
    final long timeout = nodeTimeOut;
    long t1 = System.currentTimeMillis();
    // Hack. SSHClient fails but qSub succeeds.
    boolean isJobIDValid = false;
    // Tries to wait for this node
    // registration...
    int circuitBreakerThreshold = 5;
    // Remembers an interruption received while sleeping so that it is not lost for the caller.
    boolean interrupted = false;
    try {
        while (!getPnTimeout(dnURL) && circuitBreakerThreshold > 0) {
            try {
                // exitValue() throws IllegalThreadStateException while the process is still running.
                int exitCode = p.exitValue();
                if (exitCode != 0 && !isJobIDValid) {
                    logger.warn("SSH subprocess at " + host.getHostName() +
                                " exit code != 0 but IM tries to recover from this error...Current submit command's output: " +
                                id + " and associated node's name: " + nodeName);
                    String extractedID = this.extractSubmitOutput(id);
                    String errput = this.extractProcessErrput(p);
                    final String description = "SSH command failed to launch node on " +
                                               getBatchinJobSystemName() + " scheduler" + lf +
                                               "   >Error code: " + exitCode + lf + "   >Errput: " + errput +
                                               "   >Output: " + id;
                    // a non-empty extracted id means the job was submitted despite the ssh failure
                    if (extractedID != null && !extractedID.equals("")) {
                        isJobIDValid = true;
                    }
                    // defines how to recover from this state
                    // throws a RMException if we can't
                    handleWrongJobTermination(isJobIDValid,
                                              nodeName,
                                              dnURL,
                                              host,
                                              id,
                                              description,
                                              exitCode,
                                              submitCmd,
                                              deleteCmd);
                }
            } catch (IllegalThreadStateException e) {
                // process has not returned yet
                logger.trace("Waiting for ssh process to exit in BatchJobInfrastructure");
            }
            if (super.checkNodeIsAcquiredAndDo(nodeName, null, null)) {
                // registration is ok
                p.destroy();
                addNodeAndDecrementDeployingNode(nodeName, this.extractSubmitOutput(id));
                return;
            }
            try {
                logger.debug("Waiting for node " + nodeName + " registration... time to timeout: " +
                             (timeout - (System.currentTimeMillis() - t1)));
                Thread.sleep(BatchJobInfrastructure.NODE_ACQUISITION_CHECK_RATE);
            } catch (InterruptedException e) {
                // Count the interruption like any other monitoring failure, but remember it;
                // re-interrupting right here would make every following sleep fail immediately.
                interrupted = true;
                circuitBreakerThreshold--;
                logger.error("While monitoring ssh subprocess.", e);
            } catch (Exception e) {
                circuitBreakerThreshold--;
                logger.error("While monitoring ssh subprocess.", e);
            }
        }
        // end of while loop, either deploying node timeout/removed of
        // threshold reached
        // the node is not expected anymore
        atomicRemovePnTimeoutAndJob(nodeName, dnURL, p, id);
        if (circuitBreakerThreshold <= 0) {
            logger.error("Circuit breaker threshold reached while monitoring ssh subprocess.");
            throw new RMException("Several exceptions occurred while monitoring ssh subprocess.");
        }
        // if we are here we reached an invalid state
        throw new RMException("Invalid state, exit from a control loop with threshold > 0 and expected deploying node");
    } finally {
        if (interrupted) {
            Thread.currentThread().interrupt();
        }
    }
}
use of org.ow2.proactive_grid_cloud_portal.cli.cmd.Command in project scheduling by ow2-proactive.
the class CLIInfrastructure method killNodeImpl.
/**
 * {@inheritDoc}
 * <p>
 * Runs the removal script ({@code interpreter removalScript hostName nodeURL}) in a task
 * handed to the node source's {@code executeInParallel} and logs its exit code, error
 * output and output. The removal-thread counter is incremented before the task is handed
 * over and is always decremented when the task ends, whatever the outcome.
 */
@Override
protected void killNodeImpl(Node node, InetAddress h) {
    final Node n = node;
    final InetAddress host = h;
    incrementNbRemovalThread();
    this.nodeSource.executeInParallel(new Runnable() {
        @Override
        public void run() {
            try {
                // NOTE(review): Runtime.exec(String) splits the command on whitespace, so a
                // script path or URL containing spaces would be split - confirm inputs.
                final String commandLine = interpreter + " " + removalScript.getAbsolutePath() + " " +
                                           host.getHostName() + " " + n.getNodeInformation().getURL();
                logger.debug("Launching the command: " + commandLine);
                Process p = Runtime.getRuntime().exec(commandLine);
                // TODO add timeout behavior
                int exitCode = p.waitFor();
                String pOutPut = Utils.extractProcessOutput(p);
                String pErrPut = Utils.extractProcessErrput(p);
                String lf = System.lineSeparator();
                final String description = "Removal script ouput" + lf + "   >Error code: " + exitCode + lf +
                                           "   >Errput: " + pErrPut + "   >Output: " + pOutPut;
                if (exitCode != 0) {
                    logger.error("Child process at " + host.getHostName() + " exited abnormally (" + exitCode +
                                 ").");
                    logger.error(description);
                } else {
                    logger.info("Removal node process has exited normally for " +
                                n.getNodeInformation().getURL());
                    logger.debug(description);
                }
            } catch (IOException e1) {
                // Pass the exception as the throwable so that its stack trace is logged.
                logger.error("An I/O error occurred while running the node removal script", e1);
            } catch (InterruptedException e) {
                // Preserve the interrupted status for the executing thread.
                Thread.currentThread().interrupt();
                logger.trace("An exception occurred during node removal", e);
            } catch (Exception e) {
                logger.trace("An exception occurred during node removal", e);
            } finally {
                // Must run even if logging or the script handling fails unexpectedly,
                // otherwise the removal-thread counter would never come back down.
                decrementNbRemovalThread();
            }
        }
    });
}
Aggregations