use of org.ow2.proactive.resourcemanager.utils.CommandLineBuilder in project scheduling by ow2-proactive.
the class BatchJobInfrastructure method handleFailedDeployment.
/**
 * Reports a failed deployment: registers a deploying node describing the
 * failure, immediately declares it lost so the user can see what went wrong,
 * and then propagates the failure to the caller.
 * <p>
 * This method never returns normally.
 *
 * @param clb
 *            the command line builder of the node that could not be
 *            deployed; provides the node name and, when it can be built,
 *            the command line shown to the user
 * @param e
 *            the error that caused the deployment to fail
 * @throws RMException
 *             always, with {@code e} as its cause
 */
private void handleFailedDeployment(CommandLineBuilder clb, Throwable e) throws RMException {
    final String stackTrace = getStackTraceAsString(e);

    // Start from the fallback text; it is only replaced when the builder
    // manages to produce a command line (built with the "false" flag, the
    // variant the callers in this file treat as obfuscated).
    String commandLine = "Cannot determine the command used to start the node.";
    try {
        commandLine = clb.buildCommandLine(false);
    } catch (Exception ignored) {
        // the fallback text above is kept
    }

    final String description = "Cannot deploy the node because of an error:" + System.lineSeparator() +
                               stackTrace;
    final String lostNodeUrl = super.addDeployingNode(clb.getNodeName(), commandLine, description, 60000);
    super.declareDeployingNodeLost(lostNodeUrl, null);

    throw new RMException("The deployment failed because of an error", e);
}
use of org.ow2.proactive.resourcemanager.utils.CommandLineBuilder in project scheduling by ow2-proactive.
the class InfrastructureManager method getDefaultCommandLineBuilder.
// **********************************************************************************************\\
// **************************************** API methods
// *****************************************\\
// **********************************************************************************************\\
/**
 * Creates a
 * {@link org.ow2.proactive.resourcemanager.utils.CommandLineBuilder}
 * pre-filled with default settings, so that the returned builder is usable
 * as such.
 * <ul>
 * <li>The Java path is {@code <java.home><fs>bin<fs>java}, where
 * {@code java.home} is the system property of the current JVM and
 * {@code fs} is the file separator of {@code targetOS}.</li>
 * <li>The target operating system is set to {@code targetOS}.</li>
 * <li>If the ProActive configuration file property is set, that file is
 * passed to the builder; an {@link IOException} raised while doing so is
 * only logged at debug level.</li>
 * <li>The RM URL is set. When a node source is attached, the source name
 * and a default node name ({@code <sourceName>_DefaultNodeName}) are set as
 * well.</li>
 * </ul>
 *
 * @param targetOS
 *            the operating system on which the node will be deployed
 * @return the pre-configured command line builder
 */
protected final CommandLineBuilder getDefaultCommandLineBuilder(OperatingSystem targetOS) {
    final CommandLineBuilder builder = new CommandLineBuilder();

    final String separator = targetOS.fs;
    builder.setJavaPath(System.getProperty("java.home") + separator + "bin" + separator + "java");
    builder.setTargetOS(targetOS);

    if (CentralPAPropertyRepository.PA_CONFIGURATION_FILE.isSet()) {
        try {
            final File configurationFile = new File(CentralPAPropertyRepository.PA_CONFIGURATION_FILE.getValue());
            builder.setPaProperties(configurationFile);
        } catch (IOException e) {
            // best effort: the builder simply keeps no default PA properties
            logger.debug("Cannot set default pa configuration file for " + CommandLineBuilder.class.getSimpleName(), e);
        }
    }

    builder.setRmURL(getRmUrl());

    // without a node source there is no name to derive the defaults from
    if (this.nodeSource == null) {
        return builder;
    }

    final String sourceName = this.nodeSource.getName();
    builder.setSourceName(sourceName);
    builder.setNodeName(sourceName + "_DefaultNodeName");
    return builder;
}
use of org.ow2.proactive.resourcemanager.utils.CommandLineBuilder in project scheduling by ow2-proactive.
the class BatchJobInfrastructure method startNode.
/**
 * Builds the command line to execute on the batch job system frontend and
 * waits for the launched node to register. If the node doesn't register
 * (i.e. {@link #internalRegisterAcquiredNode(Node)} isn't called) before
 * the (configurable) timeout, an exception is raised. If the submit command
 * sent to the frontend fails, the node supposed to be launched is not
 * expected anymore and will be discarded at registration time.
 * <p>
 * The command line that embeds the credentials is only handed to the SSH
 * subprocess; the deploying node description and the exception messages
 * built here use the obfuscated variant.
 *
 * @throws RMException
 *             if the command line cannot be prepared, if the SSH command
 *             cannot be executed, or if the monitoring loop ends without
 *             the node having registered
 */
private void startNode() throws RMException {
    CommandLineBuilder clb = new CommandLineBuilder();
    // generate the node name
    // current rmcore shortID should be added to ensure uniqueness
    String nodeName = getBatchinJobSystemName() + "-" + nodeSource.getName() + "-" + ProActiveCounter.getUniqID();
    clb.setNodeName(nodeName);
    clb.setJavaPath(this.javaPath);
    clb.setRmURL(getRmUrl());
    clb.setRmHome(this.schedulingPath);
    clb.setSourceName(this.nodeSource.getName());
    clb.setPaProperties(this.javaOptions);
    // handleFailedDeployment always throws, so each catch below ends the method
    try {
        clb.setCredentialsValueAndNullOthers(new String(getCredentials().getBase64()));
    } catch (KeyException e) {
        this.handleFailedDeployment(clb, e);
    }
    InetAddress host = null;
    try {
        host = InetAddress.getByName(this.serverName);
    } catch (UnknownHostException e) {
        this.handleFailedDeployment(clb, e);
    }
    String deleteCmd = getDeleteJobCommand();
    String submitCmd = getSubmitJobCommand();
    // build the command: echo "script.sh params"|qsub params
    // cmd carries the real credentials, obfuscatedCmd is the displayable variant
    String cmd = null;
    String obfuscatedCmd = null;
    try {
        cmd = "echo \\\"" + clb.buildCommandLine(true).replace("\"", "\\\"") + "\\\" | " + submitCmd + " " +
              this.submitJobOpt;
        obfuscatedCmd = "echo \\\"" + clb.buildCommandLine(false).replace("\"", "\\\"") + "\\\" | " + submitCmd +
                        " " + this.submitJobOpt;
    } catch (IOException e) {
        this.handleFailedDeployment(clb, e);
    }
    // add an deploying node.
    final String dnURL = super.addDeployingNode(nodeName,
                                                obfuscatedCmd,
                                                "Deploying node on " + getBatchinJobSystemName() + " scheduler",
                                                this.nodeTimeOut);
    putPnTimeout(dnURL, Boolean.FALSE);
    // executing the command
    Process p;
    try {
        p = Utils.runSSHCommand(host, cmd, this.sshOptions);
    } catch (IOException e1) {
        // report the obfuscated command: the real one contains the credentials
        throw new RMException("Cannot execute ssh command: " + obfuscatedCmd + " on host: " + this.serverName, e1);
    }
    // recover the Job ID through stdout
    StringBuilder submitOutput = new StringBuilder();
    InputStream in = p.getInputStream();
    int b = -1;
    try {
        while ((b = in.read()) > -1) {
            submitOutput.append((char) b);
        }
    } catch (IOException e) {
        // keep going with whatever was read so far, but leave a trace of the failure
        logger.warn("Cannot fully read the submit command's output for node " + nodeName, e);
    }
    final String id = submitOutput.toString();
    // check for registration
    // at this point, the ssh process should have already exited because it
    // only handle the job submission, not the execution... furthermore
    // the "id" is defined
    String lf = System.lineSeparator();
    final long timeout = nodeTimeOut;
    long t1 = System.currentTimeMillis();
    // Hack. SSHClient fails but qSub succeeds.
    boolean isJobIDValid = false;
    // Tries to wait for this node
    // registration...
    // the circuit breaker bounds the number of exceptions tolerated while waiting
    int circuitBreakerThreshold = 5;
    while (!getPnTimeout(dnURL) && circuitBreakerThreshold > 0) {
        try {
            int exitCode = p.exitValue();
            if (exitCode != 0 && !isJobIDValid) {
                logger.warn("SSH subprocess at " + host.getHostName() +
                            " exit code != 0 but IM tries to recover from this error...Current submit command's output: " +
                            id + " and associated node's name: " + nodeName);
                String extractedID = this.extractSubmitOutput(id);
                String errput = this.extractProcessErrput(p);
                final String description = "SSH command failed to launch node on " + getBatchinJobSystemName() +
                                           " scheduler" + lf + " >Error code: " + exitCode + lf + " >Errput: " +
                                           errput + " >Output: " + id;
                // registration...
                if (extractedID != null && !extractedID.equals("")) {
                    isJobIDValid = true;
                }
                // defines how to recover from this state
                // throws a RMException if we can't
                handleWrongJobTermination(isJobIDValid,
                                          nodeName,
                                          dnURL,
                                          host,
                                          id,
                                          description,
                                          exitCode,
                                          submitCmd,
                                          deleteCmd);
            }
        } catch (IllegalThreadStateException e) {
            // process has not returned yet
            logger.trace("Waiting for ssh process to exit in BatchJobInfrastructure");
        }
        if (super.checkNodeIsAcquiredAndDo(nodeName, null, null)) {
            // registration is ok
            p.destroy();
            addNodeAndDecrementDeployingNode(nodeName, this.extractSubmitOutput(id));
            return;
        }
        try {
            logger.debug("Waiting for node " + nodeName + " registration... time to timeout: " +
                         (timeout - (System.currentTimeMillis() - t1)));
            Thread.sleep(BatchJobInfrastructure.NODE_ACQUISITION_CHECK_RATE);
        } catch (Exception e) {
            circuitBreakerThreshold--;
            logger.error("While monitoring ssh subprocess.", e);
        }
    }
    // end of while loop, either deploying node timeout/removed of
    // threshold reached
    // the node is not expected anymore
    atomicRemovePnTimeoutAndJob(nodeName, dnURL, p, id);
    if (circuitBreakerThreshold <= 0) {
        logger.error("Circuit breaker threshold reached while monitoring ssh subprocess.");
        throw new RMException("Several exceptions occurred while monitoring ssh subprocess.");
    }
    // if we are here we reached an invalid state
    throw new RMException("Invalid state, exit from a control loop with threshold > 0 and expected deploying node");
}
use of org.ow2.proactive.resourcemanager.utils.CommandLineBuilder in project scheduling by ow2-proactive.
the class LocalInfrastructure method startNodeProcess.
/**
 * Launches one local process meant to host {@code numberOfNodes} nodes and
 * watches it.
 * <p>
 * The command line is assembled from the default builder plus the PA
 * properties configured on this infrastructure. Deploying nodes are
 * registered with the obfuscated command before the process is started with
 * the real credentials. The method then polls every 500 ms until
 * {@code allNodesAcquiredOrLost()} returns true, or until the wait has been
 * interrupted 10 times. Failures before or during the launch are reported
 * by declaring the corresponding nodes lost; nothing is thrown to the
 * caller.
 *
 * @param numberOfNodes
 *            number of nodes the started process is asked to create
 */
private void startNodeProcess(int numberOfNodes) {
int currentIndex = getIndexAndIncrement();
String baseNodeName = "local-" + this.nodeSource.getName() + "-" + currentIndex;
OperatingSystem os = OperatingSystem.UNIX;
// assuming no cygwin, windows or the "others"...
if (System.getProperty("os.name").contains("Windows")) {
os = OperatingSystem.WINDOWS;
}
// make sure rmHome ends with a file separator: paths are appended to it below
String rmHome = PAResourceManagerProperties.RM_HOME.getValueAsString();
if (!rmHome.endsWith(os.fs)) {
rmHome += os.fs;
}
CommandLineBuilder clb = this.getDefaultCommandLineBuilder(os);
// RM_Home set in bin/unix/env script
clb.setRmHome(rmHome);
// default properties are only added when the user supplied properties
// (this.paProperties) do not already mention them
ArrayList<String> paPropList = new ArrayList<>();
if (!this.paProperties.contains(CentralPAPropertyRepository.JAVA_SECURITY_POLICY.getName())) {
paPropList.add(CentralPAPropertyRepository.JAVA_SECURITY_POLICY.getCmdLine() + rmHome + "config" + os.fs + "security.java.policy-client");
}
if (!this.paProperties.contains(CentralPAPropertyRepository.PA_CONFIGURATION_FILE.getName())) {
paPropList.add(CentralPAPropertyRepository.PA_CONFIGURATION_FILE.getCmdLine() + rmHome + "config" + os.fs + "network" + os.fs + "node.ini");
}
if (!this.paProperties.contains(PAResourceManagerProperties.RM_HOME.getKey())) {
paPropList.add(PAResourceManagerProperties.RM_HOME.getCmdLine() + rmHome);
}
if (!this.paProperties.contains("java.library.path")) {
paPropList.add("-Djava.library.path=" + System.getProperty("java.library.path"));
}
// user supplied properties come last, split on single spaces
if (!paProperties.isEmpty()) {
Collections.addAll(paPropList, this.paProperties.split(" "));
}
clb.setPaProperties(paPropList);
clb.setNodeName(baseNodeName);
clb.setNumberOfNodes(numberOfNodes);
try {
clb.setCredentialsValueAndNullOthers(new String(this.credentials.getBase64()));
} catch (KeyException e) {
createLostNodes(baseNodeName, numberOfNodes, "Cannot decrypt credentials value", e);
return;
}
// built with "false": the credentials are replaced by a placeholder
// (CommandLineBuilder.OBFUSC) which is substituted just before the launch
List<String> cmd;
try {
cmd = clb.buildCommandLineAsList(false);
} catch (IOException e) {
createLostNodes(baseNodeName, numberOfNodes, "Cannot build command line", e);
return;
}
// The printed cmd with obfuscated credentials
final String obfuscatedCmd = Joiner.on(' ').join(cmd);
List<String> depNodeURLs = new ArrayList<>(numberOfNodes);
final List<String> createdNodeNames = RMNodeStarter.getWorkersNodeNames(baseNodeName, numberOfNodes);
ProcessExecutor processExecutor = null;
try {
// deploying nodes are registered first, with the obfuscated command
depNodeURLs.addAll(addMultipleDeployingNodes(createdNodeNames, obfuscatedCmd, "Node launched locally", this.nodeTimeout));
// Deobfuscate the cred value
Collections.replaceAll(cmd, CommandLineBuilder.OBFUSC, clb.getCredentialsValue());
processExecutor = new ProcessExecutor(baseNodeName, cmd, false, true);
processExecutor.start();
// remember which deploying nodes belong to this process
processExecutors.put(processExecutor, depNodeURLs);
final ProcessExecutor tmpProcessExecutor = processExecutor;
// kill the child process if this JVM shuts down while it is still running
Runtime.getRuntime().addShutdownHook(new Thread(new Runnable() {
@Override
public void run() {
if (tmpProcessExecutor != null && !tmpProcessExecutor.isProcessFinished()) {
tmpProcessExecutor.killProcess();
}
}
}));
logger.info("Local Nodes command started : " + obfuscatedCmd);
} catch (IOException e) {
// the launch failed: declare every registered deploying node lost and
// clean the process if it was created
String lf = System.lineSeparator();
String mess = "Cannot launch rm node " + baseNodeName + lf + Throwables.getStackTraceAsString(e);
multipleDeclareDeployingNodeLost(depNodeURLs, mess);
if (processExecutor != null) {
cleanProcess(processExecutor);
}
return;
}
// watching process
// threshold: number of interruptions tolerated before leaving the loop
int threshold = 10;
while (!allNodesAcquiredOrLost()) {
if (processExecutor.isProcessFinished()) {
int exit = processExecutor.getExitCode();
if (exit != 0) {
// abnormal exit: declare the nodes lost, with the obfuscated command and
// the process output as the explanation
String lf = System.lineSeparator();
String message = "RMNode exit code == " + exit + lf;
message += "Command: " + obfuscatedCmd + lf;
String out = Joiner.on('\n').join(processExecutor.getOutput());
String err = Joiner.on('\n').join(processExecutor.getErrorOutput());
message += "stdout: " + out + lf + "stderr: " + err;
multipleDeclareDeployingNodeLost(depNodeURLs, message);
}
} else {
logger.debug("Waiting for nodes " + baseNodeName + " acquisition");
}
try {
Thread.sleep(500);
} catch (InterruptedException e) {
// NOTE(review): the interrupt flag is not restored here; the loop only
// gives up after 10 interruptions
logger.warn("Interrupted while waiting for local process status", e);
threshold--;
if (threshold <= 0) {
break;
}
}
}
logger.debug("Local Infrastructure manager exits watching loop for nodes " + baseNodeName);
logNodeOutput(baseNodeName + " stdout: ", processExecutor.getOutput());
logNodeOutput(baseNodeName + " stderr: ", processExecutor.getErrorOutput());
if (allNodesLost(numberOfNodes)) {
// clean up the process
cleanProcess(processExecutor);
}
}
use of org.ow2.proactive.resourcemanager.utils.CommandLineBuilder in project scheduling by ow2-proactive.
the class SSHInfrastructure method startNodeImpl.
/**
 * Internal node acquisition method
 * <p>
 * Starts a PA runtime on remote host using SSH, register it manually in the
 * nodesource.
 * <p>
 * Two command lines are built: the real one, which embeds the credentials
 * and is only handed to the SSH subprocess, and an obfuscated one, which is
 * the only variant used in deploying node descriptions and exception
 * messages.
 *
 * @param hostTracker The host on which one the node will be started
 * @param nbNodes number of nodes to deploy
 * @param depNodeURLs list of deploying or lost nodes urls created; the urls
 *            of the deploying nodes created by this call are appended to it
 * @throws RMException
 *             acquisition failed
 */
protected void startNodeImpl(HostTracker hostTracker, int nbNodes, final List<String> depNodeURLs)
        throws RMException {
    String fs = getTargetOSObj().fs;
    CommandLineBuilder clb = super.getDefaultCommandLineBuilder(getTargetOSObj());
    // we take care of spaces in java path
    clb.setJavaPath(this.javaPath);
    // we set the rm.home prop
    clb.setRmHome(schedulingPath);
    // we set the java security policy file
    StringBuilder sb = new StringBuilder();
    // paths containing a space are wrapped in double quotes
    final boolean containsSpace = schedulingPath.contains(" ");
    String securitycmd = CentralPAPropertyRepository.JAVA_SECURITY_POLICY.getCmdLine();
    if (!this.javaOptions.contains(securitycmd)) {
        sb.append(securitycmd);
        if (containsSpace) {
            sb.append("\"");
        }
        sb.append(schedulingPath);
        sb.append(fs);
        sb.append("config");
        sb.append(fs);
        sb.append("security.java.policy-client");
        if (containsSpace) {
            sb.append("\"");
        }
        sb.append(" ");
    }
    // we set the log4j configuration file
    String log4jcmd = CentralPAPropertyRepository.LOG4J.getCmdLine();
    if (!this.javaOptions.contains(log4jcmd)) {
        sb.append(log4jcmd);
        if (containsSpace) {
            sb.append("\"");
        }
        // log4j only understands urls
        sb.append("file:");
        if (!schedulingPath.startsWith("/")) {
            sb.append("/" + schedulingPath.replace("\\", "/"));
        } else {
            sb.append(schedulingPath.replace("\\", "/"));
        }
        sb.append("/");
        sb.append("config");
        sb.append("/");
        sb.append("log");
        sb.append("/");
        sb.append("node.properties");
        if (containsSpace) {
            sb.append("\"");
        }
        sb.append(" ");
    }
    // we add extra java/PA configuration
    sb.append(this.javaOptions);
    clb.setPaProperties(sb.toString());
    // afterwards, node's name
    // generate the node name
    // current rmcore shortID should be added to ensure uniqueness
    final String nodeName = nodeNameBuilder.generateNodeName(hostTracker);
    clb.setNodeName(nodeName);
    clb.setNumberOfNodes(nbNodes);
    // finally, the credential's value
    String credString = null;
    try {
        credString = new String(getCredentials().getBase64());
    } catch (KeyException e1) {
        throw new RMException("Could not get base64 credentials", e1);
    }
    clb.setCredentialsValueAndNullOthers(credString);
    // add an expected node. every unexpected node will be discarded
    // cmdLine carries the real credentials, obfuscatedCmdLine is the displayable variant
    String cmdLine;
    String obfuscatedCmdLine;
    try {
        cmdLine = clb.buildCommandLine(true);
        obfuscatedCmdLine = clb.buildCommandLine(false);
    } catch (IOException e2) {
        throw new RMException("Cannot build the " + RMNodeStarter.class.getSimpleName() + "'s command line.", e2);
    }
    // one escape the command to make it runnable through ssh
    if (cmdLine.contains("\"")) {
        cmdLine = cmdLine.replaceAll("\"", "\\\\\"");
    }
    // we create a new deploying node before ssh command ran
    final List<String> createdNodeNames = RMNodeStarter.getWorkersNodeNames(nodeName, nbNodes);
    depNodeURLs.addAll(addMultipleDeployingNodes(createdNodeNames,
                                                 obfuscatedCmdLine,
                                                 "Deploying nodes on host " + hostTracker.getResolvedAddress(),
                                                 super.nodeTimeOut));
    addTimeouts(depNodeURLs);
    Process p = null;
    try {
        p = Utils.runSSHCommand(hostTracker.getResolvedAddress(), cmdLine, sshOptions);
    } catch (IOException e1) {
        // report the obfuscated command: the real one contains the credentials
        multipleDeclareDeployingNodeLost(depNodeURLs,
                                         "Cannot run command: " + obfuscatedCmdLine + ", with ssh options: " +
                                                      sshOptions + " -\n The following exception occutred:\n " +
                                                      getStackTraceAsString(e1));
        throw new RMException("Cannot run command: " + obfuscatedCmdLine + ", with ssh options: " + sshOptions,
                              e1);
    }
    String lf = System.lineSeparator();
    // the circuit breaker bounds the number of exceptions tolerated while waiting
    int circuitBreakerThreshold = 5;
    while (!anyTimedOut(depNodeURLs) && circuitBreakerThreshold > 0) {
        try {
            // exitValue() throws IllegalThreadStateException while the process is
            // still running; reaching the next line means the ssh process has ended
            int exitCode = p.exitValue();
            if (exitCode != 0) {
                logger.error("SSH subprocess at " + hostTracker.getResolvedAddress().getHostName() +
                             " exited abnormally (" + exitCode + ").");
            } else {
                logger.error("Launching node process has exited normally whereas it shouldn't.");
            }
            String pOutPut = Utils.extractProcessOutput(p);
            String pErrPut = Utils.extractProcessErrput(p);
            final String description = "SSH command failed to launch node on host " +
                                       hostTracker.getResolvedAddress().getHostName() + lf + " >Error code: " +
                                       exitCode + lf + " >Errput: " + pErrPut + " >Output: " + pOutPut;
            logger.error(description);
            if (super.checkAllNodesAreAcquiredAndDo(createdNodeNames, null, new Runnable() {
                @Override
                public void run() {
                    SSHInfrastructure.this.multipleDeclareDeployingNodeLost(depNodeURLs, description);
                }
            })) {
                return;
            } else {
                // there isn't any race regarding node registration
                throw new RMException("SSH Node " + nodeName + " is not expected anymore because of an error.");
            }
        } catch (IllegalThreadStateException e) {
            logger.trace("IllegalThreadStateException while waiting for " + nodeName + " registration");
        }
        if (super.checkNodeIsAcquiredAndDo(nodeName, null, null)) {
            // registration is ok, we destroy the process
            p.destroy();
            return;
        }
        try {
            Thread.sleep(1000);
        } catch (Exception e) {
            circuitBreakerThreshold--;
            logger.trace("An exception occurred while monitoring ssh subprocess", e);
        }
    }
    // if we exit because of a timeout
    if (anyTimedOut(depNodeURLs)) {
        // we remove it
        removeTimeouts(depNodeURLs);
        // we destroy the process
        p.destroy();
        throw new RMException("Deploying Node " + nodeName + " not expected any more");
    }
    if (circuitBreakerThreshold <= 0) {
        logger.error("Circuit breaker threshold reached while monitoring ssh subprocess.");
        throw new RMException("Several exceptions occurred while monitoring ssh subprocess.");
    }
}
Aggregations