use of es.bsc.compss.exceptions.InitNodeException in project compss by bsc-wdc.
the class WorkerStarter method killPreviousWorker.
/**
 * Stops the worker process left over from a previous start attempt, if any.
 *
 * @param user User handed to the command executor.
 * @param name Resource name; handed to the command executor and used in the error message.
 * @param pid PID of the previously started worker process; {@code -1} means there is nothing to stop.
 * @throws InitNodeException When executing the stop command yields no process output ({@code null}).
 */
private void killPreviousWorker(String user, String name, int pid) throws InitNodeException {
    // Nothing was started before, so there is nothing to clean up
    if (pid == -1) {
        return;
    }

    // Command was started but it is not possible to contact to the worker
    ProcessOut stopResult = executeCommand(user, name, getStopCommand(pid));

    if (stopResult == null) {
        // Queue System managed worker starter
        String errorMsg = "[START_CMD_ERROR]: An Error has occurred when queue system started NIO worker in resource " + name + ". Retries not available in this option.";
        LOGGER.error(errorMsg);
        throw new InitNodeException(errorMsg);
    }

    if (stopResult.getExitValue() != 0) {
        // Normal starting process: the failure is only logged, the caller goes on with the retry
        LOGGER.error(ERROR_SHUTTING_DOWN_RETRY);
    }
}
use of es.bsc.compss.exceptions.InitNodeException in project compss by bsc-wdc.
the class WorkerStarter method startWorker.
/**
 * Starts the current worker, trying the ports of the configured range one after another.
 *
 * @return The node created for the first port on which the worker is flagged as ready.
 * @throws InitNodeException When the start is stopped midway, when no port of the range ends with a ready
 *         worker, or when one of the helper steps raises it.
 */
public NIONode startWorker() throws InitNodeException {
    final String name = this.nw.getName();
    final String user = this.nw.getUser();
    final int minPort = this.nw.getConfiguration().getMinPort();
    final int maxPort = this.nw.getConfiguration().getMaxPort();

    // Solves exit error 143
    synchronized (addressToWorkerStarter) {
        addressToWorkerStarter.put(name, this);
        LOGGER.debug("[WorkerStarter] Worker starter for " + name + " registers in the hashmap");
    }

    // PID of the last launched worker process (-1 while nothing has been launched)
    int pid = -1;
    for (int port = minPort; port <= maxPort && !this.toStop; ++port) {
        // Kill previous worker processes if any
        killPreviousWorker(user, name, pid);

        // Instantiate the node and launch the worker on the candidate port
        NIONode node = new NIONode(name, port);
        pid = startWorker(user, name, port);

        // Check worker status
        LOGGER.info("[WorkerStarter] Worker process started. Checking connectivity...");
        checkWorker(node, name);

        // Check received ack
        LOGGER.debug("[WorkerStarter] Retries for " + name + " have finished.");
        if (this.workerIsReady) {
            // Success, return node
            try {
                Runtime.getRuntime().addShutdownHook(new Ender(this, this.nw, pid));
            } catch (IllegalStateException e) {
                LOGGER.warn("Tried to shutdown vm while it was already being shutdown", e);
            }
            return node;
        }
        // Not ready: the loop update moves on to the next port
    }

    // This can be because node is stopping or because we reached the maximum available ports
    final String msg;
    if (this.toStop) {
        msg = "[STOP]: Worker " + name + " stopped during creation because application is stopped";
    } else if (!this.workerIsReady) {
        msg = "[TIMEOUT]: Could not start the NIO worker on resource " + name + " through user " + user + ".";
    } else {
        msg = "[UNKNOWN]: Could not start the NIO worker on resource " + name + " through user " + user + ".";
    }
    LOGGER.warn(msg);
    throw new InitNodeException(msg);
}
use of es.bsc.compss.exceptions.InitNodeException in project compss by bsc-wdc.
the class GATWorkerNode method initWorkingDir.
/**
 * Initializes the working directory of this node by running the init script through a
 * {@code GATScriptExecutor}, passing the working directory as the only script parameter.
 *
 * @throws InitNodeException When the init script location cannot be turned into a URI, or when the script
 *         execution reports a failure.
 */
private void initWorkingDir() throws InitNodeException {
    LinkedList<URI> traceScripts = new LinkedList<>();
    LinkedList<String> traceParams = new LinkedList<>();
    String host = getHost();
    String installDir = getInstallDir();
    String workingDir = getWorkingDir();

    // Build the "user@" prefix of the script location (empty when no user is set)
    String user = getUser();
    if (user == null || user.isEmpty()) {
        user = "";
    } else {
        user += "@";
    }

    try {
        String initScriptPath = Protocol.ANY_URI.getSchema() + user + host + File.separator + installDir + GAT_SCRIPT_PATH + INIT_SCRIPT_NAME;
        traceScripts.add(new URI(initScriptPath));
    } catch (URISyntaxException e) {
        // The exception must actually be thrown: otherwise the executor below runs with an empty script list
        throw new InitNodeException("Error addind initScript");
    }

    traceParams.add(workingDir);

    // Use cleaner to run the trace script and generate the package
    LOGGER.debug("Initializing working dir " + workingDir + " in host " + getName());
    boolean result = new GATScriptExecutor(this).executeScript(traceScripts, traceParams, "init_" + host);
    if (!result) {
        throw new InitNodeException("Error executing init script for initializing working dir " + workingDir + " in host " + getName());
    }
}
use of es.bsc.compss.exceptions.InitNodeException in project compss by bsc-wdc.
the class NIOWorkerNode method start.
/**
 * Starts this worker node through a new {@code WorkerStarter}; on success the resulting node is stored,
 * the node is marked as started and, if NIO tracing is activated, tracing is started for it.
 *
 * @throws InitNodeException When the worker starter fails; the failure is reported as a warning and rethrown.
 */
@Override
public void start() throws InitNodeException {
    NIONode startedNode;
    try {
        this.workerStarter = new WorkerStarter(this);
        startedNode = this.workerStarter.startWorker();
    } catch (InitNodeException initError) {
        ErrorManager.warn("There was an exception when initiating worker " + getName() + ".", initError);
        throw initError;
    }

    // Only reached when the worker starter returned a node
    this.node = startedNode;
    this.started = true;

    if (NIOTracer.isActivated()) {
        LOGGER.debug("Initializing NIO tracer " + this.getName());
        NIOTracer.startTracing(this.getName(), this.getUser(), this.getHost(), this.getLimitOfTasks());
    }
}
use of es.bsc.compss.exceptions.InitNodeException in project compss by bsc-wdc.
the class WorkerStarter method startWorker.
/**
 * Executes the worker start command until a PID is obtained or the retry time budget is exceeded.
 *
 * @param user User handed to the command executor.
 * @param name Resource name; handed to the command executor and used in log and error messages.
 * @param port Port used to build the start command.
 * @return The PID read from the last line of the command output, or {@code 0} when the command execution
 *         yields no process output ({@code null}).
 * @throws InitNodeException When the command keeps failing after the accumulated wait exceeds
 *         {@code MAX_WAIT_FOR_SSH}, or when the command succeeds but its last output line is not a valid PID.
 */
private int startWorker(String user, String name, int port) throws InitNodeException {
    // Initial wait
    try {
        Thread.sleep(START_WORKER_INITIAL_WAIT);
    } catch (InterruptedException ie) {
        Thread.currentThread().interrupt();
    }
    long timer = START_WORKER_INITIAL_WAIT;

    // Try to launch the worker until we receive the PID or we timeout
    int pid = -1;
    String[] command = getStartCommand(port);
    do {
        ProcessOut po = executeCommand(user, name, command);
        if (po == null) {
            // Queue System managed worker starter
            LOGGER.debug("Worker process started in resource " + name + " by queue system.");
            pid = 0;
        } else if (po.getExitValue() == 0) {
            // Success: the PID is expected on the last line of the output
            String output = po.getOutput();
            String pidLine = "";
            if (output != null) {
                // split drops trailing empty strings, so an output made only of newlines gives an empty array
                String[] lines = output.split("\n");
                if (lines.length > 0) {
                    // trim tolerates a trailing carriage return or stray whitespace around the PID
                    pidLine = lines[lines.length - 1].trim();
                }
            }
            try {
                pid = Integer.parseInt(pidLine);
            } catch (NumberFormatException nfe) {
                // Report unparseable output through the declared exception instead of an unchecked one
                throw new InitNodeException("[START_CMD_ERROR]: Could not parse the PID of the NIO worker in resource " + name + " through user " + user + ".\n" + "OUTPUT:" + output + "\n" + "ERROR:" + po.getError() + "\n");
            }
        } else {
            if (timer > MAX_WAIT_FOR_SSH) {
                // Timeout
                throw new InitNodeException("[START_CMD_ERROR]: Could not start the NIO worker in resource " + name + " through user " + user + ".\n" + "OUTPUT:" + po.getOutput() + "\n" + "ERROR:" + po.getError() + "\n");
            }
            LOGGER.warn(" Worker process failed to start in resource " + name + ". Retrying...");
        }

        // Sleep between retries (this wait also runs on the iteration that obtains the PID)
        try {
            Thread.sleep(4 * WAIT_TIME_UNIT);
        } catch (InterruptedException ie) {
            Thread.currentThread().interrupt();
        }
        timer = timer + (4 * WAIT_TIME_UNIT);
    } while (pid < 0);
    return pid;
}
Aggregations