use of org.gridlab.gat.resources.SoftwareDescription in project compss by bsc-wdc.
the class GATJob method processMetricEvent.
// MetricListener interface implementation
/**
 * Reacts to a state change reported for the GAT job this object listens to.
 * <p>
 * Only {@code STOPPED} and {@code SUBMISSION_ERROR} are handled; every other transition is ignored.
 * For both handled states the tracer slot is released (when tracing is activated), this job is
 * removed from {@code RUNNING_JOBS} and the listener is notified of completion or failure.
 *
 * @param value event whose source is the {@code Job} and whose value is the new {@code JobState}
 */
@Override
public void processMetricEvent(MetricEvent value) {
    Job job = (Job) value.getSource();
    JobState newJobState = (JobState) value.getValue();
    JobDescription jd = (JobDescription) job.getJobDescription();
    SoftwareDescription sd = jd.getSoftwareDescription();
    Integer jobId = (Integer) sd.getAttributes().get("jobId");
    logger.debug("Processing job ID = " + jobId);

    /*
     * Check if either the job has finished or there has been a submission error. We don't care about other
     * state transitions
     */
    boolean stopped = newJobState == JobState.STOPPED;
    boolean submissionError = newJobState == JobState.SUBMISSION_ERROR;
    if (!stopped && !submissionError) {
        return;
    }

    // Both handled transitions release the tracing slot first
    if (Tracer.isActivated()) {
        Integer slot = (Integer) sd.getAttributes().get("slot");
        String host = getResourceNode().getHost();
        Tracer.freeSlot(host, slot);
    }

    if (stopped) {
        /*
         * We must check whether the chosen adaptor is globus. In that case, since globus doesn't provide the
         * exit status of a job, we must examine the standard error file
         */
        try {
            if (usingGlobus) {
                File errFile = sd.getStderr();
                // Error file should always be in the same host as the IT
                File localFile = GAT.createFile(context, errFile.toGATURI());
                if (localFile.length() > 0) {
                    // A non-empty stderr file is taken as a failed execution
                    GATjob = null;
                    RUNNING_JOBS.remove(this);
                    ErrorManager.warn("Error when creating file.");
                    listener.jobFailed(this, JobEndStatus.EXECUTION_FAILED);
                } else {
                    // Keep the (empty) stderr file only when debugging
                    if (!debug) {
                        localFile.delete();
                    }
                    RUNNING_JOBS.remove(this);
                    listener.jobCompleted(this);
                }
            } else {
                if (job.getExitStatus() == 0) {
                    RUNNING_JOBS.remove(this);
                    listener.jobCompleted(this);
                } else {
                    GATjob = null;
                    RUNNING_JOBS.remove(this);
                    listener.jobFailed(this, JobEndStatus.EXECUTION_FAILED);
                }
            }
        } catch (Exception e) {
            ErrorManager.fatal(CALLBACK_PROCESSING_ERR + ": " + this, e);
        }
    } else {
        try {
            // Constant-first comparison: a missing "resManError" entry is handled as a submission failure
            // instead of escaping this callback as a NullPointerException
            if (usingGlobus && "NO_ERROR".equals(job.getInfo().get("resManError"))) {
                RUNNING_JOBS.remove(this);
                listener.jobCompleted(this);
            } else {
                GATjob = null;
                RUNNING_JOBS.remove(this);
                listener.jobFailed(this, JobEndStatus.SUBMISSION_FAILED);
            }
        } catch (GATInvocationException e) {
            ErrorManager.fatal(CALLBACK_PROCESSING_ERR + ": " + this, e);
        }
    }
}
use of org.gridlab.gat.resources.SoftwareDescription in project compss by bsc-wdc.
the class GATScriptExecutor method executeScript.
/**
 * Enqueues one GAT software description per script and waits, for at most 60 seconds, until the
 * resulting jobs have ended.
 * <p>
 * In debug mode the jobs' stdout/stderr are written to {@code <APP_LOG_DIR>/<stdOutFileName>.out|.err};
 * afterwards those files are copied line by line into the logger and deleted.
 *
 * @param scripts        URIs of the scripts to run; {@code null} entries are skipped
 * @param params         argument string of each script (split on single spaces), indexed like {@code scripts}
 * @param stdOutFileName base name, without extension, of the output/error files used in debug mode
 * @return {@code false} if the thread pool cannot be started or stopped, or if building a software
 *         description fails; {@code true} otherwise (including when the wait times out)
 */
public boolean executeScript(List<URI> scripts, List<String> params, String stdOutFileName) {
    try {
        pool.startThreads();
    } catch (Exception e) {
        logger.error(THREAD_POOL_START_ERR, e);
        return false;
    }

    synchronized (jobQueue) {
        jobCount = scripts.size();
    }

    for (int i = 0; i < scripts.size(); i++) {
        URI script = scripts.get(i);
        String cleanParam = params.get(i);
        if (script == null) {
            // NOTE(review): skipped entries are still included in jobCount; presumably the wait loop below
            // then ends through a null dequeue or the timeout - confirm against the queue implementation
            continue;
        }
        if (debug) {
            logger.debug("Clean call: " + script + " " + cleanParam);
        }

        try {
            if (!node.isUserNeeded() && script.getUserInfo() != null) {
                // Remove user from the URI
                script.setUserInfo(null);
            }
            String user = script.getUserInfo();
            if (user == null) {
                user = "";
            } else {
                user += "@";
            }

            SoftwareDescription sd = new SoftwareDescription();
            sd.addAttribute("uri", Protocol.ANY_URI.getSchema() + user + script.getHost());
            sd.setExecutable(script.getPath());
            sd.setArguments(cleanParam.split(" "));
            sd.addAttribute("job_number", i);
            sd.addAttribute(SoftwareDescription.SANDBOX_ROOT, File.separator + "tmp" + File.separator);
            sd.addAttribute(SoftwareDescription.SANDBOX_USEROOT, "true");
            sd.addAttribute(SoftwareDescription.SANDBOX_DELETE, "false");

            if (debug) {
                // Failing to attach the log files is only logged; the job is still enqueued
                try {
                    org.gridlab.gat.io.File outFile = GAT.createFile(node.getContext(), Protocol.ANY_URI.getSchema() + File.separator + System.getProperty(COMPSsConstants.APP_LOG_DIR) + File.separator + stdOutFileName + ".out");
                    sd.setStdout(outFile);
                    org.gridlab.gat.io.File errFile = GAT.createFile(node.getContext(), Protocol.ANY_URI.getSchema() + File.separator + System.getProperty(COMPSsConstants.APP_LOG_DIR) + File.separator + stdOutFileName + ".err");
                    sd.setStderr(errFile);
                } catch (Exception e) {
                    logger.error(CLEAN_JOB_ERR, e);
                }
            }

            sdQueue.enqueue(sd);
        } catch (Exception e) {
            logger.error(CLEAN_JOB_ERR, e);
            return false;
        }
    }

    final long deadline = System.currentTimeMillis() + 60_000L;
    // Remembers an interrupt received while sleeping so that it can be restored for the caller
    boolean interrupted = false;
    try {
        // Poll for completion of the clean jobs
        while (jobCount > 0 && System.currentTimeMillis() < deadline) {
            Job job = jobQueue.dequeue();
            if (job == null) {
                synchronized (jobQueue) {
                    jobCount--;
                }
            } else if (job.getState() == JobState.STOPPED) {
                synchronized (jobQueue) {
                    jobCount--;
                }
            } else if (job.getState() == JobState.SUBMISSION_ERROR) {
                logger.error(CLEAN_JOB_ERR + ": " + job);
                synchronized (jobQueue) {
                    jobCount--;
                }
            } else {
                // Still running: put it back and retry shortly
                jobQueue.enqueue(job);
                try {
                    Thread.sleep(50);
                } catch (InterruptedException e) {
                    // Keep polling as before, but do not lose the interrupt request
                    interrupted = true;
                }
            }
        }

        try {
            pool.stopThreads();
        } catch (Exception e) {
            logger.error(THREAD_POOL_STOP_ERR, e);
            return false;
        }

        if (debug) {
            // Move cleanX.out logs to default logger
            String stdOutFilePath = System.getProperty(COMPSsConstants.APP_LOG_DIR) + File.separator + stdOutFileName + ".out";
            try (FileReader cleanOut = new FileReader(stdOutFilePath);
                    BufferedReader br = new BufferedReader(cleanOut)) {
                String line = br.readLine();
                while (line != null) {
                    logger.debug(line);
                    line = br.readLine();
                }
            } catch (Exception e) {
                logger.error("Error moving std out file", e);
            }
            // Delete file
            if (!new File(stdOutFilePath).delete()) {
                logger.error("Error deleting out file " + stdOutFilePath);
            }

            // Move cleanX.err logs to default logger
            String stdErrFilePath = System.getProperty(COMPSsConstants.APP_LOG_DIR) + File.separator + stdOutFileName + ".err";
            try (FileReader cleanErr = new FileReader(stdErrFilePath);
                    BufferedReader br = new BufferedReader(cleanErr)) {
                String line = br.readLine();
                while (line != null) {
                    logger.error(line);
                    line = br.readLine();
                }
            } catch (Exception e) {
                logger.error("Error moving std err file", e);
            }
            if (!new File(stdErrFilePath).delete()) {
                logger.error("Error deleting err file " + stdErrFilePath);
            }
        }

        return true;
    } finally {
        if (interrupted) {
            Thread.currentThread().interrupt();
        }
    }
}
use of org.gridlab.gat.resources.SoftwareDescription in project compss by bsc-wdc.
the class LsfResourceBrokerAdaptor method submitJob.
/*
 * (non-Javadoc)
 *
 * @see org.gridlab.gat.resources.ResourceBroker#submitJob(org.gridlab.gat.resources.JobDescription)
 *
 * Validates the description, pre-stages a sandbox, wraps the executable in a blaunch command, starts it
 * as a process bundle and wires the standard streams as requested by the software description.
 */
public Job submitJob(AbstractJobDescription abstractDescription, MetricListener listener, String metricDefinitionName) throws GATInvocationException {
    // --- Validation -------------------------------------------------------------------------------
    if (!(abstractDescription instanceof JobDescription)) {
        throw new GATInvocationException("can only handle JobDescriptions: " + abstractDescription.getClass());
    }
    final JobDescription jobDescription = (JobDescription) abstractDescription;

    final SoftwareDescription software = jobDescription.getSoftwareDescription();
    if (software == null) {
        throw new GATInvocationException("The job description does not contain a software description");
    }
    if (jobDescription.getProcessCount() < 1) {
        throw new GATInvocationException("Adaptor cannot handle: process count < 1: " + jobDescription.getProcessCount());
    }
    if (jobDescription.getResourceCount() != 1) {
        throw new GATInvocationException("Adaptor cannot handle: resource count > 1: " + jobDescription.getResourceCount());
    }

    final String userHome = System.getProperty("user.home");
    if (userHome == null) {
        throw new GATInvocationException("lsf broker could not get user home dir");
    }

    // --- Job objects ------------------------------------------------------------------------------
    final Sandbox sandbox = new Sandbox(gatContext, jobDescription, "localhost", userHome, true, true, false, false);
    final LsfJob lsfJob = new LsfJob(gatContext, jobDescription, sandbox);

    // For wrapper descriptions the wrapper is both the returned job and the listener of the LSF job
    final Job result;
    if (jobDescription instanceof WrapperJobDescription) {
        WrapperJobCpi wrapper = new WrapperJobCpi(gatContext, lsfJob, listener, metricDefinitionName);
        listener = wrapper;
        result = wrapper;
    } else {
        result = lsfJob;
    }

    if (listener != null && metricDefinitionName != null) {
        Metric statusMetric = lsfJob.getMetricDefinitionByName(metricDefinitionName).createMetric(null);
        lsfJob.addMetricListener(listener, statusMetric);
    }

    // --- Pre-staging ------------------------------------------------------------------------------
    lsfJob.setState(Job.JobState.PRE_STAGING);
    lsfJob.waitForTrigger(Job.JobState.PRE_STAGING);
    sandbox.prestage();

    // CDIAZ: the executable may also live on a remote host and must carry the right permissions there,
    // so no local "chmod +x" is attempted on the resolved executable.
    final String executable = sandbox.getResolvedExecutable() != null
            ? sandbox.getResolvedExecutable().getPath()
            : getExecutable(jobDescription);
    final String[] arguments = getArgumentsArray(jobDescription);

    // Directory where the lsf command will be executed
    final java.io.File workingDir = new java.io.File(sandbox.getSandboxPath());
    if (!workingDir.exists()) {
        throw new GATInvocationException("Unable to find directory " + workingDir.getAbsolutePath());
    }

    // Check and set the environment for a blaunch command
    final Map<String, Object> environment = software.getEnvironment();
    this.prepareBLaunchEnv(environment);

    // Encapsulate the original command into a blaunch command
    final String targetHost = brokerURI.getHost();
    final String launcher = this.getBlaunchCommand();
    final String[] launcherArgs = this.getBlaunchArgs(targetHost, executable, arguments);
    final ProcessBundle processes = new ProcessBundle(jobDescription.getProcessCount(), launcher, launcherArgs, workingDir, environment);

    // --- Launch -----------------------------------------------------------------------------------
    lsfJob.setSubmissionTime();
    lsfJob.setState(Job.JobState.SCHEDULED);
    try {
        lsfJob.setState(Job.JobState.RUNNING);
        lsfJob.waitForTrigger(Job.JobState.RUNNING);
        lsfJob.setStartTime();
        processes.startBundle();
        lsfJob.setProcess(processes);
    } catch (IOException e) {
        throw new CommandNotFoundException("LsfResourceBrokerAdaptor", e);
    }

    // --- Standard error ---------------------------------------------------------------------------
    if (!software.streamingStderrEnabled()) {
        try {
            if (software.getStderr() == null) {
                // No target file: drain the stream and throw it away
                new StreamForwarder(processes.getStderr(), null, software.getExecutable() + " [stderr]");
            } else {
                // Forward to the requested file
                OutputStream errTarget = GAT.createFileOutputStream(gatContext, software.getStderr());
                StreamForwarder errForwarder = new StreamForwarder(processes.getStderr(), errTarget, software.getExecutable() + " [stderr]");
                lsfJob.setErrorStream(errForwarder);
                if (logger.isDebugEnabled()) {
                    logger.debug("Created stderr forwarder to file " + software.getStderr());
                }
            }
        } catch (GATObjectCreationException e) {
            throw new GATInvocationException("Unable to create file output stream for stderr!", e);
        }
    }

    // --- Standard output --------------------------------------------------------------------------
    if (!software.streamingStdoutEnabled()) {
        try {
            if (software.getStdout() == null) {
                // No target file: drain the stream and throw it away
                new StreamForwarder(processes.getStdout(), null, software.getExecutable() + " [stdout]");
            } else {
                // Forward to the requested file
                OutputStream outTarget = GAT.createFileOutputStream(gatContext, software.getStdout());
                StreamForwarder outForwarder = new StreamForwarder(processes.getStdout(), outTarget, software.getExecutable() + " [stdout]");
                lsfJob.setOutputStream(outForwarder);
                if (logger.isDebugEnabled()) {
                    logger.debug("Created stdout forwarder to file " + software.getStdout());
                }
            }
        } catch (GATObjectCreationException e) {
            throw new GATInvocationException("Unable to create file output stream for stdout!", e);
        }
    }

    // --- Standard input ---------------------------------------------------------------------------
    if (!software.streamingStdinEnabled() && software.getStdin() != null) {
        // Feed stdin from the requested file
        try {
            InputStream stdinSource = GAT.createFileInputStream(gatContext, software.getStdin());
            processes.setStdin(software.getExecutable(), stdinSource);
        } catch (GATObjectCreationException e) {
            throw new GATInvocationException("Unable to create file input stream for stdin!", e);
        }
    }

    lsfJob.monitorState();
    return result;
}
use of org.gridlab.gat.resources.SoftwareDescription in project compss by bsc-wdc.
the class SshLSFJob method getJobState.
/**
 * Queries the state of the given LSF job by running {@code bjobs} through a helper job and updates
 * this job's state accordingly.
 * <p>
 * Calls are serialized through the {@code jobStateBusy} flag. Nothing is queried when the current
 * state is already {@code POST_STAGING}, {@code STOPPED} or {@code SUBMISSION_ERROR}. A mapped state of
 * {@code STOPPED} is recorded as {@code POST_STAGING}.
 *
 * @param jobID identifier passed to {@code bjobs}
 * @throws GATInvocationException if the helper job cannot be set up, does not stop with exit status 0,
 *         or its output cannot be read
 */
protected void getJobState(String jobID) throws GATInvocationException {
    // Wait until no other caller is querying the state, then claim the flag
    synchronized (this) {
        while (jobStateBusy) {
            try {
                wait();
            } catch (InterruptedException e) {
                // ignored
            }
        }
        jobStateBusy = true;
    }

    JobState resultState;
    try {
        if (state == JobState.POST_STAGING || state == JobState.STOPPED || state == JobState.SUBMISSION_ERROR) {
            return;
        }
        logger.debug("Getting task status in setState()");

        // getting the status via ssh ... bjobs; the third output column is written to a temporary file
        java.io.File squeueResultFile = null;
        try {
            // Create status job
            SoftwareDescription sd = new SoftwareDescription();
            // Use /bin/sh, so that the pipe into awk is interpreted by a shell.
            sd.setExecutable("/bin/sh");
            sd.setArguments("-c", "bjobs -noheader " + jobID + " | awk {' print $3 '}");
            sd.addAttribute(SoftwareDescription.SANDBOX_USEROOT, "true");
            squeueResultFile = java.io.File.createTempFile("GAT", "tmp");
            try {
                sd.setStdout(GAT.createFile(subContext, new URI("file:///" + squeueResultFile.getAbsolutePath().replace(File.separatorChar, '/'))));
            } catch (Throwable e1) {
                throw new GATInvocationException("Could not create GAT object for temporary " + squeueResultFile.getAbsolutePath(), e1);
            }

            JobDescription jd = new JobDescription(sd);
            Job job = jobHelper.submitJob(jd, this, "job.status");
            // Block until the helper job has ended one way or the other
            synchronized (job) {
                while (job.getState() != Job.JobState.STOPPED && job.getState() != Job.JobState.SUBMISSION_ERROR) {
                    try {
                        job.wait();
                    } catch (InterruptedException e) {
                        // ignore
                    }
                }
            }
            if (job.getState() != Job.JobState.STOPPED || job.getExitStatus() != 0) {
                throw new GATInvocationException("Could not submit squeue job " + sd.toString());
            }

            // submit success: read the first line of the output; the reader is closed even on failure
            String status;
            try (BufferedReader in = new BufferedReader(new FileReader(squeueResultFile.getAbsolutePath()))) {
                status = in.readLine();
            }
            // No output line at all (e.g. job finished). Set to "" in this case. --Ceriel
            if (status == null) {
                status = "";
            }
            if (logger.isDebugEnabled()) {
                logger.debug("squeue line: " + status);
            }
            resultState = mapLSFStatetoGAT(status);
        } catch (IOException e) {
            logger.debug("retrieving job status sshslurmjob failed", e);
            throw new GATInvocationException("Unable to retrieve the Job Status", e);
        } finally {
            // createTempFile may have failed, in which case there is nothing to delete
            if (squeueResultFile != null) {
                squeueResultFile.delete();
            }
        }
    } finally {
        // Always release the flag and wake up waiting callers
        synchronized (this) {
            jobStateBusy = false;
            notifyAll();
        }
    }

    if (resultState != JobState.STOPPED) {
        setState(resultState);
    } else {
        setState(JobState.POST_STAGING);
    }
}
use of org.gridlab.gat.resources.SoftwareDescription in project compss by bsc-wdc.
the class SshLsfResourceBrokerAdaptor method createBsubScript.
/**
 * Writes the bsub script for the given job description into a new file and returns that file.
 * <p>
 * If the {@code SSHLSF_SCRIPT} preference names a user script, its lines are copied verbatim;
 * otherwise a script is generated from the software and resource descriptions (queue, core count,
 * run time limit, native flags, job name, stdout/stderr files and the execution itself). In both
 * cases a final line stores the exit status in {@code returnValueFile}.
 *
 * @param description     job description providing the software and (optional) resource description
 * @param returnValueFile path the generated script writes {@code retvalue = $?} to
 * @param nproc           not used by this method
 * @return the script file, created under the relative name {@code "lsf" + Math.random()}
 * @throws GATInvocationException if anything fails while writing the script
 */
private java.io.File createBsubScript(JobDescription description, String returnValueFile, int nproc) throws GATInvocationException {
    SoftwareDescription sd = description.getSoftwareDescription();
    ResourceDescription rd = description.getResourceDescription();

    // Corrected initialization of rd_HashMap: rd may be null ... --Ceriel
    HashMap<String, Object> rd_HashMap = null;
    if (rd != null) {
        rd_HashMap = (HashMap<String, Object>) rd.getDescription();
    }
    if (rd_HashMap == null) {
        rd_HashMap = new HashMap<String, Object>();
    }

    java.io.File temp = new java.io.File("lsf" + Math.random());
    LSFScriptWriter script = null;
    try {
        script = new LSFScriptWriter(new BufferedWriter(new FileWriter(temp)));
        String userScript = (String) gatContext.getPreferences().get(SSHLSF_SCRIPT);
        if (userScript != null) {
            // a specified job script overrides everything, except for
            // pre-staging, post-staging,
            // and exit status.
            // try-with-resources: the reader is closed even when copying fails
            try (BufferedReader userScriptReader = new BufferedReader(new FileReader(userScript))) {
                String line;
                while ((line = userScriptReader.readLine()) != null) {
                    script.print(line + "\n");
                }
            }
        } else {
            script.print("#!/bin/sh\n");
            script.print("# bsub script automatically generated by GAT SshLsf adaptor\n");

            // Resources: queue, walltime, memory size, et cetera.
            String queue = (String) rd_HashMap.get("machine.queue");
            if (queue == null) {
                queue = sd.getStringAttribute(SoftwareDescription.JOB_QUEUE, null);
            }
            if (queue != null) {
                script.addOption("q", queue);
            }

            Integer cpus = (Integer) rd_HashMap.get(HardwareResourceDescription.CPU_COUNT);
            if (cpus == null) {
                cpus = sd.getIntAttribute("coreCount", 1);
            }
            script.addOption("n", cpus);
            // In a single node
            script.addOption("R", "\"span[ptile=" + cpus + "]\"");

            long walltime = sd.getLongAttribute(SoftwareDescription.WALLTIME_MAX, -1L);
            if (walltime > 0) {
                // Emit the whole limit. The previous "walltime % 60" dropped every full hour
                // (e.g. 90 became 30, and 120 became 0).
                // NOTE(review): assumes WALLTIME_MAX is expressed in minutes, as bsub -W expects - confirm
                script.addOption("W", String.valueOf(walltime));
            } else {
                // No limit requested: default of 60
                script.addOption("W", 60);
            }

            // Native flags: resource description first, then software description, then preferences
            String nativeFlags = null;
            Object o = rd == null ? null : rd.getResourceAttribute(SSHLSF_NATIVE_FLAGS);
            if (o instanceof String) {
                nativeFlags = (String) o;
            } else {
                String s = sd.getStringAttribute(SSHLSF_NATIVE_FLAGS, null);
                if (s != null) {
                    nativeFlags = s;
                } else {
                    o = gatContext.getPreferences().get(SSHLSF_NATIVE_FLAGS);
                    if (o instanceof String) {
                        nativeFlags = (String) o;
                    }
                }
            }
            if (nativeFlags != null) {
                // Individual flags are separated by "##"
                for (String flag : nativeFlags.split("##")) {
                    script.addString(flag);
                }
            }

            // Sandbox root, with a trailing separator, used as prefix of the stdout/stderr file names
            String path = sd.getStringAttribute(SoftwareDescription.SANDBOX_ROOT, "");
            if (!path.isEmpty() && !path.endsWith(File.separator)) {
                path = path + File.separator;
            }
            // Set working dir.
            // job.addOption("cwd", path);

            // Name for the job: resource description, else broker user info, else a generated default
            String jobname = (String) rd_HashMap.get("Jobname");
            if (jobname == null) {
                jobname = brokerURI.getUserInfo();
                if (jobname == null || "".equals(jobname)) {
                    jobname = "compss_remotejob_" + System.getProperty("user.name");
                }
            }
            script.addOption("J", jobname);

            if (sd.getStdout() != null) {
                script.addOption("oo", path + sd.getStdout().getName());
            }
            if (sd.getStderr() != null) {
                script.addOption("eo", path + sd.getStderr().getName());
            }
            addScriptExecution(script, sd, rd);
        }
        script.print("echo retvalue = $? > " + returnValueFile + "\n");
    } catch (Throwable e) {
        throw new GATInvocationException("Cannot create temporary bsub file " + temp.getAbsolutePath(), e);
    } finally {
        if (script != null) {
            script.close();
        }
    }
    return temp;
}
Aggregations