Search in sources :

Example 1 with Job

use of org.gridlab.gat.resources.Job in project compss by bsc-wdc.

the class GATJob method processMetricEvent.

/**
 * MetricListener callback invoked when the GAT job behind this object changes state.
 *
 * Only two states are acted upon; any other transition is ignored:
 * <ul>
 * <li>{@code STOPPED}: the job is removed from {@code RUNNING_JOBS} and reported as completed or as
 * {@code EXECUTION_FAILED}. With the globus adaptor the decision is taken from the size of the job's
 * stderr file, otherwise from the job's exit status.</li>
 * <li>{@code SUBMISSION_ERROR}: the job is removed from {@code RUNNING_JOBS} and reported as
 * {@code SUBMISSION_FAILED}, unless globus is in use and reports {@code resManError == "NO_ERROR"}, in
 * which case it is reported as completed.</li>
 * </ul>
 * In both cases the tracer slot stored in the software description is freed first when tracing is on.
 *
 * @param value event whose source is the GAT {@code Job} and whose value is its new {@code JobState}
 */
@Override
public void processMetricEvent(MetricEvent value) {
    Job job = (Job) value.getSource();
    JobState newJobState = (JobState) value.getValue();
    JobDescription jd = (JobDescription) job.getJobDescription();
    SoftwareDescription sd = jd.getSoftwareDescription();
    Integer jobId = (Integer) sd.getAttributes().get("jobId");
    logger.debug("Processing job ID = " + jobId);
    /*
     * Check if either the job has finished or there has been a submission error. We don't care about other state
     * transitions
     */
    if (newJobState == JobState.STOPPED) {
        if (Tracer.isActivated()) {
            Integer slot = (Integer) sd.getAttributes().get("slot");
            String host = getResourceNode().getHost();
            Tracer.freeSlot(host, slot);
        }
        /*
         * We must check whether the chosen adaptor is globus. In that case, since globus doesn't provide the exit
         * status of a job, we must examine the standard error file
         */
        try {
            if (usingGlobus) {
                File errFile = sd.getStderr();
                // Error file should always be in the same host as the IT
                File localFile = GAT.createFile(context, errFile.toGATURI());
                if (localFile.length() > 0) {
                    // A non-empty stderr file is taken as the sign of a failed execution
                    GATjob = null;
                    RUNNING_JOBS.remove(this);
                    ErrorManager.warn("Error when creating file.");
                    listener.jobFailed(this, JobEndStatus.EXECUTION_FAILED);
                } else {
                    // Keep the (empty) stderr file around only when debugging
                    if (!debug) {
                        localFile.delete();
                    }
                    RUNNING_JOBS.remove(this);
                    listener.jobCompleted(this);
                }
            } else {
                if (job.getExitStatus() == 0) {
                    RUNNING_JOBS.remove(this);
                    listener.jobCompleted(this);
                } else {
                    GATjob = null;
                    RUNNING_JOBS.remove(this);
                    listener.jobFailed(this, JobEndStatus.EXECUTION_FAILED);
                }
            }
        } catch (Exception e) {
            ErrorManager.fatal(CALLBACK_PROCESSING_ERR + ": " + this, e);
        }
    } else if (newJobState == JobState.SUBMISSION_ERROR) {
        if (Tracer.isActivated()) {
            Integer slot = (Integer) sd.getAttributes().get("slot");
            String host = getResourceNode().getHost();
            Tracer.freeSlot(host, slot);
        }
        try {
            // Constant-first comparison: a missing "resManError" entry must be treated as a real
            // submission failure instead of raising a NullPointerException out of the callback.
            if (usingGlobus && "NO_ERROR".equals(job.getInfo().get("resManError"))) {
                RUNNING_JOBS.remove(this);
                listener.jobCompleted(this);
            } else {
                GATjob = null;
                RUNNING_JOBS.remove(this);
                listener.jobFailed(this, JobEndStatus.SUBMISSION_FAILED);
            }
        } catch (GATInvocationException e) {
            ErrorManager.fatal(CALLBACK_PROCESSING_ERR + ": " + this, e);
        }
    }
}
Also used : JobDescription(org.gridlab.gat.resources.JobDescription) GATInvocationException(org.gridlab.gat.GATInvocationException) JobState(org.gridlab.gat.resources.Job.JobState) Job(org.gridlab.gat.resources.Job) File(org.gridlab.gat.io.File) SoftwareDescription(org.gridlab.gat.resources.SoftwareDescription) GATInvocationException(org.gridlab.gat.GATInvocationException)

Example 2 with Job

use of org.gridlab.gat.resources.Job in project compss by bsc-wdc.

the class GATJob method submit.

/**
 * Builds the GAT job description for this job and submits it through the resource broker of the
 * destination read from the description's {@code RES_ATTR} resource attribute. Brokers are looked up
 * in {@code brokers} and created on first use.
 *
 * This object is passed to the broker as the listener for {@code JOB_STATUS} notifications. If the
 * submission fails, the tracer slot reserved for the job is freed (when tracing is on) and the
 * exception is rethrown.
 *
 * @throws Exception if the job cannot be prepared, the broker cannot be created or the submission fails
 */
@Override
public void submit() throws Exception {
    logger.info("Submit GATJob with ID " + jobId);

    // Build the description of what has to be run
    JobDescription jobDescr = prepareJob();

    // Reuse the broker already known for the destination, creating and registering it otherwise
    String dest = (String) jobDescr.getResourceDescription().getResourceAttribute(RES_ATTR);
    ResourceBroker broker = brokers.get(dest);
    if (broker == null) {
        broker = GAT.createResourceBroker(context, new URI(dest));
        brokers.put(dest, broker);
    }

    // Submit, asking to be notified (through this object) about job state transitions
    Job submitted;
    try {
        submitted = broker.submitJob(jobDescr, this, JOB_STATUS);
        RUNNING_JOBS.add(this);
    } catch (Exception e) {
        if (Tracer.isActivated()) {
            String host = ((GATWorkerNode) worker.getNode()).getHost();
            Integer slot = (Integer) jobDescr.getSoftwareDescription().getAttributes().get("slot");
            Tracer.freeSlot(host, slot);
        }
        throw e;
    }

    // Remember the GAT job that backs this object
    GATjob = submitted;
}
Also used : JobDescription(org.gridlab.gat.resources.JobDescription) ResourceBroker(org.gridlab.gat.resources.ResourceBroker) Job(org.gridlab.gat.resources.Job) URI(org.gridlab.gat.URI) GATInvocationException(org.gridlab.gat.GATInvocationException)

Example 3 with Job

use of org.gridlab.gat.resources.Job in project compss by bsc-wdc.

the class GATScriptExecutor method executeScript.

/**
 * Queues one software description per non-null script and waits, for at most 60 seconds, until every
 * resulting job has been taken from {@code jobQueue} in a final state.
 *
 * For each script a {@code SoftwareDescription} is built (executable = script path, arguments =
 * the matching entry of {@code params} split on single spaces) and pushed to {@code sdQueue}.
 * NOTE(review): the threads of {@code pool} presumably consume {@code sdQueue} and publish the
 * submitted jobs into {@code jobQueue}; that code is not visible here.
 *
 * When {@code debug} is on, the jobs' stdout/stderr are redirected to
 * {@code <APP_LOG_DIR>/<stdOutFileName>.out|.err}; afterwards both files are copied line by line to
 * the logger and deleted.
 *
 * @param scripts        URIs of the scripts to run; {@code null} entries are skipped
 * @param params         argument string for each script, indexed like {@code scripts}
 * @param stdOutFileName base name (without extension) of the debug output files
 * @return {@code false} if the thread pool could not be started or stopped, or if preparing a script
 *         failed; {@code true} otherwise (also when the wait ended by timeout or interruption)
 */
public boolean executeScript(List<URI> scripts, List<String> params, String stdOutFileName) {
    try {
        pool.startThreads();
    } catch (Exception e) {
        logger.error(THREAD_POOL_START_ERR, e);
        return false;
    }
    synchronized (jobQueue) {
        jobCount = scripts.size();
    }
    for (int i = 0; i < scripts.size(); i++) {
        URI script = scripts.get(i);
        String cleanParam = params.get(i);
        if (script == null) {
            // Nothing is queued for a skipped script, so it must not be waited for either;
            // otherwise the pending counter could never reach zero.
            synchronized (jobQueue) {
                jobCount--;
            }
            continue;
        }
        if (debug) {
            logger.debug("Clean call: " + script + " " + cleanParam);
        }
        try {
            if (!node.isUserNeeded() && script.getUserInfo() != null) {
                // Remove user from the URI
                script.setUserInfo(null);
            }
            String user = script.getUserInfo();
            if (user == null) {
                user = "";
            } else {
                user += "@";
            }
            SoftwareDescription sd = new SoftwareDescription();
            sd.addAttribute("uri", Protocol.ANY_URI.getSchema() + user + script.getHost());
            sd.setExecutable(script.getPath());
            sd.setArguments(cleanParam.split(" "));
            sd.addAttribute("job_number", i);
            sd.addAttribute(SoftwareDescription.SANDBOX_ROOT, File.separator + "tmp" + File.separator);
            sd.addAttribute(SoftwareDescription.SANDBOX_USEROOT, "true");
            sd.addAttribute(SoftwareDescription.SANDBOX_DELETE, "false");
            if (debug) {
                // Redirect the job's output to files; a failure here is logged but does not abort the job
                try {
                    org.gridlab.gat.io.File outFile = GAT.createFile(node.getContext(), Protocol.ANY_URI.getSchema() + File.separator + System.getProperty(COMPSsConstants.APP_LOG_DIR) + File.separator + stdOutFileName + ".out");
                    sd.setStdout(outFile);
                    org.gridlab.gat.io.File errFile = GAT.createFile(node.getContext(), Protocol.ANY_URI.getSchema() + File.separator + System.getProperty(COMPSsConstants.APP_LOG_DIR) + File.separator + stdOutFileName + ".err");
                    sd.setStderr(errFile);
                } catch (Exception e) {
                    logger.error(CLEAN_JOB_ERR, e);
                }
            }
            sdQueue.enqueue(sd);
        } catch (Exception e) {
            logger.error(CLEAN_JOB_ERR, e);
            return false;
        }
    }
    // Primitive long: no boxing/unboxing on every loop iteration
    long deadline = System.currentTimeMillis() + 60_000L;
    // Poll for completion of the clean jobs
    while (jobCount > 0 && System.currentTimeMillis() < deadline) {
        Job job = jobQueue.dequeue();
        if (job == null) {
            synchronized (jobQueue) {
                jobCount--;
            }
        } else if (job.getState() == JobState.STOPPED) {
            synchronized (jobQueue) {
                jobCount--;
            }
        } else if (job.getState() == JobState.SUBMISSION_ERROR) {
            logger.error(CLEAN_JOB_ERR + ": " + job);
            synchronized (jobQueue) {
                jobCount--;
            }
        } else {
            // Still in progress: put it back and check again shortly
            jobQueue.enqueue(job);
            try {
                Thread.sleep(50);
            } catch (InterruptedException ie) {
                // Keep the interruption visible to the caller and stop waiting,
                // exactly as if the deadline had expired.
                Thread.currentThread().interrupt();
                break;
            }
        }
    }
    try {
        pool.stopThreads();
    } catch (Exception e) {
        logger.error(THREAD_POOL_STOP_ERR, e);
        return false;
    }
    // Move cleanX.out logs to default logger
    if (debug) {
        String stdOutFilePath = System.getProperty(COMPSsConstants.APP_LOG_DIR) + File.separator + stdOutFileName + ".out";
        try (FileReader cleanOut = new FileReader(stdOutFilePath);
            BufferedReader br = new BufferedReader(cleanOut)) {
            String line = br.readLine();
            while (line != null) {
                logger.debug(line);
                line = br.readLine();
            }
        } catch (Exception e) {
            logger.error("Error moving std out file", e);
        }
        // Delete file
        if (!new File(stdOutFilePath).delete()) {
            logger.error("Error deleting out file " + stdOutFilePath);
        }
    }
    // Move cleanX.err logs to default logger
    if (debug) {
        String stdErrFilePath = System.getProperty(COMPSsConstants.APP_LOG_DIR) + File.separator + stdOutFileName + ".err";
        try (FileReader cleanErr = new FileReader(stdErrFilePath);
            BufferedReader br = new BufferedReader(cleanErr)) {
            String line = br.readLine();
            while (line != null) {
                logger.error(line);
                line = br.readLine();
            }
        } catch (Exception e) {
            logger.error("Error moving std err file", e);
        }
        // Delete file
        if (!new File(stdErrFilePath).delete()) {
            logger.error("Error deleting err file " + stdErrFilePath);
        }
    }
    return true;
}
Also used : URI(org.gridlab.gat.URI) SoftwareDescription(org.gridlab.gat.resources.SoftwareDescription) BufferedReader(java.io.BufferedReader) FileReader(java.io.FileReader) Job(org.gridlab.gat.resources.Job) File(java.io.File)

Example 4 with Job

use of org.gridlab.gat.resources.Job in project compss by bsc-wdc.

the class LsfResourceBrokerAdaptor method submitJob.

/*
     * (non-Javadoc)
     *
     * Submits a job by wrapping its executable in a "blaunch" command and starting it as a local
     * process bundle. The returned job is the LsfJob itself, or a WrapperJobCpi around it when the
     * description is a WrapperJobDescription.
     *
     * Steps, in order: validate the description, create the sandbox and the LsfJob, register the
     * metric listener, pre-stage the sandbox, build and start the blaunch process bundle, wire
     * stdout/stderr/stdin to the files named in the software description, start state monitoring.
     *
     * Throws GATInvocationException when the description is not a JobDescription, has no software
     * description, has a process count < 1 or a resource count != 1, when "user.home" is not set,
     * when the sandbox directory does not exist, or when a stream to/from a file cannot be created.
     * Throws CommandNotFoundException when starting the process bundle fails with an IOException.
     *
     * @see org.gridlab.gat.resources.ResourceBroker#submitJob(org.gridlab.gat.resources.JobDescription)
     */
public Job submitJob(AbstractJobDescription abstractDescription, MetricListener listener, String metricDefinitionName) throws GATInvocationException {
    // --- Validation of the incoming description ---
    if (!(abstractDescription instanceof JobDescription)) {
        throw new GATInvocationException("can only handle JobDescriptions: " + abstractDescription.getClass());
    }
    JobDescription description = (JobDescription) abstractDescription;
    SoftwareDescription sd = description.getSoftwareDescription();
    if (sd == null) {
        throw new GATInvocationException("The job description does not contain a software description");
    }
    if (description.getProcessCount() < 1) {
        throw new GATInvocationException("Adaptor cannot handle: process count < 1: " + description.getProcessCount());
    }
    // NOTE(review): the check rejects any resource count other than 1 (including 0), although the
    // message only mentions "> 1".
    if (description.getResourceCount() != 1) {
        throw new GATInvocationException("Adaptor cannot handle: resource count > 1: " + description.getResourceCount());
    }
    String home = System.getProperty("user.home");
    if (home == null) {
        throw new GATInvocationException("lsf broker could not get user home dir");
    }
    // Sandbox rooted at the user's home on "localhost".
    // NOTE(review): the meaning of the four boolean flags is defined by Sandbox's constructor,
    // which is not visible here.
    Sandbox sandbox = new Sandbox(gatContext, description, "localhost", home, true, true, false, false);
    LsfJob lsfJob = new LsfJob(gatContext, description, sandbox);
    Job job = null;
    if (description instanceof WrapperJobDescription) {
        // For wrapper descriptions the wrapper is both the job handed back to the caller and the
        // listener registered on the underlying LsfJob (it receives the caller's listener).
        WrapperJobCpi tmp = new WrapperJobCpi(gatContext, lsfJob, listener, metricDefinitionName);
        listener = tmp;
        job = tmp;
    } else {
        job = lsfJob;
    }
    // Register the listener before the first state change so that no transition is missed
    if (listener != null && metricDefinitionName != null) {
        Metric metric = lsfJob.getMetricDefinitionByName(metricDefinitionName).createMetric(null);
        lsfJob.addMetricListener(listener, metric);
    }
    // --- Pre-staging ---
    lsfJob.setState(Job.JobState.PRE_STAGING);
    lsfJob.waitForTrigger(Job.JobState.PRE_STAGING);
    sandbox.prestage();
    // Prefer the executable resolved by the sandbox; fall back to the one in the description
    String exe;
    if (sandbox.getResolvedExecutable() != null) {
        exe = sandbox.getResolvedExecutable().getPath();
    // try to set the executable bit, it might be lost
    /* CDIAZ: The command "exe" can be also in a remote host
             * 		  The command must have the right permissions in the remote host
            try {
                new CommandRunner("chmod", "+x", exe);
            } catch (Throwable t) {
                // ignore
            }
            */
    } else {
        exe = getExecutable(description);
    }
    String[] args = getArgumentsArray(description);
    // Directory where the lsf command will be executed
    java.io.File f = new java.io.File(sandbox.getSandboxPath());
    if (!f.exists()) {
        throw new GATInvocationException("Unable to find directory " + f.getAbsolutePath());
    }
    // Check and set the environment for a blaunch command
    Map<String, Object> env = sd.getEnvironment();
    this.prepareBLaunchEnv(env);
    // Encapsulate the original command into a blaunch command targeting the broker URI's host
    String host = brokerURI.getHost();
    String blExe = this.getBlaunchCommand();
    String[] blArgs = this.getBlaunchArgs(host, exe, args);
    ProcessBundle bundle = new ProcessBundle(description.getProcessCount(), blExe, blArgs, f, env);
    // --- Launch ---
    lsfJob.setSubmissionTime();
    lsfJob.setState(Job.JobState.SCHEDULED);
    try {
        // The job is flagged RUNNING before the processes are actually started
        lsfJob.setState(Job.JobState.RUNNING);
        lsfJob.waitForTrigger(Job.JobState.RUNNING);
        lsfJob.setStartTime();
        bundle.startBundle();
        lsfJob.setProcess(bundle);
    } catch (IOException e) {
        throw new CommandNotFoundException("LsfResourceBrokerAdaptor", e);
    }
    // --- stderr: when not streamed to the caller, forward it to a file or drain it ---
    if (!sd.streamingStderrEnabled()) {
        try {
            if (sd.getStderr() != null) {
                // forward to the file named in the software description
                OutputStream err = GAT.createFileOutputStream(gatContext, sd.getStderr());
                StreamForwarder forwarder = new StreamForwarder(bundle.getStderr(), err, sd.getExecutable() + " [stderr]");
                lsfJob.setErrorStream(forwarder);
                if (logger.isDebugEnabled()) {
                    logger.debug("Created stderr forwarder to file " + sd.getStderr());
                }
            } else {
                // or throw it away
                new StreamForwarder(bundle.getStderr(), null, sd.getExecutable() + " [stderr]");
            }
        } catch (GATObjectCreationException e) {
            throw new GATInvocationException("Unable to create file output stream for stderr!", e);
        }
    }
    // --- stdout: same treatment as stderr ---
    if (!sd.streamingStdoutEnabled()) {
        // read away the stdout
        try {
            if (sd.getStdout() != null) {
                // to file
                OutputStream out = GAT.createFileOutputStream(gatContext, sd.getStdout());
                StreamForwarder forwarder = new StreamForwarder(bundle.getStdout(), out, sd.getExecutable() + " [stdout]");
                lsfJob.setOutputStream(forwarder);
                if (logger.isDebugEnabled()) {
                    logger.debug("Created stdout forwarder to file " + sd.getStdout());
                }
            } else {
                // or throw it away
                new StreamForwarder(bundle.getStdout(), null, sd.getExecutable() + " [stdout]");
            }
        } catch (GATObjectCreationException e) {
            throw new GATInvocationException("Unable to create file output stream for stdout!", e);
        }
    }
    // --- stdin: only fed when it is not streamed and an input file was given ---
    if (!sd.streamingStdinEnabled() && sd.getStdin() != null) {
        // forward the stdin from file
        try {
            InputStream in = GAT.createFileInputStream(gatContext, sd.getStdin());
            bundle.setStdin(sd.getExecutable(), in);
        } catch (GATObjectCreationException e) {
            throw new GATInvocationException("Unable to create file input stream for stdin!", e);
        }
    }
    // Start tracking the job's state, then hand back the job (wrapper or LsfJob, see above)
    lsfJob.monitorState();
    return job;
}
Also used : ProcessBundle(org.gridlab.gat.engine.util.ProcessBundle) GATObjectCreationException(org.gridlab.gat.GATObjectCreationException) InputStream(java.io.InputStream) OutputStream(java.io.OutputStream) StreamForwarder(org.gridlab.gat.engine.util.StreamForwarder) IOException(java.io.IOException) SoftwareDescription(org.gridlab.gat.resources.SoftwareDescription) Sandbox(org.gridlab.gat.resources.cpi.Sandbox) WrapperJobDescription(org.gridlab.gat.resources.WrapperJobDescription) JobDescription(org.gridlab.gat.resources.JobDescription) AbstractJobDescription(org.gridlab.gat.resources.AbstractJobDescription) GATInvocationException(org.gridlab.gat.GATInvocationException) WrapperJobCpi(org.gridlab.gat.resources.cpi.WrapperJobCpi) Metric(org.gridlab.gat.monitoring.Metric) Job(org.gridlab.gat.resources.Job) WrapperJobDescription(org.gridlab.gat.resources.WrapperJobDescription) CommandNotFoundException(org.gridlab.gat.CommandNotFoundException)

Example 5 with Job

use of org.gridlab.gat.resources.Job in project compss by bsc-wdc.

the class SshLSFJob method getJobState.

/**
 * Refreshes this job's state by running {@code bjobs -noheader <jobID>} through {@code jobHelper}
 * and mapping the status column of its output to a GAT {@code JobState}.
 *
 * Only one refresh runs at a time: callers wait on this object's monitor while
 * {@code jobStateBusy} is set. Nothing is done when the current state is already
 * {@code POST_STAGING}, {@code STOPPED} or {@code SUBMISSION_ERROR}. A mapped result of
 * {@code STOPPED} is recorded as {@code POST_STAGING}; any other result is recorded as is.
 *
 * @param jobID identifier passed to {@code bjobs}
 * @throws GATInvocationException if the helper job cannot be set up, does not stop with exit status
 *         0, or its output cannot be read
 */
protected void getJobState(String jobID) throws GATInvocationException {
    // Serialize state refreshes
    synchronized (this) {
        while (jobStateBusy) {
            try {
                wait();
            } catch (InterruptedException e) {
            // ignored
            }
        }
        jobStateBusy = true;
    }
    JobState resultState;
    try {
        // Nothing left to query once the job reached one of these states
        if (state == JobState.POST_STAGING || state == JobState.STOPPED || state == JobState.SUBMISSION_ERROR) {
            return;
        }
        logger.debug("Getting task status in setState()");
        // The status command writes its output to a local temporary file
        java.io.File squeueResultFile = null;
        try {
            // Create the status query job
            SoftwareDescription sd = new SoftwareDescription();
            // Run through /bin/sh so that the pipe to awk is interpreted by a shell
            sd.setExecutable("/bin/sh");
            sd.setArguments("-c", "bjobs -noheader " + jobID + " | awk {' print $3 '}");
            sd.addAttribute(SoftwareDescription.SANDBOX_USEROOT, "true");
            squeueResultFile = java.io.File.createTempFile("GAT", "tmp");
            try {
                sd.setStdout(GAT.createFile(subContext, new URI("file:///" + squeueResultFile.getAbsolutePath().replace(File.separatorChar, '/'))));
            } catch (Throwable e1) {
                throw new GATInvocationException("Could not create GAT object for temporary " + squeueResultFile.getAbsolutePath(), e1);
            }
            JobDescription jd = new JobDescription(sd);
            Job job = jobHelper.submitJob(jd, this, "job.status");
            // Block until the helper job reaches a final state
            synchronized (job) {
                while (job.getState() != Job.JobState.STOPPED && job.getState() != Job.JobState.SUBMISSION_ERROR) {
                    try {
                        job.wait();
                    } catch (InterruptedException e) {
                    // ignore
                    }
                }
            }
            if (job.getState() != Job.JobState.STOPPED || job.getExitStatus() != 0) {
                throw new GATInvocationException("Could not submit squeue job " + sd.toString());
            }
            // submit success: read the single status line, closing the reader in every case so the
            // temporary file is not kept open when it is deleted below
            String status;
            try (BufferedReader in = new BufferedReader(new FileReader(squeueResultFile.getAbsolutePath()))) {
                status = in.readLine();
            }
            // No output line at all: use "" so that the mapping still gets a non-null value
            if (status == null) {
                status = "";
            }
            if (logger.isDebugEnabled()) {
                logger.debug("squeue line: " + status);
            }
            resultState = mapLSFStatetoGAT(status);
        } catch (IOException e) {
            logger.debug("retrieving job status sshslurmjob failed", e);
            throw new GATInvocationException("Unable to retrieve the Job Status", e);
        } finally {
            // The file is still null when createTempFile itself failed; guarding here keeps that
            // original failure from being replaced by a NullPointerException
            if (squeueResultFile != null) {
                squeueResultFile.delete();
            }
        }
    } finally {
        // Let the next waiting caller in, whatever happened above
        synchronized (this) {
            jobStateBusy = false;
            notifyAll();
        }
    }
    if (resultState != JobState.STOPPED) {
        setState(resultState);
    } else {
        setState(JobState.POST_STAGING);
    }
}
Also used : IOException(java.io.IOException) URI(org.gridlab.gat.URI) File(java.io.File) SoftwareDescription(org.gridlab.gat.resources.SoftwareDescription) JobDescription(org.gridlab.gat.resources.JobDescription) GATInvocationException(org.gridlab.gat.GATInvocationException) BufferedReader(java.io.BufferedReader) FileReader(java.io.FileReader) SshLSFJob(org.gridlab.gat.resources.cpi.sshlsf.SshLSFJob) Job(org.gridlab.gat.resources.Job)

Aggregations

Job (org.gridlab.gat.resources.Job)27 JobDescription (org.gridlab.gat.resources.JobDescription)25 SoftwareDescription (org.gridlab.gat.resources.SoftwareDescription)24 URI (org.gridlab.gat.URI)23 GATInvocationException (org.gridlab.gat.GATInvocationException)20 ResourceBroker (org.gridlab.gat.resources.ResourceBroker)19 GATObjectCreationException (org.gridlab.gat.GATObjectCreationException)14 URISyntaxException (java.net.URISyntaxException)13 IOException (java.io.IOException)10 BufferedReader (java.io.BufferedReader)7 Preferences (org.gridlab.gat.Preferences)6 File (org.gridlab.gat.io.File)5 File (java.io.File)4 InputStreamReader (java.io.InputStreamReader)4 AbstractJobDescription (org.gridlab.gat.resources.AbstractJobDescription)4 WrapperJobDescription (org.gridlab.gat.resources.WrapperJobDescription)4 FileReader (java.io.FileReader)3 InputStream (java.io.InputStream)3 GATContext (org.gridlab.gat.GATContext)3 Metric (org.gridlab.gat.monitoring.Metric)3