Search in sources :

Example 1 with SoftwareDescription

use of org.gridlab.gat.resources.SoftwareDescription in project compss by bsc-wdc.

the class GATJob method processMetricEvent.

// MetricListener interface implementation
/**
 * Callback invoked when the monitored GAT job changes state.
 *
 * Only two transitions are handled: {@code STOPPED} (the job ended) and {@code SUBMISSION_ERROR}.
 * Every other state is ignored. In both handled cases the tracing slot is released (when tracing is
 * active), this job is removed from {@code RUNNING_JOBS}, and the listener is told whether the job
 * completed or failed.
 *
 * @param value metric event whose source is the GAT {@code Job} and whose value is the new {@code JobState}
 */
@Override
public void processMetricEvent(MetricEvent value) {
    Job job = (Job) value.getSource();
    JobState newJobState = (JobState) value.getValue();
    JobDescription jd = (JobDescription) job.getJobDescription();
    SoftwareDescription sd = jd.getSoftwareDescription();
    Integer jobId = (Integer) sd.getAttributes().get("jobId");
    logger.debug("Processing job ID = " + jobId);

    // We only care about job termination and submission errors; other transitions are ignored
    if (newJobState != JobState.STOPPED && newJobState != JobState.SUBMISSION_ERROR) {
        return;
    }

    // Both handled transitions release the tracing slot before notifying the listener
    if (Tracer.isActivated()) {
        Integer slot = (Integer) sd.getAttributes().get("slot");
        String host = getResourceNode().getHost();
        Tracer.freeSlot(host, slot);
    }

    if (newJobState == JobState.STOPPED) {
        try {
            if (usingGlobus) {
                /*
                 * Globus doesn't provide the exit status of a job, so the standard error file is examined
                 * instead: a non-empty file is treated as a failed execution
                 */
                File errFile = sd.getStderr();
                // Error file should always be in the same host as the IT
                File localFile = GAT.createFile(context, errFile.toGATURI());
                if (localFile.length() > 0) {
                    GATjob = null;
                    RUNNING_JOBS.remove(this);
                    ErrorManager.warn("Error when creating file.");
                    listener.jobFailed(this, JobEndStatus.EXECUTION_FAILED);
                } else {
                    // The empty error file is only kept around when debugging
                    if (!debug) {
                        localFile.delete();
                    }
                    RUNNING_JOBS.remove(this);
                    listener.jobCompleted(this);
                }
            } else if (job.getExitStatus() == 0) {
                RUNNING_JOBS.remove(this);
                listener.jobCompleted(this);
            } else {
                GATjob = null;
                RUNNING_JOBS.remove(this);
                listener.jobFailed(this, JobEndStatus.EXECUTION_FAILED);
            }
        } catch (Exception e) {
            ErrorManager.fatal(CALLBACK_PROCESSING_ERR + ": " + this, e);
        }
    } else {
        // newJobState == JobState.SUBMISSION_ERROR
        try {
            // Constant-first equals: a job info map without a "resManError" entry must be reported as a
            // submission failure instead of raising a NullPointerException out of this callback
            if (usingGlobus && "NO_ERROR".equals(job.getInfo().get("resManError"))) {
                RUNNING_JOBS.remove(this);
                listener.jobCompleted(this);
            } else {
                GATjob = null;
                RUNNING_JOBS.remove(this);
                listener.jobFailed(this, JobEndStatus.SUBMISSION_FAILED);
            }
        } catch (GATInvocationException e) {
            ErrorManager.fatal(CALLBACK_PROCESSING_ERR + ": " + this, e);
        }
    }
}
Also used : JobDescription(org.gridlab.gat.resources.JobDescription) GATInvocationException(org.gridlab.gat.GATInvocationException) JobState(org.gridlab.gat.resources.Job.JobState) Job(org.gridlab.gat.resources.Job) File(org.gridlab.gat.io.File) SoftwareDescription(org.gridlab.gat.resources.SoftwareDescription)

Example 2 with SoftwareDescription

use of org.gridlab.gat.resources.SoftwareDescription in project compss by bsc-wdc.

the class GATScriptExecutor method executeScript.

/**
 * Submits one job per script through the thread pool, waits up to 60 seconds for the jobs to finish
 * and, in debug mode, copies the scripts' stdout/stderr files into the logger.
 *
 * @param scripts URIs of the scripts to run; {@code null} entries are skipped
 * @param params space-separated argument string for each script, indexed like {@code scripts}
 * @param stdOutFileName base name (without extension) of the {@code .out}/{@code .err} files created
 *        under the application log directory when debug is enabled
 * @return {@code false} if the thread pool could not be started or stopped or a job description could
 *         not be built; {@code true} otherwise (including when the wait timed out)
 */
public boolean executeScript(List<URI> scripts, List<String> params, String stdOutFileName) {
    try {
        pool.startThreads();
    } catch (Exception e) {
        logger.error(THREAD_POOL_START_ERR, e);
        return false;
    }
    synchronized (jobQueue) {
        jobCount = scripts.size();
    }
    for (int i = 0; i < scripts.size(); i++) {
        URI script = scripts.get(i);
        String cleanParam = params.get(i);
        if (script == null) {
            continue;
        }
        if (debug) {
            logger.debug("Clean call: " + script + " " + cleanParam);
        }
        try {
            if (!node.isUserNeeded() && script.getUserInfo() != null) {
                // Remove user from the URI
                script.setUserInfo(null);
            }
            String user = script.getUserInfo();
            if (user == null) {
                user = "";
            } else {
                user += "@";
            }
            SoftwareDescription sd = new SoftwareDescription();
            sd.addAttribute("uri", Protocol.ANY_URI.getSchema() + user + script.getHost());
            sd.setExecutable(script.getPath());
            sd.setArguments(cleanParam.split(" "));
            sd.addAttribute("job_number", i);
            sd.addAttribute(SoftwareDescription.SANDBOX_ROOT, File.separator + "tmp" + File.separator);
            sd.addAttribute(SoftwareDescription.SANDBOX_USEROOT, "true");
            sd.addAttribute(SoftwareDescription.SANDBOX_DELETE, "false");
            if (debug) {
                // Redirect the script output to files; a failure here is logged but does not abort the job
                String logUriBase = Protocol.ANY_URI.getSchema() + File.separator
                        + System.getProperty(COMPSsConstants.APP_LOG_DIR) + File.separator + stdOutFileName;
                try {
                    org.gridlab.gat.io.File outFile = GAT.createFile(node.getContext(), logUriBase + ".out");
                    sd.setStdout(outFile);
                    org.gridlab.gat.io.File errFile = GAT.createFile(node.getContext(), logUriBase + ".err");
                    sd.setStderr(errFile);
                } catch (Exception e) {
                    logger.error(CLEAN_JOB_ERR, e);
                }
            }
            sdQueue.enqueue(sd);
        } catch (Exception e) {
            logger.error(CLEAN_JOB_ERR, e);
            return false;
        }
    }

    // Poll for completion of the clean jobs, giving up after one minute
    final long deadline = System.currentTimeMillis() + 60_000L;
    boolean interrupted = false;
    while (jobCount > 0 && System.currentTimeMillis() < deadline) {
        Job job = jobQueue.dequeue();
        if (job == null || job.getState() == JobState.STOPPED) {
            synchronized (jobQueue) {
                jobCount--;
            }
        } else if (job.getState() == JobState.SUBMISSION_ERROR) {
            logger.error(CLEAN_JOB_ERR + ": " + job);
            synchronized (jobQueue) {
                jobCount--;
            }
        } else {
            // Still running: put it back and retry shortly
            jobQueue.enqueue(job);
            try {
                Thread.sleep(50);
            } catch (InterruptedException ie) {
                // Keep polling (the wait is bounded by the deadline) but remember the interruption so
                // it can be restored for the caller instead of being silently lost
                interrupted = true;
            }
        }
    }

    try {
        pool.stopThreads();
    } catch (Exception e) {
        logger.error(THREAD_POOL_STOP_ERR, e);
        return false;
    } finally {
        if (interrupted) {
            Thread.currentThread().interrupt();
        }
    }

    if (debug) {
        String logPathBase = System.getProperty(COMPSsConstants.APP_LOG_DIR) + File.separator + stdOutFileName;
        // Move cleanX.out logs to default logger
        moveCleanLogToLogger(logPathBase + ".out", "out", false);
        // Move cleanX.err logs to default logger
        moveCleanLogToLogger(logPathBase + ".err", "err", true);
    }
    return true;
}

/**
 * Copies every line of a clean-script log file into the logger and then deletes the file.
 *
 * @param logFilePath path of the file to drain
 * @param streamName "out" or "err"; only used to build the error messages
 * @param asError when {@code true} lines are logged at error level, otherwise at debug level
 */
private void moveCleanLogToLogger(String logFilePath, String streamName, boolean asError) {
    try (FileReader reader = new FileReader(logFilePath);
        BufferedReader br = new BufferedReader(reader)) {
        String line;
        while ((line = br.readLine()) != null) {
            if (asError) {
                logger.error(line);
            } else {
                logger.debug(line);
            }
        }
    } catch (Exception e) {
        logger.error("Error moving std " + streamName + " file", e);
    }
    // Delete file
    if (!new File(logFilePath).delete()) {
        logger.error("Error deleting " + streamName + " file " + logFilePath);
    }
}
Also used : URI(org.gridlab.gat.URI) SoftwareDescription(org.gridlab.gat.resources.SoftwareDescription) BufferedReader(java.io.BufferedReader) FileReader(java.io.FileReader) Job(org.gridlab.gat.resources.Job) File(java.io.File)

Example 3 with SoftwareDescription

use of org.gridlab.gat.resources.SoftwareDescription in project compss by bsc-wdc.

the class LsfResourceBrokerAdaptor method submitJob.

/*
 * Submits the described job by wrapping its command in a blaunch invocation that is started as a
 * local process bundle inside a sandbox rooted at the user's home directory.
 *
 * Rejects descriptions that are not JobDescriptions, that carry no software description, that ask
 * for fewer than one process, or whose resource count is not exactly one.
 *
 * Returns the LsfJob itself, or the WrapperJobCpi around it when the description is a
 * WrapperJobDescription.
 *
 * @see org.gridlab.gat.resources.ResourceBroker#submitJob(org.gridlab.gat.resources.JobDescription)
 */
public Job submitJob(AbstractJobDescription abstractDescription, MetricListener listener, String metricDefinitionName) throws GATInvocationException {
    if (!(abstractDescription instanceof JobDescription)) {
        throw new GATInvocationException("can only handle JobDescriptions: " + abstractDescription.getClass());
    }
    JobDescription description = (JobDescription) abstractDescription;
    SoftwareDescription sd = description.getSoftwareDescription();
    if (sd == null) {
        throw new GATInvocationException("The job description does not contain a software description");
    }
    if (description.getProcessCount() < 1) {
        throw new GATInvocationException("Adaptor cannot handle: process count < 1: " + description.getProcessCount());
    }
    // NOTE(review): the check rejects any resource count other than 1 (including 0), although the
    // message only mentions "> 1"
    if (description.getResourceCount() != 1) {
        throw new GATInvocationException("Adaptor cannot handle: resource count > 1: " + description.getResourceCount());
    }
    String home = System.getProperty("user.home");
    if (home == null) {
        throw new GATInvocationException("lsf broker could not get user home dir");
    }
    // NOTE(review): the meaning of the four boolean flags is defined by the Sandbox constructor,
    // which is not visible here -- confirm before changing them
    Sandbox sandbox = new Sandbox(gatContext, description, "localhost", home, true, true, false, false);
    LsfJob lsfJob = new LsfJob(gatContext, description, sandbox);
    Job job = null;
    if (description instanceof WrapperJobDescription) {
        // The wrapper becomes both the job handed back to the caller and the listener registered below
        WrapperJobCpi tmp = new WrapperJobCpi(gatContext, lsfJob, listener, metricDefinitionName);
        listener = tmp;
        job = tmp;
    } else {
        job = lsfJob;
    }
    if (listener != null && metricDefinitionName != null) {
        Metric metric = lsfJob.getMetricDefinitionByName(metricDefinitionName).createMetric(null);
        lsfJob.addMetricListener(listener, metric);
    }
    // Enter PRE_STAGING before the sandbox is prestaged
    lsfJob.setState(Job.JobState.PRE_STAGING);
    lsfJob.waitForTrigger(Job.JobState.PRE_STAGING);
    sandbox.prestage();
    // Prefer the executable resolved by the sandbox; otherwise fall back to the one in the description
    String exe;
    if (sandbox.getResolvedExecutable() != null) {
        exe = sandbox.getResolvedExecutable().getPath();
    // The original code tried to set the executable bit here, since it might be lost.
    /* CDIAZ: That step is disabled because the command "exe" can also be in a remote host.
             * The command must have the right permissions in the remote host.
            try {
                new CommandRunner("chmod", "+x", exe);
            } catch (Throwable t) {
                // ignore
            }
            */
    } else {
        exe = getExecutable(description);
    }
    String[] args = getArgumentsArray(description);
    // Directory where the lsf command will be executed
    java.io.File f = new java.io.File(sandbox.getSandboxPath());
    if (!f.exists()) {
        throw new GATInvocationException("Unable to find directory " + f.getAbsolutePath());
    }
    // Check and set the environment for a blaunch command
    Map<String, Object> env = sd.getEnvironment();
    this.prepareBLaunchEnv(env);
    // Encapsulate the original command into a blaunch command
    String host = brokerURI.getHost();
    String blExe = this.getBlaunchCommand();
    String[] blArgs = this.getBlaunchArgs(host, exe, args);
    ProcessBundle bundle = new ProcessBundle(description.getProcessCount(), blExe, blArgs, f, env);
    lsfJob.setSubmissionTime();
    lsfJob.setState(Job.JobState.SCHEDULED);
    try {
        // The job is marked RUNNING before the processes are actually started
        lsfJob.setState(Job.JobState.RUNNING);
        lsfJob.waitForTrigger(Job.JobState.RUNNING);
        lsfJob.setStartTime();
        bundle.startBundle();
        lsfJob.setProcess(bundle);
    } catch (IOException e) {
        // An I/O failure while starting the bundle is reported as a missing command
        throw new CommandNotFoundException("LsfResourceBrokerAdaptor", e);
    }
    if (!sd.streamingStderrEnabled()) {
        // stderr is not streamed to the caller: drain it here
        try {
            if (sd.getStderr() != null) {
                OutputStream err = GAT.createFileOutputStream(gatContext, sd.getStderr());
                // to file
                StreamForwarder forwarder = new StreamForwarder(bundle.getStderr(), err, sd.getExecutable() + " [stderr]");
                lsfJob.setErrorStream(forwarder);
                if (logger.isDebugEnabled()) {
                    logger.debug("Created stderr forwarder to file " + sd.getStderr());
                }
            } else {
                // or throw it away
                new StreamForwarder(bundle.getStderr(), null, sd.getExecutable() + " [stderr]");
            }
        } catch (GATObjectCreationException e) {
            throw new GATInvocationException("Unable to create file output stream for stderr!", e);
        }
    }
    if (!sd.streamingStdoutEnabled()) {
        // read away the stdout
        try {
            if (sd.getStdout() != null) {
                // to file
                OutputStream out = GAT.createFileOutputStream(gatContext, sd.getStdout());
                StreamForwarder forwarder = new StreamForwarder(bundle.getStdout(), out, sd.getExecutable() + " [stdout]");
                lsfJob.setOutputStream(forwarder);
                if (logger.isDebugEnabled()) {
                    logger.debug("Created stdout forwarder to file " + sd.getStdout());
                }
            } else {
                // or throw it away
                new StreamForwarder(bundle.getStdout(), null, sd.getExecutable() + " [stdout]");
            }
        } catch (GATObjectCreationException e) {
            throw new GATInvocationException("Unable to create file output stream for stdout!", e);
        }
    }
    if (!sd.streamingStdinEnabled() && sd.getStdin() != null) {
        // forward the stdin from file
        try {
            InputStream in = GAT.createFileInputStream(gatContext, sd.getStdin());
            bundle.setStdin(sd.getExecutable(), in);
        } catch (GATObjectCreationException e) {
            throw new GATInvocationException("Unable to create file input stream for stdin!", e);
        }
    }
    // Start state monitoring only after all streams are wired up
    lsfJob.monitorState();
    return job;
}
Also used : ProcessBundle(org.gridlab.gat.engine.util.ProcessBundle) GATObjectCreationException(org.gridlab.gat.GATObjectCreationException) InputStream(java.io.InputStream) OutputStream(java.io.OutputStream) StreamForwarder(org.gridlab.gat.engine.util.StreamForwarder) IOException(java.io.IOException) SoftwareDescription(org.gridlab.gat.resources.SoftwareDescription) Sandbox(org.gridlab.gat.resources.cpi.Sandbox) WrapperJobDescription(org.gridlab.gat.resources.WrapperJobDescription) JobDescription(org.gridlab.gat.resources.JobDescription) AbstractJobDescription(org.gridlab.gat.resources.AbstractJobDescription) GATInvocationException(org.gridlab.gat.GATInvocationException) WrapperJobCpi(org.gridlab.gat.resources.cpi.WrapperJobCpi) Metric(org.gridlab.gat.monitoring.Metric) Job(org.gridlab.gat.resources.Job) CommandNotFoundException(org.gridlab.gat.CommandNotFoundException)

Example 4 with SoftwareDescription

use of org.gridlab.gat.resources.SoftwareDescription in project compss by bsc-wdc.

the class SshLSFJob method getJobState.

/**
 * Refreshes this job's state by running {@code bjobs} for the given job ID through the job helper
 * and mapping the reported status to a GAT {@code JobState}.
 *
 * Only one caller at a time performs the query; concurrent callers wait on this object until the
 * running query finishes. Nothing is done when the job is already in {@code POST_STAGING},
 * {@code STOPPED} or {@code SUBMISSION_ERROR}. A mapped state of {@code STOPPED} is recorded as
 * {@code POST_STAGING}.
 *
 * @param jobID LSF job identifier passed to {@code bjobs}
 * @throws GATInvocationException if the status job cannot be set up, does not finish successfully,
 *         or its output cannot be read
 */
protected void getJobState(String jobID) throws GATInvocationException {
    synchronized (this) {
        while (jobStateBusy) {
            try {
                wait();
            } catch (InterruptedException e) {
                // ignored: keep waiting until the running query releases the flag
            }
        }
        jobStateBusy = true;
    }
    JobState resultState;
    try {
        if (state == JobState.POST_STAGING || state == JobState.STOPPED || state == JobState.SUBMISSION_ERROR) {
            return;
        }
        logger.debug("Getting task status in setState()");
        // getting the status via ssh ... bjobs
        java.io.File squeueResultFile = null;
        try {
            // Create the status job
            SoftwareDescription sd = new SoftwareDescription();
            // Use /bin/sh, so that the pipe to awk is interpreted by a shell.
            sd.setExecutable("/bin/sh");
            sd.setArguments("-c", "bjobs -noheader " + jobID + " | awk {' print $3 '}");
            sd.addAttribute(SoftwareDescription.SANDBOX_USEROOT, "true");
            squeueResultFile = java.io.File.createTempFile("GAT", "tmp");
            try {
                sd.setStdout(GAT.createFile(subContext, new URI("file:///" + squeueResultFile.getAbsolutePath().replace(File.separatorChar, '/'))));
            } catch (Throwable e1) {
                throw new GATInvocationException("Could not create GAT object for temporary " + squeueResultFile.getAbsolutePath(), e1);
            }
            JobDescription jd = new JobDescription(sd);
            Job job = jobHelper.submitJob(jd, this, "job.status");
            synchronized (job) {
                while (job.getState() != Job.JobState.STOPPED && job.getState() != Job.JobState.SUBMISSION_ERROR) {
                    try {
                        job.wait();
                    } catch (InterruptedException e) {
                        // ignore
                    }
                }
            }
            if (job.getState() != Job.JobState.STOPPED || job.getExitStatus() != 0) {
                throw new GATInvocationException("Could not submit squeue job " + sd.toString());
            }
            // submit success: read the single status line, closing the reader in every case
            String status;
            try (BufferedReader in = new BufferedReader(new FileReader(squeueResultFile.getAbsolutePath()))) {
                status = in.readLine();
            }
            // An empty result file yields null; it is treated as an empty status string
            if (status == null) {
                status = "";
            }
            if (logger.isDebugEnabled()) {
                logger.debug("squeue line: " + status);
            }
            resultState = mapLSFStatetoGAT(status);
        } catch (IOException e) {
            logger.debug("retrieving job status sshslurmjob failed", e);
            throw new GATInvocationException("Unable to retrieve the Job Status", e);
        } finally {
            // createTempFile may have failed, leaving no file to remove; guarding here keeps the
            // original IOException from being replaced by a NullPointerException
            if (squeueResultFile != null) {
                squeueResultFile.delete();
            }
        }
    } finally {
        synchronized (this) {
            jobStateBusy = false;
            notifyAll();
        }
    }
    if (resultState != JobState.STOPPED) {
        setState(resultState);
    } else {
        setState(JobState.POST_STAGING);
    }
}
Also used : IOException(java.io.IOException) URI(org.gridlab.gat.URI) File(java.io.File) SoftwareDescription(org.gridlab.gat.resources.SoftwareDescription) JobDescription(org.gridlab.gat.resources.JobDescription) GATInvocationException(org.gridlab.gat.GATInvocationException) BufferedReader(java.io.BufferedReader) FileReader(java.io.FileReader) SshLSFJob(org.gridlab.gat.resources.cpi.sshlsf.SshLSFJob) Job(org.gridlab.gat.resources.Job)

Example 5 with SoftwareDescription

use of org.gridlab.gat.resources.SoftwareDescription in project compss by bsc-wdc.

the class SshLsfResourceBrokerAdaptor method createBsubScript.

/**
 * Writes the bsub submission script for the given job description into a new file named
 * {@code "lsf" + Math.random()} (relative to the current working directory) and returns that file.
 *
 * When the {@code SSHLSF_SCRIPT} preference names a user script, its lines are copied verbatim.
 * Otherwise a script is generated with the queue, core count, run limit, native flags, job name and
 * stdout/stderr options, followed by the execution section. In both cases a final line stores the
 * exit status in {@code returnValueFile}.
 *
 * @param description job description providing the software and (optional) resource descriptions
 * @param returnValueFile path the script writes {@code retvalue = $?} to
 * @param nproc not used by this method
 * @return the script file
 * @throws GATInvocationException if anything fails while building the script
 */
private java.io.File createBsubScript(JobDescription description, String returnValueFile, int nproc) throws GATInvocationException {
    java.io.File temp;
    LSFScriptWriter job = null;
    HashMap<String, Object> rd_HashMap = null;
    SoftwareDescription sd = description.getSoftwareDescription();
    ResourceDescription rd = description.getResourceDescription();
    // Corrected initialization of rd_HashMap: rd may be null ... --Ceriel
    if (rd != null) {
        rd_HashMap = (HashMap<String, Object>) rd.getDescription();
    }
    if (rd_HashMap == null) {
        rd_HashMap = new HashMap<String, Object>();
    }
    temp = new java.io.File("lsf" + Math.random());
    try {
        job = new LSFScriptWriter(new BufferedWriter(new FileWriter(temp)));
        String userScript = (String) gatContext.getPreferences().get(SSHLSF_SCRIPT);
        if (userScript != null) {
            // a specified job script overrides everything, except for
            // pre-staging, post-staging,
            // and exit status.
            // try-with-resources: the reader used to be left open
            try (BufferedReader f = new BufferedReader(new FileReader(userScript))) {
                String s;
                while ((s = f.readLine()) != null) {
                    job.print(s + "\n");
                }
            }
        } else {
            job.print("#!/bin/sh\n");
            job.print("# bsub script automatically generated by GAT SshLsf adaptor\n");
            // Resources: queue, walltime, memory size, et cetera.
            String queue = (String) rd_HashMap.get("machine.queue");
            if (queue == null) {
                queue = sd.getStringAttribute(SoftwareDescription.JOB_QUEUE, null);
            }
            if (queue != null) {
                job.addOption("q", queue);
            }
            long walltime = sd.getLongAttribute(SoftwareDescription.WALLTIME_MAX, -1L);
            Integer cpus = (Integer) rd_HashMap.get(HardwareResourceDescription.CPU_COUNT);
            if (cpus == null) {
                cpus = sd.getIntAttribute("coreCount", 1);
            }
            job.addOption("n", cpus);
            // In a single node
            job.addOption("R", "\"span[ptile=" + cpus + "]\"");
            if (walltime > 0) {
                // Pass the whole run limit. The previous "walltime % 60" kept only the remainder and
                // dropped full hours (120 became 0, 90 became 30).
                // NOTE(review): assumes WALLTIME_MAX is expressed in minutes, matching bsub -W -- confirm
                int minutes = (int) Math.min(walltime, (long) Integer.MAX_VALUE);
                job.addOption("W", minutes);
            } else {
                job.addOption("W", 60);
            }
            // Native flags: resource description first, then software description, then preferences
            String nativeFlags = null;
            Object o = rd == null ? null : rd.getResourceAttribute(SSHLSF_NATIVE_FLAGS);
            if (o instanceof String) {
                nativeFlags = (String) o;
            } else {
                String s = sd.getStringAttribute(SSHLSF_NATIVE_FLAGS, null);
                if (s != null) {
                    nativeFlags = s;
                } else {
                    o = gatContext.getPreferences().get(SSHLSF_NATIVE_FLAGS);
                    if (o instanceof String) {
                        nativeFlags = (String) o;
                    }
                }
            }
            if (nativeFlags != null) {
                // Individual flags are separated by "##"
                for (String flag : nativeFlags.split("##")) {
                    job.addString(flag);
                }
            }
            String path = sd.getStringAttribute(SoftwareDescription.SANDBOX_ROOT, "");
            if (!path.isEmpty() && !path.endsWith(File.separator)) {
                path = path + File.separator;
            }
            // Set working dir.
            // job.addOption("cwd", path);
            // Name for the job.
            String jobname = (String) rd_HashMap.get("Jobname");
            if (jobname == null) {
                jobname = brokerURI.getUserInfo();
                if (jobname == null || "".equals(jobname)) {
                    jobname = "compss_remotejob_" + System.getProperty("user.name");
                }
            }
            // jobname is never null here: every fallback above ends in a non-null string
            job.addOption("J", jobname);
            if (sd.getStdout() != null) {
                job.addOption("oo", path + sd.getStdout().getName());
            }
            if (sd.getStderr() != null) {
                job.addOption("eo", path + sd.getStderr().getName());
            }
            addScriptExecution(job, sd, rd);
        }
        job.print("echo retvalue = $? > " + returnValueFile + "\n");
    } catch (Throwable e) {
        throw new GATInvocationException("Cannot create temporary bsub file " + temp.getAbsolutePath(), e);
    } finally {
        if (job != null) {
            job.close();
        }
    }
    return temp;
}
Also used : FileWriter(java.io.FileWriter) File(java.io.File) SoftwareDescription(org.gridlab.gat.resources.SoftwareDescription) BufferedWriter(java.io.BufferedWriter) GATInvocationException(org.gridlab.gat.GATInvocationException) HardwareResourceDescription(org.gridlab.gat.resources.HardwareResourceDescription) ResourceDescription(org.gridlab.gat.resources.ResourceDescription) BufferedReader(java.io.BufferedReader) FileReader(java.io.FileReader)

Aggregations

SoftwareDescription (org.gridlab.gat.resources.SoftwareDescription)34 JobDescription (org.gridlab.gat.resources.JobDescription)31 URI (org.gridlab.gat.URI)27 GATInvocationException (org.gridlab.gat.GATInvocationException)24 Job (org.gridlab.gat.resources.Job)24 ResourceBroker (org.gridlab.gat.resources.ResourceBroker)24 GATObjectCreationException (org.gridlab.gat.GATObjectCreationException)17 URISyntaxException (java.net.URISyntaxException)16 IOException (java.io.IOException)11 BufferedReader (java.io.BufferedReader)8 Preferences (org.gridlab.gat.Preferences)8 File (org.gridlab.gat.io.File)8 File (java.io.File)6 WrapperJobDescription (org.gridlab.gat.resources.WrapperJobDescription)5 FileReader (java.io.FileReader)4 InputStreamReader (java.io.InputStreamReader)4 AbstractJobDescription (org.gridlab.gat.resources.AbstractJobDescription)4 InputStream (java.io.InputStream)3 HashMap (java.util.HashMap)3 GATContext (org.gridlab.gat.GATContext)3