Search in sources :

Example 11 with CommandOutput

use of cbit.vcell.message.server.cmd.CommandService.CommandOutput in project vcell by virtualcell.

the class SlurmProxy method getRunningJobs.

/**
 * Queries the batch scheduler for the jobs currently listed in the configured
 * partition and returns their status keyed by job.
 *
 * @return the map produced by {@code extractJobIdsFromSqueue} from the squeue standard output
 * @throws ExecutableException declared for failures of the underlying command execution
 * @throws IOException declared for I/O failures of the underlying command execution
 */
@Override
public Map<HtcJobInfo, HtcJobStatus> getRunningJobs() throws ExecutableException, IOException {
    // executable name is configurable, defaulting to "squeue" on the PATH
    final String squeueExecutable = PropertyLoader.getProperty(PropertyLoader.slurm_cmd_squeue, "squeue");
    final String slurmPartition = PropertyLoader.getRequiredProperty(PropertyLoader.slurm_partition);
    // effective command: squeue -p <partition> -O jobid:25,name:25,state:13,batchhost
    final String[] squeueCommand = new String[] {
        squeueExecutable,
        "-p", slurmPartition,
        "-O", "jobid:25,name:25,state:13,batchhost"
    };
    final String squeueStdout = commandService.command(squeueCommand).getStandardOutput();
    return extractJobIdsFromSqueue(squeueStdout);
}
Also used : HtcJobStatus(cbit.vcell.message.server.htc.HtcJobStatus) CommandOutput(cbit.vcell.message.server.cmd.CommandService.CommandOutput)

Example 12 with CommandOutput

use of cbit.vcell.message.server.cmd.CommandService.CommandOutput in project vcell by virtualcell.

the class SlurmProxy method getPartitionNodeListCSV.

/**
 * Asks sinfo for the node names of the configured partition and returns them
 * as a single line of text.
 *
 * Shell equivalent: nodelist=$(sinfo -N -h -p <partition> --Format='nodelist' | xargs | tr ' ' ',')
 *
 * NOTE(review): the pipe symbols and the xargs/tr stages are handed over as plain
 * array elements; this presumably relies on the CommandService joining the
 * elements and running them through a shell -- confirm for every implementation.
 *
 * @return the trimmed standard output of the command with all newline characters removed
 * @throws HtcException declared by the signature; not thrown directly in this method
 * @throws ExecutableException declared for failures of the underlying command execution
 * @throws IOException declared for I/O failures of the underlying command execution
 */
private String getPartitionNodeListCSV() throws HtcException, ExecutableException, IOException {
    // executable name is configurable, defaulting to "sinfo" on the PATH
    final String sinfoExecutable = PropertyLoader.getProperty(PropertyLoader.slurm_cmd_sinfo, "sinfo");
    final String slurmPartition = PropertyLoader.getRequiredProperty(PropertyLoader.slurm_partition);
    final String[] sinfoPipeline = new String[] {
        sinfoExecutable, "-N", "-h", "-p", slurmPartition, "--Format='nodelist'",
        "|", "xargs",
        "|", "tr", "' '", "','"
    };
    final CommandOutput sinfoResult = commandService.command(sinfoPipeline);
    // strip surrounding whitespace first, then drop any newlines left inside the text
    return sinfoResult.getStandardOutput().trim().replace("\n", "");
}
Also used : CommandOutput(cbit.vcell.message.server.cmd.CommandService.CommandOutput)

Example 13 with CommandOutput

use of cbit.vcell.message.server.cmd.CommandService.CommandOutput in project vcell by virtualcell.

the class SlurmProxy method creatCommandService.

/**
 * Builds the {@link HtcProxy} used to talk to the batch scheduler.
 *
 * When {@code sshHostUserKeyfile} holds exactly three entries (host list, user,
 * key file path) commands are executed over ssh; the connection is verified by
 * running a trivial command, first with the default ssh setup and, if that fails,
 * a second time with {@code /root} passed as an extra directory argument.
 * In every other case (null or a different length) commands run locally.
 *
 * @param sshHostUserKeyfile {@code [hosts, user, keyfile]} where hosts is a list
 *        separated by commas and/or spaces, or {@code null} for local execution
 * @return a SLURM proxy bound to the selected command service
 * @throws IOException declared by the signature
 * @throws RuntimeException if neither ssh connection attempt succeeds; its cause is
 *         the failure of the second attempt and the first failure is attached to
 *         that cause as a suppressed exception
 */
public static HtcProxy creatCommandService(java.lang.String[] sshHostUserKeyfile) throws IOException {
    CommandService commandService = null;
    if (sshHostUserKeyfile != null && sshHostUserKeyfile.length == 3) {
        // first entry may name several dispatch hosts, separated by commas and/or spaces
        ArrayList<String> htcDispatchHostNames = new ArrayList<String>();
        StringTokenizer st = new StringTokenizer(sshHostUserKeyfile[0], ", ");
        while (st.hasMoreTokens()) {
            htcDispatchHostNames.add(st.nextToken());
        }
        String sshUser = sshHostUserKeyfile[1];
        File sshKeyFile = new File(sshHostUserKeyfile[2]);
        try {
            commandService = new CommandServiceSshNative(htcDispatchHostNames.toArray(new String[0]), sshUser, sshKeyFile);
            // connection smoke test: any failure here triggers the fallback below
            commandService.command(new String[] { "/usr/bin/env bash -c ls | head -5" });
        } catch (Exception e) {
            e.printStackTrace();
            try {
                // NOTE(review): the extra File("/root") argument is presumably the home directory
                // into which the key file gets installed -- confirm against CommandServiceSshNative
                commandService = new CommandServiceSshNative(htcDispatchHostNames.toArray(new String[0]), sshUser, sshKeyFile, new File("/root"));
                commandService.command(new String[] { "/usr/bin/env bash -c ls | head -5" });
            } catch (Exception e2) {
                // report the failure of the fallback attempt (the original code re-reported the
                // first failure here and lost this one); keep the first failure as suppressed
                e2.addSuppressed(e);
                e2.printStackTrace();
                throw new RuntimeException("failed to establish an ssh command connection to " + sshHostUserKeyfile[0] + " as user '" + sshUser + "' using key '" + sshKeyFile + "'", e2);
            }
        }
        // can't make user directories, they are remote.
        AbstractSolver.bMakeUserDirs = false;
    } else {
        commandService = new CommandServiceLocal();
    }
    BatchSystemType batchSystemType = BatchSystemType.SLURM;
    HtcProxy htcProxy = null;
    switch(batchSystemType) {
        case SLURM:
            {
                htcProxy = new SlurmProxy(commandService, PropertyLoader.getRequiredProperty(PropertyLoader.htcUser));
                break;
            }
        default:
            {
                throw new RuntimeException("unrecognized batch scheduling option :" + batchSystemType);
            }
    }
    return htcProxy;
}
Also used : HtcProxy(cbit.vcell.message.server.htc.HtcProxy) CommandService(cbit.vcell.message.server.cmd.CommandService) CommandOutput(cbit.vcell.message.server.cmd.CommandService.CommandOutput) ArrayList(java.util.ArrayList) ExecutableException(org.vcell.util.exe.ExecutableException) HtcException(cbit.vcell.message.server.htc.HtcException) IOException(java.io.IOException) HtcJobNotFoundException(cbit.vcell.message.server.htc.HtcJobNotFoundException) StringTokenizer(java.util.StringTokenizer) CommandServiceLocal(cbit.vcell.message.server.cmd.CommandServiceLocal) BatchSystemType(cbit.vcell.server.HtcJobID.BatchSystemType) File(java.io.File) CommandServiceSshNative(cbit.vcell.message.server.cmd.CommandServiceSshNative)

Example 14 with CommandOutput

use of cbit.vcell.message.server.cmd.CommandService.CommandOutput in project vcell by virtualcell.

the class HtcSimulationWorker method startJobMonitor.

/**
 * Reloads the set of monitored jobs, rewrites the monitor-jobs file so it lists
 * exactly those job ids (one per line), and starts a daemon thread that polls
 * sacct once a minute to detect monitored jobs that have failed or completed.
 *
 * A failure to rewrite the file is printed and does not prevent the monitor
 * thread from starting. The monitor thread ends when it is interrupted.
 */
public void startJobMonitor() {
    monitorTheseJobs = getMonitorJobs();
    try {
        System.out.println("----------Resetting slurm monitorJobsFile");
        // clean jobs file: one monitored job id per line
        StringBuilder jobIdLines = new StringBuilder();
        for (String jobid : monitorTheseJobs.keySet()) {
            jobIdLines.append(jobid).append("\n");
        }
        Files.write(monitorJobsFile.toPath(), jobIdLines.toString().getBytes(), StandardOpenOption.CREATE, StandardOpenOption.WRITE, StandardOpenOption.TRUNCATE_EXISTING);
    } catch (IOException e1) {
        e1.printStackTrace();
    }
    monitorJobsThread = new Thread(new Runnable() {

        @Override
        public void run() {
            final int sleeptime = 60000;
            while (true) {
                try {
                    Thread.sleep(sleeptime);
                } catch (InterruptedException e) {
                    // an interrupt is a request to stop: restore the flag and end the monitor
                    // instead of swallowing it and looping forever
                    Thread.currentThread().interrupt();
                    return;
                }
                try {
                    pollMonitoredJobs();
                } catch (Exception e) {
                    // best effort: a failed poll must not kill the monitor thread
                    e.printStackTrace();
                }
            }
        }
    });
    monitorJobsThread.setDaemon(true);
    monitorJobsThread.start();
}

/**
 * Runs one sacct query for all currently monitored job ids and processes the
 * result one output line (one job record) at a time.
 *
 * Parsing per line keeps records aligned even when the state column holds
 * several words (for example "CANCELLED by 1234"); only the first word of the
 * state is evaluated. Lines with fewer than three columns are ignored, and a
 * failure while handling one record does not stop the remaining records.
 */
private void pollMonitoredJobs() {
    String slurmJobIds = String.join(",", monitorTheseJobs.keySet());
    if (slurmJobIds.isEmpty()) {
        return;
    }
    String sacctOutput;
    try {
        String[] tryStr = new String[] { "sacct", "--format=jobid%25,jobname%40,state%30 -n -j " + slurmJobIds + " | grep -v \".batch\"" + " | grep -v \".extern\"" };
        CommandOutput commandOutput = htcProxy.getCommandService().command(tryStr);
        sacctOutput = commandOutput.getStandardOutput();
    } catch (Exception e) {
        e.printStackTrace();
        return;
    }
    if (sacctOutput == null) {
        return;
    }
    for (String line : sacctOutput.split("[\\r\\n]+")) {
        StringTokenizer st = new StringTokenizer(line, " \t");
        if (st.countTokens() < 3) {
            // blank or incomplete record
            continue;
        }
        String slurmJobID = st.nextToken();
        String jobName = st.nextToken();
        String jobState = st.nextToken();
        try {
            reportIfJobFinished(slurmJobID, jobName, jobState);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

/**
 * Sends the matching worker event for a job whose sacct state is terminal and
 * removes the job from the monitored set; does nothing for any other state.
 *
 * States treated as failure: FAILED, CANCELLED (prefix match), BOOT_FAIL,
 * DEADLINE, NODE_FAIL, OUT_OF_MEMORY, PREEMPTED, TIMEOUT.
 * State treated as success: COMPLETED.
 * Every other state (e.g. PENDING, RUNNING, REQUEUED, RESIZING, REVOKED,
 * SUSPENDED) leaves the job monitored.
 *
 * @param slurmJobID job id column of the sacct record
 * @param jobName job name column of the sacct record
 * @param jobState first word of the state column of the sacct record
 * @throws Exception whatever the message sending or job removal throws
 */
private void reportIfJobFinished(String slurmJobID, String jobName, String jobState) throws Exception {
    boolean failed = jobState.equalsIgnoreCase("FAILED") || jobState.startsWith("CANCELLED") || jobState.equalsIgnoreCase("BOOT_FAIL") || jobState.equalsIgnoreCase("DEADLINE") || jobState.equalsIgnoreCase("NODE_FAIL") || jobState.equalsIgnoreCase("OUT_OF_MEMORY") || jobState.equalsIgnoreCase("PREEMPTED") || jobState.equalsIgnoreCase("TIMEOUT");
    boolean completed = !failed && jobState.equalsIgnoreCase("COMPLETED");
    if (!failed && !completed) {
        return;
    }
    MonitorJobInfo monitorJobInfo = monitorTheseJobs.get(slurmJobID);
    if (monitorJobInfo == null) {
        // sacct reported an id that is not (or no longer) monitored, e.g. a job step row;
        // there is nothing to report for it
        return;
    }
    if (failed) {
        WorkerEventMessage.sendWorkerExitError(messageProducer_sim, HtcSimulationWorker.class.getName(), ManageUtils.getHostName(), monitorJobInfo.vcsimID, monitorJobInfo.jobIndex, monitorJobInfo.taskID, SimulationMessage.jobFailed("Fail found by monitor, slrmJobID=" + slurmJobID + " jobName=" + jobName + " jobState=" + jobState));
    } else {
        WorkerEventMessage.sendAlternateCompleted(messageProducer_sim, HtcSimulationWorker.class.getName(), monitorJobInfo.vcsimID, ManageUtils.getHostName(), monitorJobInfo.jobIndex, monitorJobInfo.taskID);
    }
    removeMonitorJob(Long.parseLong(slurmJobID));
}
Also used : CommandOutput(cbit.vcell.message.server.cmd.CommandService.CommandOutput) IOException(java.io.IOException) ExecutableException(org.vcell.util.exe.ExecutableException) XmlParseException(cbit.vcell.xml.XmlParseException) IOException(java.io.IOException) DataAccessException(org.vcell.util.DataAccessException) UnknownHostException(java.net.UnknownHostException) VCMessagingException(cbit.vcell.message.VCMessagingException) FileNotFoundException(java.io.FileNotFoundException) RollbackException(cbit.vcell.message.RollbackException) SolverException(cbit.vcell.solver.SolverException) StringTokenizer(java.util.StringTokenizer)

Example 15 with CommandOutput

use of cbit.vcell.message.server.cmd.CommandService.CommandOutput in project vcell by virtualcell.

the class SlurmProxy method getJobStatus.

/**
 * Looks up the scheduler status of the given jobs with a single sacct call.
 *
 * @param requestedHtcJobInfos the jobs to query; must not be empty
 * @return the statuses parsed from the sacct output, restricted to the requested jobs
 * @throws RuntimeException if {@code requestedHtcJobInfos} is empty
 * @throws ExecutableException declared for failures of the underlying command execution
 * @throws IOException declared for I/O failures of the underlying command execution
 */
public Map<HtcJobInfo, HtcJobStatus> getJobStatus(List<HtcJobInfo> requestedHtcJobInfos) throws ExecutableException, IOException {
    if (requestedHtcJobInfos.size() == 0) {
        throw new RuntimeException("htcJobList is empty");
    }
    // executable name is configurable, defaulting to "sacct" on the PATH
    final String JOB_CMD_SACCT = PropertyLoader.getProperty(PropertyLoader.slurm_cmd_sacct, "sacct");
    ArrayList<String> jobNumbers = new ArrayList<String>();
    for (HtcJobInfo jobInfo : requestedHtcJobInfos) {
        jobNumbers.add(Long.toString(jobInfo.getHtcJobID().getJobNumber()));
    }
    String jobList = String.join(",", jobNumbers);
    // effective command: sacct -P -j <id,id,...> -o jobid%25,jobname%25,state%13
    String[] cmds = { JOB_CMD_SACCT, "-P", "-j", jobList, "-o", "jobid%25,jobname%25,state%13" };
    CommandOutput commandOutput = commandService.command(cmds);
    String output = commandOutput.getStandardOutput();
    Map<HtcJobInfo, HtcJobStatus> statusMap = extractJobIds(output);
    // Drop parsed entries that were not asked for. This goes through the key-set view
    // because calling statusMap.remove(...) inside a for-each over keySet() throws
    // ConcurrentModificationException on the standard map implementations.
    statusMap.keySet().removeIf(parsedHtcJobInfo -> !requestedHtcJobInfos.contains(parsedHtcJobInfo));
    return statusMap;
}
Also used : HtcJobStatus(cbit.vcell.message.server.htc.HtcJobStatus) CommandOutput(cbit.vcell.message.server.cmd.CommandService.CommandOutput) ArrayList(java.util.ArrayList)

Aggregations

CommandOutput (cbit.vcell.message.server.cmd.CommandService.CommandOutput)16 ExecutableException (org.vcell.util.exe.ExecutableException)11 IOException (java.io.IOException)7 File (java.io.File)6 HtcJobNotFoundException (cbit.vcell.message.server.htc.HtcJobNotFoundException)4 Test (org.junit.Test)4 HtcJobID (cbit.vcell.server.HtcJobID)3 ArrayList (java.util.ArrayList)3 HtcJobStatus (cbit.vcell.message.server.htc.HtcJobStatus)2 StringTokenizer (java.util.StringTokenizer)2 RollbackException (cbit.vcell.message.RollbackException)1 VCMessagingException (cbit.vcell.message.VCMessagingException)1 CommandService (cbit.vcell.message.server.cmd.CommandService)1 CommandServiceLocal (cbit.vcell.message.server.cmd.CommandServiceLocal)1 CommandServiceSshNative (cbit.vcell.message.server.cmd.CommandServiceSshNative)1 HtcException (cbit.vcell.message.server.htc.HtcException)1 HtcProxy (cbit.vcell.message.server.htc.HtcProxy)1 BatchSystemType (cbit.vcell.server.HtcJobID.BatchSystemType)1 SolverException (cbit.vcell.solver.SolverException)1 XmlParseException (cbit.vcell.xml.XmlParseException)1