use of cbit.vcell.message.server.cmd.CommandService.CommandOutput in project vcell by virtualcell.
the class SlurmProxy method getRunningJobs.
/**
 * Queries squeue for the configured Slurm partition and returns the parsed result.
 *
 * The executable name comes from the {@code slurm_cmd_squeue} property (default "squeue");
 * the partition comes from the required {@code slurm_partition} property.
 *
 * @return the job-info to job-status map produced by {@code extractJobIdsFromSqueue}
 *         from the command's standard output
 * @throws ExecutableException declared by this method; presumably raised when the command fails
 * @throws IOException declared by this method; presumably raised on I/O failure while running the command
 */
@Override
public Map<HtcJobInfo, HtcJobStatus> getRunningJobs() throws ExecutableException, IOException {
    // shell equivalent: squeue -p <partition> -O jobid:25,name:25,state:13,batchhost
    final String squeueExecutable = PropertyLoader.getProperty(PropertyLoader.slurm_cmd_squeue, "squeue");
    final String partitionName = PropertyLoader.getRequiredProperty(PropertyLoader.slurm_partition);
    final String[] squeueArgs = new String[] { squeueExecutable, "-p", partitionName, "-O", "jobid:25,name:25,state:13,batchhost" };
    final CommandOutput squeueResult = commandService.command(squeueArgs);
    return extractJobIdsFromSqueue(squeueResult.getStandardOutput());
}
use of cbit.vcell.message.server.cmd.CommandService.CommandOutput in project vcell by virtualcell.
the class SlurmProxy method getPartitionNodeListCSV.
/**
 * Runs sinfo for the configured Slurm partition and returns the command's standard output,
 * trimmed and with every newline character removed.
 *
 * The executable name comes from the {@code slurm_cmd_sinfo} property (default "sinfo");
 * the partition comes from the required {@code slurm_partition} property.
 *
 * @return the cleaned-up standard output of the sinfo pipeline
 */
private String getPartitionNodeListCSV() throws HtcException, ExecutableException, IOException {
    final String sinfoExecutable = PropertyLoader.getProperty(PropertyLoader.slurm_cmd_sinfo, "sinfo");
    final String partitionName = PropertyLoader.getRequiredProperty(PropertyLoader.slurm_partition);
    // shell equivalent:
    //   nodelist=$(sinfo -N -h -p vcell2 --Format='nodelist' | xargs | tr ' ' ',')
    // NOTE(review): the pipe symbols are passed as plain array elements, so this presumably
    // relies on the command service handing the joined command to a shell - confirm.
    final String[] sinfoPipeline = new String[] { sinfoExecutable, "-N", "-h", "-p", partitionName, "--Format='nodelist'", "|", "xargs", "|", "tr", "' '", "','" };
    final String rawOutput = commandService.command(sinfoPipeline).getStandardOutput();
    return rawOutput.trim().replace("\n", "");
}
use of cbit.vcell.message.server.cmd.CommandService.CommandOutput in project vcell by virtualcell.
the class SlurmProxy method creatCommandService.
/**
 * Builds the {@link HtcProxy} used to talk to the batch system.
 *
 * When {@code sshHostUserKeyfile} holds exactly three entries (host list, ssh user, key file
 * path) an ssh-based command service is created and smoke-tested by running a short
 * {@code ls} command. If that first attempt fails, a second attempt is made with the
 * additional {@code /root} directory argument. For any other argument (including
 * {@code null}) a local command service is used.
 *
 * @param sshHostUserKeyfile {@code {hosts, user, keyfile}} where {@code hosts} is a list of
 *        host names separated by commas and/or spaces; or {@code null} / wrong length to
 *        run commands locally
 * @return a SLURM proxy wrapping the selected command service
 * @throws IOException declared by this method
 * @throws RuntimeException if both ssh connection attempts fail; the cause is the failure of
 *         the second attempt and the failure of the first attempt is attached as suppressed
 */
public static HtcProxy creatCommandService(java.lang.String[] sshHostUserKeyfile) throws IOException {
    CommandService commandService = null;
    if (sshHostUserKeyfile != null && sshHostUserKeyfile.length == 3) {
        // first entry may name several dispatch hosts, separated by commas and/or spaces
        ArrayList<String> htcDispatchHostNames = new ArrayList<String>();
        StringTokenizer st = new StringTokenizer(sshHostUserKeyfile[0], ", ");
        while (st.hasMoreTokens()) {
            htcDispatchHostNames.add(st.nextToken());
        }
        String sshUser = sshHostUserKeyfile[1];
        File sshKeyFile = new File(sshHostUserKeyfile[2]);
        try {
            // first attempt: use the key file as it is
            commandService = new CommandServiceSshNative(htcDispatchHostNames.toArray(new String[0]), sshUser, sshKeyFile);
            commandService.command(new String[] { "/usr/bin/env bash -c ls | head -5" });
        } catch (Exception e) {
            e.printStackTrace();
            try {
                // second attempt: same connection, with the extra "/root" directory argument
                commandService = new CommandServiceSshNative(htcDispatchHostNames.toArray(new String[0]), sshUser, sshKeyFile, new File("/root"));
                commandService.command(new String[] { "/usr/bin/env bash -c ls | head -5" });
            } catch (Exception e2) {
                // report the failure of THIS attempt, and keep the first one attached so
                // neither diagnosis is lost
                e2.printStackTrace();
                RuntimeException failure = new RuntimeException("failed to establish an ssh command connection to " + sshHostUserKeyfile[0] + " as user '" + sshUser + "' using key '" + sshKeyFile + "'", e2);
                failure.addSuppressed(e);
                throw failure;
            }
        }
        // can't make user directories, they are remote.
        AbstractSolver.bMakeUserDirs = false;
    } else {
        commandService = new CommandServiceLocal();
    }
    BatchSystemType batchSystemType = BatchSystemType.SLURM;
    HtcProxy htcProxy = null;
    switch (batchSystemType) {
        case SLURM:
        {
            htcProxy = new SlurmProxy(commandService, PropertyLoader.getRequiredProperty(PropertyLoader.htcUser));
            break;
        }
        default:
        {
            throw new RuntimeException("unrecognized batch scheduling option :" + batchSystemType);
        }
    }
    return htcProxy;
}
use of cbit.vcell.message.server.cmd.CommandService.CommandOutput in project vcell by virtualcell.
the class HtcSimulationWorker method startJobMonitor.
/**
 * Resets the monitor-jobs file to the currently monitored job ids and starts a daemon thread
 * that, once a minute, asks sacct for the state of those jobs and reports jobs that have
 * failed or completed.
 *
 * The thread stops when it is interrupted; any other error in a polling cycle is printed and
 * the next cycle proceeds normally.
 */
public void startJobMonitor() {
    monitorTheseJobs = getMonitorJobs();
    try {
        System.out.println("----------Resetting slurm monitorJobsFile");
        // rewrite the jobs file so it lists exactly the monitored job ids, one per line
        StringBuilder jobsFileContent = new StringBuilder();
        for (String jobid : monitorTheseJobs.keySet()) {
            jobsFileContent.append(jobid).append("\n");
        }
        Files.write(monitorJobsFile.toPath(), jobsFileContent.toString().getBytes(), StandardOpenOption.CREATE, StandardOpenOption.WRITE, StandardOpenOption.TRUNCATE_EXISTING);
    } catch (IOException e1) {
        e1.printStackTrace();
    }
    monitorJobsThread = new Thread(new Runnable() {
        @Override
        public void run() {
            final int sleeptime = 60000;
            while (true) {
                try {
                    Thread.sleep(sleeptime);
                } catch (InterruptedException e) {
                    // restore the flag and stop; looping on would only re-throw immediately
                    Thread.currentThread().interrupt();
                    return;
                }
                try {
                    pollMonitoredSlurmJobs();
                } catch (Exception e) {
                    // best effort: a failed cycle must not kill the monitor thread
                    e.printStackTrace();
                }
            }
        }
    });
    monitorJobsThread.setDaemon(true);
    monitorJobsThread.start();
}

/**
 * Runs one polling cycle: queries sacct for all monitored job ids and handles each output line.
 * Does nothing when no jobs are monitored or when the sacct command fails.
 */
private void pollMonitoredSlurmJobs() {
    StringBuilder slurmJobidSB = new StringBuilder();
    for (String jobid : monitorTheseJobs.keySet()) {
        if (slurmJobidSB.length() > 0) {
            slurmJobidSB.append(",");
        }
        slurmJobidSB.append(jobid);
    }
    if (slurmJobidSB.length() == 0) {
        return;
    }
    String sacctOutput;
    try {
        String[] tryStr = new String[] { "sacct", "--format=jobid%25,jobname%40,state%30 -n -j " + slurmJobidSB.toString() + " | grep -v \".batch\"" + " | grep -v \".extern\"" };
        CommandOutput commandOutput = htcProxy.getCommandService().command(tryStr);
        sacctOutput = commandOutput.getStandardOutput();
    } catch (Exception e) {
        e.printStackTrace();
        return;
    }
    if (sacctOutput == null) {
        return;
    }
    // one record per line; a problem with one record must not block the others
    for (String line : sacctOutput.split("[\\r\\n]+")) {
        try {
            handleSacctLine(line);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

/**
 * Handles one sacct output line of the form {@code <jobid> <jobname> <state> [extra...]}.
 * Sends the matching worker event for a failed or completed job and stops monitoring it.
 * Blank or short lines, non-terminal states and job ids that are not monitored are skipped.
 */
private void handleSacctLine(String line) throws Exception {
    // extra tokens after the state (e.g. "CANCELLED by <uid>") are ignored instead of
    // being mistaken for the start of the next record
    String[] fields = line.trim().split("\\s+");
    if (fields.length < 3) {
        return;
    }
    String slurmJobID = fields[0];
    String jobName = fields[1];
    String jobState = fields[2];
    boolean failed = isFailedSlurmState(jobState);
    boolean completed = jobState.equalsIgnoreCase("COMPLETED");
    if (!failed && !completed) {
        return;
    }
    MonitorJobInfo monitorJobInfo = monitorTheseJobs.get(slurmJobID);
    if (monitorJobInfo == null) {
        System.out.println("Job monitor: no monitor info for slurmJobID=" + slurmJobID + " jobName=" + jobName + " jobState=" + jobState + ", skipping");
        return;
    }
    if (failed) {
        WorkerEventMessage.sendWorkerExitError(messageProducer_sim, HtcSimulationWorker.class.getName(), ManageUtils.getHostName(), monitorJobInfo.vcsimID, monitorJobInfo.jobIndex, monitorJobInfo.taskID, SimulationMessage.jobFailed("Fail found by monitor, slrmJobID=" + slurmJobID + " jobName=" + jobName + " jobState=" + jobState));
    } else {
        WorkerEventMessage.sendAlternateCompleted(messageProducer_sim, HtcSimulationWorker.class.getName(), monitorJobInfo.vcsimID, ManageUtils.getHostName(), monitorJobInfo.jobIndex, monitorJobInfo.taskID);
    }
    removeMonitorJob(Long.parseLong(slurmJobID));
}

/**
 * Tells whether a Slurm job state is one the monitor reports as a failure.
 *
 * Slurm job states, for reference:
 *   BF  BOOT_FAIL      launch failure, typically hardware
 *   CA  CANCELLED      cancelled by user or administrator
 *   CD  COMPLETED      all processes exited with code zero
 *   DL  DEADLINE       terminated on deadline
 *   F   FAILED         non-zero exit code or other failure
 *   NF  NODE_FAIL      failure of an allocated node
 *   OOM OUT_OF_MEMORY  out of memory
 *   PD  PENDING        awaiting resource allocation
 *   PR  PREEMPTED      terminated due to preemption
 *   R   RUNNING        currently has an allocation
 *   RQ  REQUEUED       job was requeued
 *   RS  RESIZING       about to change size
 *   RV  REVOKED        sibling removed because another cluster started the job
 *   S   SUSPENDED      has an allocation but execution is suspended
 *   TO  TIMEOUT        reached its time limit
 */
private static boolean isFailedSlurmState(String jobState) {
    return jobState.equalsIgnoreCase("FAILED")
            || jobState.startsWith("CANCELLED")
            || jobState.equalsIgnoreCase("BOOT_FAIL")
            || jobState.equalsIgnoreCase("DEADLINE")
            || jobState.equalsIgnoreCase("NODE_FAIL")
            || jobState.equalsIgnoreCase("OUT_OF_MEMORY")
            || jobState.equalsIgnoreCase("PREEMPTED")
            || jobState.equalsIgnoreCase("TIMEOUT");
}
use of cbit.vcell.message.server.cmd.CommandService.CommandOutput in project vcell by virtualcell.
the class SlurmProxy method getJobStatus.
/**
 * Queries sacct for the given jobs and returns the status of those that were requested.
 *
 * The executable name comes from the {@code slurm_cmd_sacct} property (default "sacct").
 * Entries parsed from the sacct output that do not correspond to a requested job are
 * dropped from the result.
 *
 * @param requestedHtcJobInfos the jobs to look up; must not be empty
 * @return map from job info to job status, restricted to the requested jobs
 * @throws RuntimeException if {@code requestedHtcJobInfos} is empty
 * @throws ExecutableException declared by this method; presumably raised when the command fails
 * @throws IOException declared by this method; presumably raised on I/O failure while running the command
 */
public Map<HtcJobInfo, HtcJobStatus> getJobStatus(List<HtcJobInfo> requestedHtcJobInfos) throws ExecutableException, IOException {
    if (requestedHtcJobInfos.size() == 0) {
        throw new RuntimeException("htcJobList is empty");
    }
    final String JOB_CMD_SACCT = PropertyLoader.getProperty(PropertyLoader.slurm_cmd_sacct, "sacct");
    ArrayList<String> jobNumbers = new ArrayList<String>();
    for (HtcJobInfo jobInfo : requestedHtcJobInfos) {
        jobNumbers.add(Long.toString(jobInfo.getHtcJobID().getJobNumber()));
    }
    String jobList = String.join(",", jobNumbers);
    String[] cmds = { JOB_CMD_SACCT, "-P", "-j", jobList, "-o", "jobid%25,jobname%25,state%13" };
    CommandOutput commandOutput = commandService.command(cmds);
    String output = commandOutput.getStandardOutput();
    Map<HtcJobInfo, HtcJobStatus> statusMap = extractJobIds(output);
    // Keep only the requested jobs. This goes through the key-set view because calling
    // statusMap.remove() while iterating over keySet() throws
    // ConcurrentModificationException on the standard map implementations.
    statusMap.keySet().retainAll(requestedHtcJobInfos);
    return statusMap;
}
Aggregations