Use of pipelite.exception.PipeliteTimeoutException in project pipelite by enasequence.
From the class AbstractLsfExecutor, method recoverJobs.
/**
 * Attempt to recover missing job results. This is only ever done once, and if it fails the
 * job is considered failed.
 */
private static void recoverJobs(
    CmdRunner cmdRunner,
    Map<String, LsfDescribeJobsCache.RequestContext> requestMap,
    List<JobResult> jobResults) {
  log.atInfo().log("Recovering LSF job results.");
  AtomicInteger remainingCount = new AtomicInteger();
  AtomicInteger attemptedCount = new AtomicInteger();
  AtomicInteger recoveredCount = new AtomicInteger();
  ZonedDateTime start = ZonedDateTime.now();
  ZonedDateTime until = start.plus(JOB_RECOVERY_TIMEOUT);
  ExecutorService executorService = Executors.newFixedThreadPool(JOB_RECOVERY_PARALLELISM);
  try {
    jobResults.stream()
        .filter(r -> r.jobId != null && r.result == null)
        .forEach(
            r -> {
              attemptedCount.incrementAndGet();
              remainingCount.incrementAndGet();
              executorService.submit(
                  () -> {
                    try {
                      // Attempt to recover missing job result using bhist.
                      if (recoverJobUsingBhist(cmdRunner, r)) {
                        recoveredCount.incrementAndGet();
                      } else {
                        // Attempt to recover missing job result using output file.
                        if (recoverJobUsingOutFile(cmdRunner, r, requestMap)) {
                          recoveredCount.incrementAndGet();
                        }
                      }
                    } finally {
                      remainingCount.decrementAndGet();
                    }
                  });
            });
    try {
      while (remainingCount.get() > 0) {
        Time.waitUntil(JOB_RECOVERY_POLL_FREQUENCY, until);
      }
    } catch (PipeliteTimeoutException ex) {
      log.atWarning().log("LSF job recovery timeout exceeded.");
    }
  } finally {
    executorService.shutdownNow();
  }
log.atInfo().log("Finished recovering LSF job results in " + (Duration.between(ZonedDateTime.now(), start).abs().toMillis() / 1000) + " seconds. Recovered " + recoveredCount.get() + " out of " + attemptedCount.get() + " jobs.");
}
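In the method above, Time.waitUntil(JOB_RECOVERY_POLL_FREQUENCY, until) appears to block for one poll interval and to throw PipeliteTimeoutException once the deadline passes, which is what ends the recovery wait loop early. Below is a minimal, self-contained sketch of that poll-until-deadline pattern. The helper waitUntil and the exception TimeoutExceededException are hypothetical stand-ins written for illustration; they are not pipelite's own Time.waitUntil or PipeliteTimeoutException implementations.

import java.time.Duration;
import java.time.ZonedDateTime;
import java.util.concurrent.atomic.AtomicInteger;

// Hypothetical stand-in for pipelite.exception.PipeliteTimeoutException.
class TimeoutExceededException extends RuntimeException {
  TimeoutExceededException(String message) {
    super(message);
  }
}

public class PollUntilDeadlineSketch {

  // Hypothetical helper mirroring the role of Time.waitUntil(pollFrequency, until):
  // throw if the deadline has passed, otherwise sleep for one poll interval.
  static void waitUntil(Duration pollFrequency, ZonedDateTime until) {
    if (ZonedDateTime.now().isAfter(until)) {
      throw new TimeoutExceededException("Deadline exceeded at " + until);
    }
    try {
      Thread.sleep(pollFrequency.toMillis());
    } catch (InterruptedException ex) {
      Thread.currentThread().interrupt();
      throw new TimeoutExceededException("Interrupted while waiting");
    }
  }

  public static void main(String[] args) {
    // Pretend three recovery tasks are in flight; each poll one of them finishes.
    AtomicInteger remainingCount = new AtomicInteger(3);
    ZonedDateTime until = ZonedDateTime.now().plus(Duration.ofSeconds(2));
    try {
      while (remainingCount.get() > 0) {
        waitUntil(Duration.ofMillis(200), until);
        remainingCount.decrementAndGet();
      }
      System.out.println("All tasks finished before the deadline.");
    } catch (TimeoutExceededException ex) {
      System.out.println("Recovery timeout exceeded, giving up on remaining tasks.");
    }
  }
}

As in recoverJobs, the waiting thread never joins the worker tasks directly; it only polls a shared counter and lets the timeout exception cut the wait short, after which the remaining jobs are treated as unrecovered.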