Search in sources :

Example 1 with PipeliteTimeoutException

Use of pipelite.exception.PipeliteTimeoutException in the project pipelite by enasequence.

The method recoverJobs of the class AbstractLsfExecutor.

/**
 * Tries to recover results for LSF jobs whose result is missing. Recovery is a one-off attempt:
 * a job that cannot be recovered here is considered failed.
 *
 * <p>Every entry in {@code jobResults} that has a job id but no result is handed to a fixed-size
 * thread pool ({@code JOB_RECOVERY_PARALLELISM} threads). Each task first tries bhist and, only
 * if that does not recover the job, falls back to the job output file. The calling thread then
 * polls every {@code JOB_RECOVERY_POLL_FREQUENCY} until no task is pending or until
 * {@code Time.waitUntil} signals that the deadline ({@code JOB_RECOVERY_TIMEOUT} after the start)
 * has passed by throwing {@link PipeliteTimeoutException}. The pool is always shut down with
 * {@code shutdownNow()} before returning.
 *
 * @param cmdRunner the command runner passed on to the recovery helpers
 * @param requestMap request contexts passed on to the output file based recovery
 * @param jobResults the job results to inspect; the entries are handed to the recovery helpers
 */
private static void recoverJobs(CmdRunner cmdRunner, Map<String, LsfDescribeJobsCache.RequestContext> requestMap, List<JobResult> jobResults) {
    log.atInfo().log("Recovering LSF job results.");

    final AtomicInteger pending = new AtomicInteger();
    final AtomicInteger attempted = new AtomicInteger();
    final AtomicInteger recovered = new AtomicInteger();

    final ZonedDateTime startTime = ZonedDateTime.now();
    final ZonedDateTime deadline = startTime.plus(JOB_RECOVERY_TIMEOUT);

    final ExecutorService pool = Executors.newFixedThreadPool(JOB_RECOVERY_PARALLELISM);
    try {
        for (JobResult jobResult : jobResults) {
            // Only jobs that were submitted (have an id) but still lack a result are candidates.
            if (jobResult.jobId == null || jobResult.result != null) {
                continue;
            }
            attempted.incrementAndGet();
            pending.incrementAndGet();
            pool.submit(() -> {
                try {
                    // bhist is tried first; the output file is consulted only if bhist did not
                    // recover the job (short-circuit evaluation).
                    boolean wasRecovered =
                            recoverJobUsingBhist(cmdRunner, jobResult)
                                    || recoverJobUsingOutFile(cmdRunner, jobResult, requestMap);
                    if (wasRecovered) {
                        recovered.incrementAndGet();
                    }
                } finally {
                    // Always release the slot so the waiting loop below can finish even if
                    // the recovery attempt threw.
                    pending.decrementAndGet();
                }
            });
        }

        try {
            while (pending.get() > 0) {
                Time.waitUntil(JOB_RECOVERY_POLL_FREQUENCY, deadline);
            }
        } catch (PipeliteTimeoutException ex) {
            log.atWarning().log("LSF job recovery timeout exceeded.");
        }
    } finally {
        pool.shutdownNow();
    }

    long elapsedSeconds = Duration.between(ZonedDateTime.now(), startTime).abs().toMillis() / 1000;
    log.atInfo().log("Finished recovering LSF job results in " + elapsedSeconds + " seconds. Recovered " + recovered.get() + " out of " + attempted.get() + " jobs.");
}
Also used : Setter(lombok.Setter) Getter(lombok.Getter) LsfDescribeJobsCache(pipelite.executor.describe.cache.LsfDescribeJobsCache) ZonedDateTime(java.time.ZonedDateTime) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) AbstractLsfExecutorParameters(pipelite.stage.parameters.AbstractLsfExecutorParameters) Matcher(java.util.regex.Matcher) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) Time(pipelite.time.Time) Duration(java.time.Duration) Map(java.util.Map) ExecutorService(java.util.concurrent.ExecutorService) LsfFilePathResolver(pipelite.stage.path.LsfFilePathResolver) DescribeJobs(pipelite.executor.describe.DescribeJobs) DescribeJobsCacheService(pipelite.service.DescribeJobsCacheService) StageExecutorResult(pipelite.stage.executor.StageExecutorResult) LogKey(pipelite.log.LogKey) PipeliteException(pipelite.exception.PipeliteException) Collectors(java.util.stream.Collectors) RetryTask(pipelite.executor.task.RetryTask) Executors(java.util.concurrent.Executors) StageExecutorResultAttribute(pipelite.stage.executor.StageExecutorResultAttribute) CmdRunner(pipelite.executor.cmd.CmdRunner) Flogger(lombok.extern.flogger.Flogger) List(java.util.List) StageExecutorRequest(pipelite.stage.executor.StageExecutorRequest) Paths(java.nio.file.Paths) Pattern(java.util.regex.Pattern) PipeliteTimeoutException(pipelite.exception.PipeliteTimeoutException) FluentLogger(com.google.common.flogger.FluentLogger) JsonIgnoreProperties(com.fasterxml.jackson.annotation.JsonIgnoreProperties) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) ZonedDateTime(java.time.ZonedDateTime) ExecutorService(java.util.concurrent.ExecutorService) PipeliteTimeoutException(pipelite.exception.PipeliteTimeoutException)

Aggregations

JsonIgnoreProperties (com.fasterxml.jackson.annotation.JsonIgnoreProperties)1 FluentLogger (com.google.common.flogger.FluentLogger)1 Paths (java.nio.file.Paths)1 Duration (java.time.Duration)1 ZonedDateTime (java.time.ZonedDateTime)1 ArrayList (java.util.ArrayList)1 HashMap (java.util.HashMap)1 List (java.util.List)1 Map (java.util.Map)1 ExecutorService (java.util.concurrent.ExecutorService)1 Executors (java.util.concurrent.Executors)1 AtomicInteger (java.util.concurrent.atomic.AtomicInteger)1 Matcher (java.util.regex.Matcher)1 Pattern (java.util.regex.Pattern)1 Collectors (java.util.stream.Collectors)1 Getter (lombok.Getter)1 Setter (lombok.Setter)1 Flogger (lombok.extern.flogger.Flogger)1 PipeliteException (pipelite.exception.PipeliteException)1 PipeliteTimeoutException (pipelite.exception.PipeliteTimeoutException)1