Search in sources :

Example 1 with CrashLoopInfo

use of com.hubspot.singularity.CrashLoopInfo in project Singularity by HubSpot.

the class SingularityCrashLoopChecker method checkCooldowns.

/**
 * Runs one pass of cooldown checking followed by crash loop bookkeeping.
 *
 * <p>Phase one evaluates every request returned by {@code requestManager.getCooldownRequests(false)} on
 * {@code cooldownExecutor}, each under its request lock at LOW priority, and counts how many calls to
 * {@code checkCooldown} returned true. Phase two walks all active requests that have an active deploy:
 * crash loop records belonging to another deploy are deleted, ended records beyond the ten most recent
 * are deleted, loops that are newly detected are saved, and previously open loops that are no longer
 * detected are saved again with the current time as their end.
 */
public void checkCooldowns() {
    final long start = System.currentTimeMillis();
    // cooldown reserved for fast loop, check crash loops separately
    final List<SingularityRequestWithState> cooldownRequests = Lists.newArrayList(requestManager.getCooldownRequests(false));
    final AtomicInteger exitedCooldown = new AtomicInteger(0);
    // Shared between the async cooldown checks and the crash loop pass below, hence a concurrent map
    final Map<SingularityDeployKey, Optional<SingularityDeployStatistics>> deployStatsCache = new ConcurrentHashMap<>();

    if (!cooldownRequests.isEmpty()) {
        final List<CompletableFuture<Void>> cooldownChecks = cooldownRequests
            .stream()
            .map(cooldownRequest -> CompletableFuture.runAsync(
                () -> lock.runWithRequestLock(
                    () -> {
                        if (checkCooldown(cooldownRequest, deployStatsCache)) {
                            exitedCooldown.incrementAndGet();
                        }
                    },
                    cooldownRequest.getRequest().getId(),
                    getClass().getSimpleName(),
                    SingularitySchedulerLock.Priority.LOW
                ),
                cooldownExecutor
            ))
            .collect(Collectors.toList());
        // Block until every cooldown check has finished before moving on
        CompletableFutures.allOf(cooldownChecks).join();
    }

    // Check for crash loops
    for (SingularityRequestWithState request : requestManager.getActiveRequests()) {
        final String requestId = request.getRequest().getId();
        final Optional<SingularityRequestDeployState> maybeDeployState = deployManager.getRequestDeployState(requestId);
        final boolean hasActiveDeploy = maybeDeployState.isPresent() && maybeDeployState.get().getActiveDeploy().isPresent();
        if (!hasActiveDeploy) {
            continue;
        }
        final String activeDeployId = maybeDeployState.get().getActiveDeploy().get().getDeployId();

        // Remove outdated loops on new deploy
        final List<CrashLoopInfo> crashLoopHistory = Lists.newArrayList();
        for (CrashLoopInfo recorded : requestManager.getCrashLoopsForRequest(requestId)) {
            if (recorded.getDeployId().equals(activeDeployId)) {
                crashLoopHistory.add(recorded);
            } else {
                requestManager.deleteCrashLoop(recorded);
            }
        }

        // Only keep the most recent 10 crash loop infos
        crashLoopHistory
            .stream()
            .filter(recorded -> recorded.getEnd().isPresent())
            .sorted(Comparator.comparingLong(CrashLoopInfo::getStart).reversed())
            .skip(10)
            .forEach(requestManager::deleteCrashLoop);

        // Loops without an end timestamp are the ones still considered open
        final List<CrashLoopInfo> previouslyActive = crashLoopHistory
            .stream()
            .filter(recorded -> !recorded.getEnd().isPresent())
            .collect(Collectors.toList());

        final Optional<SingularityDeployStatistics> maybeDeployStatistics = deployStatsCache.computeIfAbsent(
            new SingularityDeployKey(requestId, activeDeployId),
            key -> deployManager.getDeployStatistics(requestId, activeDeployId)
        );
        if (!maybeDeployStatistics.isPresent()) {
            continue;
        }

        final List<CrashLoopInfo> active = crashLoops.getActiveCrashLoops(maybeDeployStatistics.get());

        // Persist loops that were not open before this pass
        for (CrashLoopInfo detected : active) {
            if (previouslyActive.stream().noneMatch(detected::matches)) {
                LOG.info("New crash loop for {}: {}", requestId, detected);
                requestManager.saveCrashLoop(detected);
            }
        }

        // Close out loops that were open before but are no longer detected
        for (CrashLoopInfo open : previouslyActive) {
            if (active.stream().noneMatch(open::matches)) {
                LOG.info("Crash loop resolved for {}: {}", requestId, open);
                requestManager.saveCrashLoop(
                    new CrashLoopInfo(open.getRequestId(), open.getDeployId(), open.getStart(), Optional.of(System.currentTimeMillis()), open.getType())
                );
            }
        }
    }

    LOG.info("{} out of {} cooldown requests exited cooldown in {}", exitedCooldown.get(), cooldownRequests.size(), JavaUtils.duration(start));
}
Also used : SingularityRequestWithState(com.hubspot.singularity.SingularityRequestWithState) DeployManager(com.hubspot.singularity.data.DeployManager) Inject(com.google.inject.Inject) RequestManager(com.hubspot.singularity.data.RequestManager) LoggerFactory(org.slf4j.LoggerFactory) CompletableFuture(java.util.concurrent.CompletableFuture) Singleton(javax.inject.Singleton) SingularityDeployStatistics(com.hubspot.singularity.SingularityDeployStatistics) Lists(com.google.common.collect.Lists) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) SingularityDeployKey(com.hubspot.singularity.SingularityDeployKey) Map(java.util.Map) ExecutorService(java.util.concurrent.ExecutorService) SingularityConfiguration(com.hubspot.singularity.config.SingularityConfiguration) Logger(org.slf4j.Logger) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) CompletableFutures(com.hubspot.singularity.async.CompletableFutures) Collectors(java.util.stream.Collectors) JavaUtils(com.hubspot.mesos.JavaUtils) List(java.util.List) SingularityRequestDeployState(com.hubspot.singularity.SingularityRequestDeployState) SingularityManagedThreadPoolFactory(com.hubspot.singularity.SingularityManagedThreadPoolFactory) CrashLoopInfo(com.hubspot.singularity.CrashLoopInfo) SingularitySchedulerLock(com.hubspot.singularity.mesos.SingularitySchedulerLock) Optional(java.util.Optional) Comparator(java.util.Comparator) Optional(java.util.Optional) CrashLoopInfo(com.hubspot.singularity.CrashLoopInfo) SingularityRequestDeployState(com.hubspot.singularity.SingularityRequestDeployState) Optional(java.util.Optional) SingularityDeployKey(com.hubspot.singularity.SingularityDeployKey) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) SingularityRequestWithState(com.hubspot.singularity.SingularityRequestWithState) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) SingularityDeployStatistics(com.hubspot.singularity.SingularityDeployStatistics)

Example 2 with CrashLoopInfo

use of com.hubspot.singularity.CrashLoopInfo in project Singularity by HubSpot.

the class SingularityCrashLoops method getActiveCrashLoops.

/**
 * Inspects the task failure events recorded for a deploy and reports every crash loop currently detected.
 *
 * <p>An empty list is returned without further evaluation when the deploy has no failure events, when the
 * deploy is still the pending deploy of its request, or when the request can no longer be found. Otherwise
 * the following checks run in order, each adding at most one entry: fast failure, startup failure
 * (variant a or, only if a did not trigger, variant b), OOM, single instance failure, multi instance
 * failure and, for long running requests only, slow failures and unexpected exits.
 *
 * @param deployStatistics statistics, including task failure events, of the deploy to evaluate
 * @return a new list holding one {@link CrashLoopInfo} without an end timestamp per detected loop; never null
 */
List<CrashLoopInfo> getActiveCrashLoops(SingularityDeployStatistics deployStatistics) {
    final String requestId = deployStatistics.getRequestId();
    final String deployId = deployStatistics.getDeployId();
    List<CrashLoopInfo> active = new ArrayList<>();
    if (deployStatistics.getTaskFailureEvents().isEmpty()) {
        return active;
    }
    Optional<SingularityPendingDeploy> maybePending = deployManager.getPendingDeploy(requestId);
    if (maybePending.isPresent() && maybePending.get().getDeployMarker().getDeployId().equals(deployId)) {
        LOG.debug("Not checking cooldown for pending deploy {} - {}", requestId, deployId);
        return active;
    }
    Optional<SingularityRequestWithState> maybeRequest = requestManager.getRequest(requestId);
    if (!maybeRequest.isPresent()) {
        return active;
    }
    long now = System.currentTimeMillis();
    // Check fast failures
    Optional<Long> maybeCooldownStart = cooldownStart(deployStatistics, Optional.empty());
    if (maybeCooldownStart.isPresent()) {
        active.add(new CrashLoopInfo(requestId, deployId, maybeCooldownStart.get(), Optional.empty(), CrashLoopType.FAST_FAILURE_LOOP));
    }
    /*
     * Startup failure loop
     * a) small count of failures but instance num matches one that is in cleaning state waiting for a replacement
     */
    // Latest cleanup timestamp per instance number, restricted to cleanups of this request and deploy
    Map<Integer, Long> taskCleanStartTimes = taskManager
        .getCleanupTasks()
        .stream()
        .filter(t -> t.getTaskId().getRequestId().equals(requestId) && t.getTaskId().getDeployId().equals(deployId))
        .collect(Collectors.toMap(t -> t.getTaskId().getInstanceNo(), SingularityTaskCleanup::getTimestamp, Math::max));
    // Startup failure timestamps per instance; only instances present in taskCleanStartTimes are kept
    Map<Integer, List<Long>> recentStartupFailures = deployStatistics
        .getTaskFailureEvents()
        .stream()
        .filter(e -> e.getType() == TaskFailureType.STARTUP_FAILURE && taskCleanStartTimes.containsKey(e.getInstance()))
        .collect(Collectors.groupingBy(TaskFailureEvent::getInstance, Collectors.mapping(TaskFailureEvent::getTimestamp, Collectors.toList())));
    boolean hasStartupFailure = false;
    for (Map.Entry<Integer, List<Long>> entry : recentStartupFailures.entrySet()) {
        // Every key here passed the containsKey filter above, so the lookup cannot return null
        final long cleanStart = taskCleanStartTimes.get(entry.getKey());
        long failuresSinceCleanStart = entry.getValue().stream().filter(t -> t > cleanStart).count();
        if (failuresSinceCleanStart > 2) {
            active.add(new CrashLoopInfo(requestId, deployId, entry.getValue().stream().min(Comparator.naturalOrder()).get(), Optional.empty(), CrashLoopType.STARTUP_FAILURE_LOOP));
            hasStartupFailure = true;
            break;
        }
    }
    /*
     * Startup failure loop
     * b) multiple instances failing healthchecks too many times in X minutes
     */
    // Only evaluated when (a) did not already report a startup failure loop; running it after (a) succeeded
    // could only add a second STARTUP_FAILURE_LOOP entry for the same deploy.
    if (!hasStartupFailure) {
        long startupFailThreshold = now - TimeUnit.MINUTES.toMillis(configuration.getEvaluateStartupLoopOverMinutes());
        List<Long> recentStartupFailTimestamps = recentStartupFailures
            .values()
            .stream()
            .flatMap(List::stream)
            .filter(t -> t > startupFailThreshold)
            .collect(Collectors.toList());
        if (!recentStartupFailTimestamps.isEmpty() && recentStartupFailTimestamps.size() > configuration.getStartupFailureThreshold()) {
            active.add(new CrashLoopInfo(requestId, deployId, recentStartupFailTimestamps.stream().min(Comparator.naturalOrder()).get(), Optional.empty(), CrashLoopType.STARTUP_FAILURE_LOOP));
        }
    }
    /*
     * OOM Danger. > X OOMs in Y minutes across all instances
     */
    long thresholdOomTime = now - TimeUnit.MINUTES.toMillis(configuration.getEvaluateOomsOverMinutes());
    List<Long> oomFailures = deployStatistics
        .getTaskFailureEvents()
        .stream()
        .filter(e -> e.getType() == TaskFailureType.OOM && e.getTimestamp() > thresholdOomTime)
        .map(TaskFailureEvent::getTimestamp)
        .collect(Collectors.toList());
    // The emptiness guard keeps min().get() from throwing when the configured threshold is zero or negative
    if (!oomFailures.isEmpty() && oomFailures.size() >= configuration.getOomFailureThreshold()) {
        active.add(new CrashLoopInfo(requestId, deployId, oomFailures.stream().min(Comparator.naturalOrder()).get(), Optional.empty(), CrashLoopType.OOM));
    }
    /*
     * Single instance failure. > X failures with same instance no in X minutes, bucketed to avoid counting fast failure as one of these
     * Multi instance failure. > X% of instances failing within Y minutes
     */
    Map<Integer, List<Long>> recentFailuresByInstance = deployStatistics
        .getTaskFailureEvents()
        .stream()
        .filter(e -> e.getType() == TaskFailureType.OOM || e.getType() == TaskFailureType.BAD_EXIT_CODE || e.getType() == TaskFailureType.OUT_OF_DISK_SPACE)
        .collect(Collectors.groupingBy(TaskFailureEvent::getInstance, Collectors.mapping(TaskFailureEvent::getTimestamp, Collectors.toList())));
    for (Map.Entry<Integer, List<Long>> entry : recentFailuresByInstance.entrySet()) {
        Optional<Long> maybeCrashStart = getStartForFailuresInBuckets(now, entry.getValue(), TimeUnit.MINUTES.toMillis(configuration.getSingleInstanceFailureBucketSizeMinutes()), configuration.getSingleInstanceFailureBuckets(), configuration.getSingleInstanceFailureThreshold(), configuration.getSingleInstanceMinBucketIndexPercent());
        if (maybeCrashStart.isPresent()) {
            // One single-instance entry is enough; stop at the first instance that qualifies
            active.add(new CrashLoopInfo(requestId, deployId, maybeCrashStart.get(), Optional.empty(), CrashLoopType.SINGLE_INSTANCE_FAILURE_LOOP));
            break;
        }
    }
    Optional<Long> maybeMultiCrashStart = getStartForFailuresInBuckets(now, recentFailuresByInstance.values().stream().flatMap(List::stream).collect(Collectors.toList()), TimeUnit.MINUTES.toMillis(configuration.getMultiInstanceFailureBucketSizeMinutes()), configuration.getMultiInstanceFailureBuckets(), configuration.getMultiInstanceFailureThreshold(), configuration.getMultiInstanceMinBucketIndexPercent());
    // A multi instance loop requires failures from more than one distinct instance number
    if (recentFailuresByInstance.size() > 1 && maybeMultiCrashStart.isPresent()) {
        active.add(new CrashLoopInfo(requestId, deployId, maybeMultiCrashStart.get(), Optional.empty(), CrashLoopType.MULTI_INSTANCE_FAILURE));
    }
    if (maybeRequest.get().getRequest().isLongRunning()) {
        /*
         * Slow failures. Occasional failures, count on order of hours, looking for consistency in non-zero count each hour
         */
        getStartForFailuresInBuckets(now, recentFailuresByInstance, TimeUnit.MINUTES.toMillis(configuration.getSlowFailureBucketSizeMinutes()), configuration.getSlowFailureBuckets(), configuration.getSlowFailureThreshold(), configuration.getSlowFailureMinBucketIndexPercent())
            .ifPresent(start -> active.add(new CrashLoopInfo(requestId, deployId, start, Optional.empty(), CrashLoopType.SLOW_FAILURES)));
        getUnexpectedExitLoop(now, deployStatistics).ifPresent(active::add);
    }
    return active;
}
Also used : SingularityRequestWithState(com.hubspot.singularity.SingularityRequestWithState) IntStream(java.util.stream.IntStream) CrashLoopType(com.hubspot.singularity.CrashLoopType) SingularityRequest(com.hubspot.singularity.SingularityRequest) DeployManager(com.hubspot.singularity.data.DeployManager) TaskFailureType(com.hubspot.singularity.TaskFailureType) Inject(com.google.inject.Inject) RequestManager(com.hubspot.singularity.data.RequestManager) LoggerFactory(org.slf4j.LoggerFactory) Singleton(javax.inject.Singleton) SingularityDeployStatistics(com.hubspot.singularity.SingularityDeployStatistics) ArrayList(java.util.ArrayList) TaskFailureEvent(com.hubspot.singularity.TaskFailureEvent) Map(java.util.Map) TaskManager(com.hubspot.singularity.data.TaskManager) SingularityConfiguration(com.hubspot.singularity.config.SingularityConfiguration) SingularityTaskCleanup(com.hubspot.singularity.SingularityTaskCleanup) CrashLoopConfiguration(com.hubspot.singularity.config.CrashLoopConfiguration) Logger(org.slf4j.Logger) SingularityPendingDeploy(com.hubspot.singularity.SingularityPendingDeploy) Collectors(java.util.stream.Collectors) RequestState(com.hubspot.singularity.RequestState) TimeUnit(java.util.concurrent.TimeUnit) List(java.util.List) CrashLoopInfo(com.hubspot.singularity.CrashLoopInfo) Optional(java.util.Optional) Comparator(java.util.Comparator) CrashLoopInfo(com.hubspot.singularity.CrashLoopInfo) ArrayList(java.util.ArrayList) SingularityPendingDeploy(com.hubspot.singularity.SingularityPendingDeploy) SingularityRequestWithState(com.hubspot.singularity.SingularityRequestWithState) ArrayList(java.util.ArrayList) List(java.util.List) Map(java.util.Map) TaskFailureEvent(com.hubspot.singularity.TaskFailureEvent)

Example 3 with CrashLoopInfo

use of com.hubspot.singularity.CrashLoopInfo in project Singularity by HubSpot.

the class SingularityWebhookSender method checkCrashLoopUpdates.

/**
 * Dispatches an asynchronous webhook call for every crash loop update queued for the given webhook.
 *
 * @param webhook the webhook whose queued crash loop updates are sent
 * @param webhookFutures list that receives one entry per dispatched update
 * @return the number of crash loop updates that were queued for this webhook
 */
private int checkCrashLoopUpdates(SingularityWebhook webhook, List<CompletableFuture<Response>> webhookFutures) {
    final List<CrashLoopInfo> pendingUpdates = webhookManager.getQueuedCrashLoopUpdatesForHook(webhook.getId());
    // NOTE(review): this counter is never incremented, so shouldDeleteUpdateOnFailure always receives 0.
    // Confirm whether the position in the queue (or the queue size) was meant to be passed instead.
    final int updateCount = 0;
    for (CrashLoopInfo pendingUpdate : pendingUpdates) {
        final String resolvedUri = applyPlaceholders(webhook.getUri(), pendingUpdate);
        webhookFutures.add(
            webhookSemaphore.call(
                () -> executeWebhookAsync(
                    resolvedUri,
                    pendingUpdate,
                    new SingularityCrashLoopWebhookAsyncHandler(
                        webhookManager,
                        webhook,
                        pendingUpdate,
                        // Age is measured from the loop's end when it has one, otherwise from its start
                        shouldDeleteUpdateOnFailure(updateCount, pendingUpdate.getEnd().orElse(pendingUpdate.getStart()))
                    )
                )
            )
        );
    }
    return pendingUpdates.size();
}
Also used : CrashLoopInfo(com.hubspot.singularity.CrashLoopInfo)

Example 4 with CrashLoopInfo

use of com.hubspot.singularity.CrashLoopInfo in project Singularity by HubSpot.

the class SnsWebhookRetryer method checkWebhooks.

/**
 * Re-sends every update queued for retry through {@code snsWebhookManager} and then removes it from the
 * retry queue. Task updates, deploy updates, request history entries and crash loop updates are processed
 * in that order. A task update whose task cannot be loaded is dropped from the queue without being sent.
 */
public void checkWebhooks() {
    for (SingularityTaskHistoryUpdate pendingTaskUpdate : webhookManager.getTaskUpdatesToRetry()) {
        final Optional<SingularityTask> maybeTask = taskHistoryHelper.getTask(pendingTaskUpdate.getTaskId());
        if (maybeTask.isPresent()) {
            final SingularityTaskWebhook payload = new SingularityTaskWebhook(maybeTask.get(), pendingTaskUpdate);
            snsWebhookManager.taskWebhook(payload);
        }
        // Removed from the retry queue whether or not the task was found
        webhookManager.deleteTaskUpdateForRetry(pendingTaskUpdate);
    }

    webhookManager.getDeployUpdatesToRetry().forEach(pendingDeployUpdate -> {
        snsWebhookManager.deployHistoryEvent(pendingDeployUpdate);
        webhookManager.deleteDeployUpdateForRetry(pendingDeployUpdate);
    });

    webhookManager.getRequestUpdatesToRetry().forEach(pendingRequestHistory -> {
        snsWebhookManager.requestHistoryEvent(pendingRequestHistory);
        webhookManager.deleteRequestUpdateForRetry(pendingRequestHistory);
    });

    webhookManager.getCrashLoopUpdatesToRetry().forEach(pendingCrashLoop -> {
        snsWebhookManager.crashLoopEvent(pendingCrashLoop);
        webhookManager.deleteCrashLoopUpdateForRetry(pendingCrashLoop);
    });
}
Also used : SingularityRequestHistory(com.hubspot.singularity.SingularityRequestHistory) SingularityTask(com.hubspot.singularity.SingularityTask) CrashLoopInfo(com.hubspot.singularity.CrashLoopInfo) SingularityTaskHistoryUpdate(com.hubspot.singularity.SingularityTaskHistoryUpdate) SingularityDeployUpdate(com.hubspot.singularity.SingularityDeployUpdate) SingularityTaskWebhook(com.hubspot.singularity.SingularityTaskWebhook)

Example 5 with CrashLoopInfo

use of com.hubspot.singularity.CrashLoopInfo in project Singularity by HubSpot.

the class SingularityCrashLoopTest method itDetectsSlowConsistentFailureLoops.

/**
 * Records BAD_EXIT_CODE failures for instance 1 every 30 minutes from 30 to 390 minutes ago, leaving out
 * the 240 minute mark, and expects exactly one detected crash loop of type SLOW_FAILURES.
 */
@Test
public void itDetectsSlowConsistentFailureLoops() {
    initRequestWithType(RequestType.WORKER, false);
    initFirstDeploy();

    final long now = System.currentTimeMillis();
    for (int minutesAgo = 30; minutesAgo <= 390; minutesAgo += 30) {
        if (minutesAgo == 240) {
            // skip one
            continue;
        }
        createTaskFailure(1, now - TimeUnit.MINUTES.toMillis(minutesAgo), TaskFailureType.BAD_EXIT_CODE);
    }

    final SingularityDeployStatistics stats = deployManager.getDeployStatistics(requestId, firstDeployId).get();
    final List<CrashLoopInfo> detected = crashLoops.getActiveCrashLoops(stats);

    Assertions.assertEquals(1, detected.size());
    Assertions.assertEquals(CrashLoopType.SLOW_FAILURES, Iterables.getOnlyElement(detected).getType());
}
Also used : CrashLoopInfo(com.hubspot.singularity.CrashLoopInfo) SingularityDeployStatistics(com.hubspot.singularity.SingularityDeployStatistics) Test(org.junit.jupiter.api.Test)

Aggregations

CrashLoopInfo (com.hubspot.singularity.CrashLoopInfo)14 SingularityDeployStatistics (com.hubspot.singularity.SingularityDeployStatistics)10 Test (org.junit.jupiter.api.Test)9 Inject (com.google.inject.Inject)3 SingularityTask (com.hubspot.singularity.SingularityTask)3 SingularityTaskCleanup (com.hubspot.singularity.SingularityTaskCleanup)3 List (java.util.List)3 Optional (java.util.Optional)3 CrashLoopType (com.hubspot.singularity.CrashLoopType)2 SingularityRequestWithState (com.hubspot.singularity.SingularityRequestWithState)2 TaskFailureType (com.hubspot.singularity.TaskFailureType)2 SingularityConfiguration (com.hubspot.singularity.config.SingularityConfiguration)2 DeployManager (com.hubspot.singularity.data.DeployManager)2 RequestManager (com.hubspot.singularity.data.RequestManager)2 Comparator (java.util.Comparator)2 Map (java.util.Map)2 Collectors (java.util.stream.Collectors)2 Singleton (javax.inject.Singleton)2 Logger (org.slf4j.Logger)2 LoggerFactory (org.slf4j.LoggerFactory)2