use of com.hubspot.singularity.CrashLoopInfo in project Singularity by HubSpot.
the class SingularityCrashLoopChecker method checkCooldowns.
/**
 * Runs one pass over cooldown requests and then over active requests' crash loops.
 *
 * <p>Phase one submits every cooldown request to {@code cooldownExecutor}; each task runs
 * {@code checkCooldown} under that request's lock at LOW priority and the method blocks until
 * all of them have completed.
 *
 * <p>Phase two walks the active requests. For each one with an active deploy it deletes stored
 * crash loops that belong to a different deploy, trims ended crash loops down to the ten most
 * recently started, and then reconciles the still-open stored loops against the loops currently
 * reported by {@code crashLoops.getActiveCrashLoops}: unmatched new loops are saved, and stored
 * open loops that are no longer reported are re-saved with an end timestamp.
 */
public void checkCooldowns() {
  final long startedAt = System.currentTimeMillis();

  // cooldown reserved for fast loop, check crash loops separately
  final List<SingularityRequestWithState> inCooldown = Lists.newArrayList(requestManager.getCooldownRequests(false));
  final AtomicInteger exitedCount = new AtomicInteger(0);
  // Shared between both phases so deploy statistics are fetched at most once per deploy key.
  final Map<SingularityDeployKey, Optional<SingularityDeployStatistics>> statsByDeploy = new ConcurrentHashMap<>();

  if (!inCooldown.isEmpty()) {
    CompletableFutures
      .allOf(
        inCooldown
          .stream()
          .map(
            candidate ->
              CompletableFuture.runAsync(
                () ->
                  lock.runWithRequestLock(
                    () -> {
                      if (checkCooldown(candidate, statsByDeploy)) {
                        exitedCount.getAndIncrement();
                      }
                    },
                    candidate.getRequest().getId(),
                    getClass().getSimpleName(),
                    SingularitySchedulerLock.Priority.LOW
                  ),
                cooldownExecutor
              )
          )
          .collect(Collectors.toList())
      )
      .join();
  }

  // Check for crash loops
  for (SingularityRequestWithState activeRequest : requestManager.getActiveRequests()) {
    final String requestId = activeRequest.getRequest().getId();

    Optional<SingularityRequestDeployState> deployState = deployManager.getRequestDeployState(requestId);
    if (!(deployState.isPresent() && deployState.get().getActiveDeploy().isPresent())) {
      continue;
    }
    final String activeDeployId = deployState.get().getActiveDeploy().get().getDeployId();

    // Remove outdated loops on new deploy
    List<CrashLoopInfo> history = Lists.newArrayList();
    for (CrashLoopInfo stored : requestManager.getCrashLoopsForRequest(requestId)) {
      if (stored.getDeployId().equals(activeDeployId)) {
        history.add(stored);
      } else {
        requestManager.deleteCrashLoop(stored);
      }
    }

    // Only keep the most recent 10 crash loop infos
    history
      .stream()
      .filter(entry -> entry.getEnd().isPresent())
      .sorted(Comparator.comparingLong(CrashLoopInfo::getStart).reversed())
      .skip(10)
      .forEach(requestManager::deleteCrashLoop);

    // Loops without an end timestamp are the ones recorded as still in progress.
    List<CrashLoopInfo> stillOpen = history
      .stream()
      .filter(entry -> !entry.getEnd().isPresent())
      .collect(Collectors.toList());

    Optional<SingularityDeployStatistics> deployStats = statsByDeploy.computeIfAbsent(
      new SingularityDeployKey(requestId, activeDeployId),
      key -> deployManager.getDeployStatistics(requestId, activeDeployId)
    );
    if (!deployStats.isPresent()) {
      continue;
    }

    List<CrashLoopInfo> detected = crashLoops.getActiveCrashLoops(deployStats.get());

    // Persist loops that are reported now but have no matching open record.
    for (CrashLoopInfo candidate : detected) {
      if (stillOpen.stream().noneMatch(candidate::matches)) {
        LOG.info("New crash loop for {}: {}", requestId, candidate);
        requestManager.saveCrashLoop(candidate);
      }
    }

    // Close out open records that are no longer reported.
    for (CrashLoopInfo open : stillOpen) {
      if (detected.stream().noneMatch(open::matches)) {
        LOG.info("Crash loop resolved for {}: {}", requestId, open);
        requestManager.saveCrashLoop(
          new CrashLoopInfo(
            open.getRequestId(),
            open.getDeployId(),
            open.getStart(),
            Optional.of(System.currentTimeMillis()),
            open.getType()
          )
        );
      }
    }
  }

  LOG.info(
    "{} out of {} cooldown requests exited cooldown in {}",
    exitedCount.get(),
    inCooldown.size(),
    JavaUtils.duration(startedAt)
  );
}
use of com.hubspot.singularity.CrashLoopInfo in project Singularity by HubSpot.
the class SingularityCrashLoops method getActiveCrashLoops.
/**
 * Evaluates the task failure events recorded for a deploy and returns every crash loop that is
 * currently detected. Each returned {@link CrashLoopInfo} has an empty end timestamp.
 *
 * <p>An empty list is returned without further evaluation when the deploy has no failure events,
 * when the deploy is the request's pending deploy, or when the request cannot be found.
 *
 * <p>Checks are run in this order: fast failure (cooldown), startup failure (two variants),
 * OOM count, single-instance bucketed failures, multi-instance bucketed failures and, for
 * long-running requests only, slow failures and unexpected exits.
 *
 * @param deployStatistics statistics (including task failure events) for the deploy to evaluate
 * @return the crash loops detected right now; possibly empty, never {@code null}
 */
List<CrashLoopInfo> getActiveCrashLoops(SingularityDeployStatistics deployStatistics) {
  List<CrashLoopInfo> active = new ArrayList<>();

  if (deployStatistics.getTaskFailureEvents().isEmpty()) {
    return active;
  }

  Optional<SingularityPendingDeploy> maybePending = deployManager.getPendingDeploy(deployStatistics.getRequestId());
  if (maybePending.isPresent() && maybePending.get().getDeployMarker().getDeployId().equals(deployStatistics.getDeployId())) {
    LOG.debug("Not checking cooldown for pending deploy {} - {}", deployStatistics.getRequestId(), deployStatistics.getDeployId());
    return active;
  }

  Optional<SingularityRequestWithState> maybeRequest = requestManager.getRequest(deployStatistics.getRequestId());
  if (!maybeRequest.isPresent()) {
    return active;
  }

  long now = System.currentTimeMillis();

  // Check fast failures
  Optional<Long> maybeCooldownStart = cooldownStart(deployStatistics, Optional.empty());
  if (maybeCooldownStart.isPresent()) {
    active.add(new CrashLoopInfo(deployStatistics.getRequestId(), deployStatistics.getDeployId(), maybeCooldownStart.get(), Optional.empty(), CrashLoopType.FAST_FAILURE_LOOP));
  }

  /*
   * Startup failure loop
   * a) small count of failures but instance num matches one that is in cleaning state waiting for a replacement
   */
  // Latest cleanup timestamp per instance number, restricted to this request and deploy.
  Map<Integer, Long> taskCleanStartTimes = taskManager
    .getCleanupTasks()
    .stream()
    .filter(t -> t.getTaskId().getRequestId().equals(deployStatistics.getRequestId()) && t.getTaskId().getDeployId().equals(deployStatistics.getDeployId()))
    .collect(Collectors.toMap(t -> t.getTaskId().getInstanceNo(), SingularityTaskCleanup::getTimestamp, Math::max));
  // Startup failure timestamps per instance, only for instances that have a cleanup entry.
  Map<Integer, List<Long>> recentStartupFailures = deployStatistics
    .getTaskFailureEvents()
    .stream()
    .filter(e -> e.getType() == TaskFailureType.STARTUP_FAILURE && taskCleanStartTimes.containsKey(e.getInstance()))
    .collect(Collectors.groupingBy(TaskFailureEvent::getInstance, Collectors.mapping(TaskFailureEvent::getTimestamp, Collectors.toList())));

  boolean hasStartupFailure = false;
  for (Map.Entry<Integer, List<Long>> entry : recentStartupFailures.entrySet()) {
    if (taskCleanStartTimes.containsKey(entry.getKey())) {
      // More than two startup failures after the cleanup began for this instance.
      if (entry.getValue().stream().filter(t -> t > taskCleanStartTimes.get(entry.getKey())).count() > 2) {
        active.add(new CrashLoopInfo(deployStatistics.getRequestId(), deployStatistics.getDeployId(), entry.getValue().stream().min(Comparator.comparingLong(Long::longValue)).get(), Optional.empty(), CrashLoopType.STARTUP_FAILURE_LOOP));
        hasStartupFailure = true;
        break;
      }
    }
  }

  /*
   * Startup failure loop
   * b) multiple instances failing healthchecks too many times in X minutes
   */
  // Only evaluate variant (b) when variant (a) did not already report a startup failure loop;
  // otherwise the same STARTUP_FAILURE_LOOP would be added twice and (b) could never fire alone.
  if (!hasStartupFailure) {
    long startupFailThreshold = now - TimeUnit.MINUTES.toMillis(configuration.getEvaluateStartupLoopOverMinutes());
    List<Long> recentStartupFailTimestamps = recentStartupFailures
      .values()
      .stream()
      .flatMap(List::stream)
      .filter(t -> t > startupFailThreshold)
      .collect(Collectors.toList());
    if (recentStartupFailTimestamps.size() > configuration.getStartupFailureThreshold()) {
      active.add(new CrashLoopInfo(deployStatistics.getRequestId(), deployStatistics.getDeployId(), recentStartupFailTimestamps.stream().min(Comparator.comparingLong(Long::longValue)).get(), Optional.empty(), CrashLoopType.STARTUP_FAILURE_LOOP));
    }
  }

  /*
   * OOM Danger. > X OOMs in Y minutes across all instances
   */
  long thresholdOomTime = now - TimeUnit.MINUTES.toMillis(configuration.getEvaluateOomsOverMinutes());
  List<Long> oomFailures = deployStatistics
    .getTaskFailureEvents()
    .stream()
    .filter(e -> e.getType() == TaskFailureType.OOM && e.getTimestamp() > thresholdOomTime)
    .map(TaskFailureEvent::getTimestamp)
    .collect(Collectors.toList());
  if (oomFailures.size() >= configuration.getOomFailureThreshold()) {
    active.add(new CrashLoopInfo(deployStatistics.getRequestId(), deployStatistics.getDeployId(), oomFailures.stream().min(Comparator.comparingLong(Long::longValue)).get(), Optional.empty(), CrashLoopType.OOM));
  }

  /*
   * Single instance failure. > X failures with same instance no in X minutes, bucketed to avoid counting fast failure as one of these
   * Multi instance failure. > X% of instances failing within Y minutes
   */
  Map<Integer, List<Long>> recentFailuresByInstance = deployStatistics
    .getTaskFailureEvents()
    .stream()
    .filter(e -> e.getType() == TaskFailureType.OOM || e.getType() == TaskFailureType.BAD_EXIT_CODE || e.getType() == TaskFailureType.OUT_OF_DISK_SPACE)
    .collect(Collectors.groupingBy(TaskFailureEvent::getInstance, Collectors.mapping(TaskFailureEvent::getTimestamp, Collectors.toList())));

  // Report at most one single-instance loop: stop at the first instance that qualifies.
  for (Map.Entry<Integer, List<Long>> entry : recentFailuresByInstance.entrySet()) {
    Optional<Long> maybeCrashStart = getStartForFailuresInBuckets(now, entry.getValue(), TimeUnit.MINUTES.toMillis(configuration.getSingleInstanceFailureBucketSizeMinutes()), configuration.getSingleInstanceFailureBuckets(), configuration.getSingleInstanceFailureThreshold(), configuration.getSingleInstanceMinBucketIndexPercent());
    if (maybeCrashStart.isPresent()) {
      active.add(new CrashLoopInfo(deployStatistics.getRequestId(), deployStatistics.getDeployId(), maybeCrashStart.get(), Optional.empty(), CrashLoopType.SINGLE_INSTANCE_FAILURE_LOOP));
      break;
    }
  }

  Optional<Long> maybeMultiCrashStart = getStartForFailuresInBuckets(now, recentFailuresByInstance.values().stream().flatMap(List::stream).collect(Collectors.toList()), TimeUnit.MINUTES.toMillis(configuration.getMultiInstanceFailureBucketSizeMinutes()), configuration.getMultiInstanceFailureBuckets(), configuration.getMultiInstanceFailureThreshold(), configuration.getMultiInstanceMinBucketIndexPercent());
  // A multi-instance loop requires failures on more than one distinct instance number.
  if (recentFailuresByInstance.size() > 1 && maybeMultiCrashStart.isPresent()) {
    active.add(new CrashLoopInfo(deployStatistics.getRequestId(), deployStatistics.getDeployId(), maybeMultiCrashStart.get(), Optional.empty(), CrashLoopType.MULTI_INSTANCE_FAILURE));
  }

  if (maybeRequest.get().getRequest().isLongRunning()) {
    /*
     * Slow failures. Occasional failures, count on order of hours, looking for consistency in non-zero count each hour
     */
    getStartForFailuresInBuckets(now, recentFailuresByInstance, TimeUnit.MINUTES.toMillis(configuration.getSlowFailureBucketSizeMinutes()), configuration.getSlowFailureBuckets(), configuration.getSlowFailureThreshold(), configuration.getSlowFailureMinBucketIndexPercent())
      .ifPresent(start -> active.add(new CrashLoopInfo(deployStatistics.getRequestId(), deployStatistics.getDeployId(), start, Optional.empty(), CrashLoopType.SLOW_FAILURES)));

    getUnexpectedExitLoop(now, deployStatistics).ifPresent(active::add);
  }

  return active;
}
use of com.hubspot.singularity.CrashLoopInfo in project Singularity by HubSpot.
the class SingularityWebhookSender method checkCrashLoopUpdates.
/**
 * Dispatches an asynchronous webhook call for every crash loop update queued for the given hook.
 *
 * @param webhook the hook whose queued crash loop updates should be sent
 * @param webhookFutures list that receives one future per dispatched update
 * @return the number of crash loop updates that were queued for this hook
 */
private int checkCrashLoopUpdates(SingularityWebhook webhook, List<CompletableFuture<Response>> webhookFutures) {
  final List<CrashLoopInfo> queued = webhookManager.getQueuedCrashLoopUpdatesForHook(webhook.getId());

  // NOTE(review): this counter is never incremented, so shouldDeleteUpdateOnFailure always
  // receives 0 as its first argument — confirm that is intended.
  int updateCount = 0;

  for (CrashLoopInfo update : queued) {
    String targetUri = applyPlaceholders(webhook.getUri(), update);
    webhookFutures.add(
      webhookSemaphore.call(
        () ->
          executeWebhookAsync(
            targetUri,
            update,
            new SingularityCrashLoopWebhookAsyncHandler(
              webhookManager,
              webhook,
              update,
              // Use the loop's end time when it has one, otherwise its start time.
              shouldDeleteUpdateOnFailure(updateCount, update.getEnd().orElse(update.getStart()))
            )
          )
      )
    );
  }

  return queued.size();
}
use of com.hubspot.singularity.CrashLoopInfo in project Singularity by HubSpot.
the class SnsWebhookRetryer method checkWebhooks.
/**
 * Re-sends every webhook update that is waiting for a retry and removes it from the retry queue.
 *
 * <p>Task, deploy, request and crash loop updates are processed in that order. Each update is
 * deleted from its retry queue right after the send call returns; a task update whose task can
 * no longer be found is deleted without being sent.
 */
public void checkWebhooks() {
  webhookManager
    .getTaskUpdatesToRetry()
    .forEach(
      pendingTaskUpdate -> {
        taskHistoryHelper
          .getTask(pendingTaskUpdate.getTaskId())
          .map(found -> new SingularityTaskWebhook(found, pendingTaskUpdate))
          .ifPresent(payload -> snsWebhookManager.taskWebhook(payload));
        webhookManager.deleteTaskUpdateForRetry(pendingTaskUpdate);
      }
    );

  webhookManager
    .getDeployUpdatesToRetry()
    .forEach(
      pendingDeployUpdate -> {
        snsWebhookManager.deployHistoryEvent(pendingDeployUpdate);
        webhookManager.deleteDeployUpdateForRetry(pendingDeployUpdate);
      }
    );

  webhookManager
    .getRequestUpdatesToRetry()
    .forEach(
      pendingRequestUpdate -> {
        snsWebhookManager.requestHistoryEvent(pendingRequestUpdate);
        webhookManager.deleteRequestUpdateForRetry(pendingRequestUpdate);
      }
    );

  webhookManager
    .getCrashLoopUpdatesToRetry()
    .forEach(
      pendingCrashLoopUpdate -> {
        snsWebhookManager.crashLoopEvent(pendingCrashLoopUpdate);
        webhookManager.deleteCrashLoopUpdateForRetry(pendingCrashLoopUpdate);
      }
    );
}
use of com.hubspot.singularity.CrashLoopInfo in project Singularity by HubSpot.
the class SingularityCrashLoopTest method itDetectsSlowConsistentFailureLoops.
/**
 * Records BAD_EXIT_CODE failures for instance 1 at 30-minute intervals (with one interval
 * missing) and expects exactly one detected crash loop, of type SLOW_FAILURES.
 */
@Test
public void itDetectsSlowConsistentFailureLoops() {
  initRequestWithType(RequestType.WORKER, false);
  initFirstDeploy();

  final long now = System.currentTimeMillis();

  // Minutes before "now" at which a failure is recorded; 240 is deliberately left out (skip one).
  final long[] failureOffsetsMinutes = { 30, 60, 90, 120, 150, 180, 210, 270, 300, 330, 360, 390 };
  for (long minutesAgo : failureOffsetsMinutes) {
    createTaskFailure(1, now - TimeUnit.MINUTES.toMillis(minutesAgo), TaskFailureType.BAD_EXIT_CODE);
  }

  SingularityDeployStatistics stats = deployManager.getDeployStatistics(requestId, firstDeployId).get();
  List<CrashLoopInfo> detected = crashLoops.getActiveCrashLoops(stats);

  Assertions.assertEquals(1, detected.size());
  Assertions.assertEquals(CrashLoopType.SLOW_FAILURES, Iterables.getOnlyElement(detected).getType());
}
Aggregations