Use of com.hubspot.singularity.TaskFailureEvent in project Singularity by HubSpot: the class SingularityCrashLoops, method getActiveCrashLoops, which scans a deploy's recent TaskFailureEvents and returns any crash loops that are currently active.
List<CrashLoopInfo> getActiveCrashLoops(SingularityDeployStatistics deployStatistics) {
  List<CrashLoopInfo> active = new ArrayList<>();
  if (deployStatistics.getTaskFailureEvents().isEmpty()) {
    return active;
  }
  Optional<SingularityPendingDeploy> maybePending = deployManager.getPendingDeploy(deployStatistics.getRequestId());
  if (maybePending.isPresent() && maybePending.get().getDeployMarker().getDeployId().equals(deployStatistics.getDeployId())) {
    LOG.debug("Not checking cooldown for pending deploy {} - {}", deployStatistics.getRequestId(), deployStatistics.getDeployId());
    return active;
  }
  Optional<SingularityRequestWithState> maybeRequest = requestManager.getRequest(deployStatistics.getRequestId());
  if (!maybeRequest.isPresent()) {
    return active;
  }
  long now = System.currentTimeMillis();

  // Check fast failures
  Optional<Long> maybeCooldownStart = cooldownStart(deployStatistics, Optional.empty());
  if (maybeCooldownStart.isPresent()) {
    active.add(new CrashLoopInfo(
        deployStatistics.getRequestId(),
        deployStatistics.getDeployId(),
        maybeCooldownStart.get(),
        Optional.empty(),
        CrashLoopType.FAST_FAILURE_LOOP));
  }
  /*
   * Startup failure loop
   * a) small count of failures, but the instance number matches one that is in a cleaning state waiting for a replacement
   */
  Map<Integer, Long> taskCleanStartTimes = taskManager.getCleanupTasks().stream()
      .filter(t -> t.getTaskId().getRequestId().equals(deployStatistics.getRequestId())
          && t.getTaskId().getDeployId().equals(deployStatistics.getDeployId()))
      .collect(Collectors.toMap(t -> t.getTaskId().getInstanceNo(), SingularityTaskCleanup::getTimestamp, Math::max));
  Map<Integer, List<Long>> recentStartupFailures = deployStatistics.getTaskFailureEvents().stream()
      .filter(e -> e.getType() == TaskFailureType.STARTUP_FAILURE && taskCleanStartTimes.containsKey(e.getInstance()))
      .collect(Collectors.groupingBy(TaskFailureEvent::getInstance,
          Collectors.mapping(TaskFailureEvent::getTimestamp, Collectors.toList())));
  boolean hasStartupFailure = false;
  for (Map.Entry<Integer, List<Long>> entry : recentStartupFailures.entrySet()) {
    if (taskCleanStartTimes.containsKey(entry.getKey())) {
      if (entry.getValue().stream().filter(t -> t > taskCleanStartTimes.get(entry.getKey())).count() > 2) {
        active.add(new CrashLoopInfo(
            deployStatistics.getRequestId(),
            deployStatistics.getDeployId(),
            entry.getValue().stream().min(Comparator.comparingLong(Long::longValue)).get(),
            Optional.empty(),
            CrashLoopType.STARTUP_FAILURE_LOOP));
        hasStartupFailure = true;
        break;
      }
    }
  }
  /*
   * Startup failure loop
   * b) multiple instances failing healthchecks too many times in X minutes
   */
  if (!hasStartupFailure) { // only run the broader check when case a) found nothing
    long startupFailThreshold = now - TimeUnit.MINUTES.toMillis(configuration.getEvaluateStartupLoopOverMinutes());
    List<Long> recentStartupFailTimestamps = recentStartupFailures.values().stream()
        .flatMap(List::stream)
        .filter(t -> t > startupFailThreshold)
        .collect(Collectors.toList());
    if (recentStartupFailTimestamps.size() > configuration.getStartupFailureThreshold()) {
      active.add(new CrashLoopInfo(
          deployStatistics.getRequestId(),
          deployStatistics.getDeployId(),
          recentStartupFailTimestamps.stream().min(Comparator.comparingLong(Long::longValue)).get(),
          Optional.empty(),
          CrashLoopType.STARTUP_FAILURE_LOOP));
    }
  }
  /*
   * OOM danger: at least X OOMs in Y minutes across all instances
   */
  long thresholdOomTime = now - TimeUnit.MINUTES.toMillis(configuration.getEvaluateOomsOverMinutes());
  List<Long> oomFailures = deployStatistics.getTaskFailureEvents().stream()
      .filter(e -> e.getType() == TaskFailureType.OOM && e.getTimestamp() > thresholdOomTime)
      .map(TaskFailureEvent::getTimestamp)
      .collect(Collectors.toList());
  if (oomFailures.size() >= configuration.getOomFailureThreshold()) {
    active.add(new CrashLoopInfo(
        deployStatistics.getRequestId(),
        deployStatistics.getDeployId(),
        oomFailures.stream().min(Comparator.comparingLong(Long::longValue)).get(),
        Optional.empty(),
        CrashLoopType.OOM));
  }
  /*
   * Single instance failure: more than X failures from the same instance number in Y minutes,
   * bucketed to avoid counting a fast failure as one of these.
   * Multi instance failure: more than X% of instances failing within Y minutes.
   */
  Map<Integer, List<Long>> recentFailuresByInstance = deployStatistics.getTaskFailureEvents().stream()
      .filter(e -> e.getType() == TaskFailureType.OOM
          || e.getType() == TaskFailureType.BAD_EXIT_CODE
          || e.getType() == TaskFailureType.OUT_OF_DISK_SPACE)
      .collect(Collectors.groupingBy(TaskFailureEvent::getInstance,
          Collectors.mapping(TaskFailureEvent::getTimestamp, Collectors.toList())));
  for (Map.Entry<Integer, List<Long>> entry : recentFailuresByInstance.entrySet()) {
    Optional<Long> maybeCrashStart = getStartForFailuresInBuckets(
        now,
        entry.getValue(),
        TimeUnit.MINUTES.toMillis(configuration.getSingleInstanceFailureBucketSizeMinutes()),
        configuration.getSingleInstanceFailureBuckets(),
        configuration.getSingleInstanceFailureThreshold(),
        configuration.getSingleInstanceMinBucketIndexPercent());
    if (maybeCrashStart.isPresent()) {
      active.add(new CrashLoopInfo(
          deployStatistics.getRequestId(),
          deployStatistics.getDeployId(),
          maybeCrashStart.get(),
          Optional.empty(),
          CrashLoopType.SINGLE_INSTANCE_FAILURE_LOOP));
      break;
    }
  }
  Optional<Long> maybeMultiCrashStart = getStartForFailuresInBuckets(
      now,
      recentFailuresByInstance.values().stream().flatMap(List::stream).collect(Collectors.toList()),
      TimeUnit.MINUTES.toMillis(configuration.getMultiInstanceFailureBucketSizeMinutes()),
      configuration.getMultiInstanceFailureBuckets(),
      configuration.getMultiInstanceFailureThreshold(),
      configuration.getMultiInstanceMinBucketIndexPercent());
  if (recentFailuresByInstance.size() > 1 && maybeMultiCrashStart.isPresent()) {
    active.add(new CrashLoopInfo(
        deployStatistics.getRequestId(),
        deployStatistics.getDeployId(),
        maybeMultiCrashStart.get(),
        Optional.empty(),
        CrashLoopType.MULTI_INSTANCE_FAILURE));
  }
  if (maybeRequest.get().getRequest().isLongRunning()) {
    /*
     * Slow failures: occasional failures, counted on the order of hours, looking for a consistent non-zero count each hour
     */
    getStartForFailuresInBuckets(
        now,
        // flattened to a single list of timestamps, matching the other call sites of this helper
        recentFailuresByInstance.values().stream().flatMap(List::stream).collect(Collectors.toList()),
        TimeUnit.MINUTES.toMillis(configuration.getSlowFailureBucketSizeMinutes()),
        configuration.getSlowFailureBuckets(),
        configuration.getSlowFailureThreshold(),
        configuration.getSlowFailureMinBucketIndexPercent())
        .ifPresent(start -> active.add(new CrashLoopInfo(
            deployStatistics.getRequestId(),
            deployStatistics.getDeployId(),
            start,
            Optional.empty(),
            CrashLoopType.SLOW_FAILURES)));
    getUnexpectedExitLoop(now, deployStatistics).ifPresent(active::add);
  }
  return active;
}
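The bucketing helper getStartForFailuresInBuckets is not shown on this page. As a rough illustration of the technique the method above relies on (grouping failure timestamps into fixed-size time buckets ending at "now" and flagging a loop when failures are both numerous and spread across enough buckets), here is a minimal, self-contained sketch. The parameter names mirror the call sites above, but the body and the threshold semantics are assumptions for illustration, not Singularity's implementation.

import java.util.Arrays;
import java.util.List;
import java.util.Optional;

class BucketedFailureCheck {
  // Hypothetical reconstruction: count failures into fixed-size time buckets
  // ending at `now`. If the total count crosses `failureThreshold` and enough
  // distinct buckets are occupied (`minBucketIndexPercent` of them), report the
  // earliest in-window failure as the start of the loop.
  static Optional<Long> getStartForFailuresInBuckets(
      long now,
      List<Long> failureTimestamps,
      long bucketSizeMillis,
      int numberOfBuckets,
      int failureThreshold,
      double minBucketIndexPercent) {
    int[] buckets = new int[numberOfBuckets];
    long earliest = Long.MAX_VALUE;
    for (long timestamp : failureTimestamps) {
      int bucket = (int) ((now - timestamp) / bucketSizeMillis);
      if (bucket >= 0 && bucket < numberOfBuckets) { // ignore failures outside the window
        buckets[bucket]++;
        earliest = Math.min(earliest, timestamp);
      }
    }
    int total = Arrays.stream(buckets).sum();
    long occupied = Arrays.stream(buckets).filter(c -> c > 0).count();
    boolean spreadEnough = occupied >= Math.ceil(numberOfBuckets * minBucketIndexPercent);
    return (total > failureThreshold && spreadEnough) ? Optional.of(earliest) : Optional.empty();
  }

  public static void main(String[] args) {
    long now = System.currentTimeMillis();
    // Three failures spread over the last three 10-minute buckets of a 1-hour window.
    List<Long> failures = Arrays.asList(now - 5_000, now - 700_000, now - 1_300_000);
    // 10-minute buckets, 6 buckets, threshold 2, at least half the buckets occupied.
    System.out.println(getStartForFailuresInBuckets(now, failures, 600_000L, 6, 2, 0.5));
  }
}

Requiring the occupied-bucket spread is what distinguishes a sustained loop from a single burst: a fast-failure spike lands in one bucket, while a crash loop keeps producing failures bucket after bucket.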
Use of com.hubspot.singularity.TaskFailureEvent in project Singularity by HubSpot: the class SingularityScheduler, method updateDeployStatistics, which classifies each task outcome, records failures as TaskFailureEvents, and rolls them into the deploy's statistics.
private void updateDeployStatistics(
    SingularityDeployStatistics deployStatistics,
    SingularityTaskId taskId,
    Optional<SingularityTask> task,
    long timestamp,
    ExtendedTaskState state,
    Optional<PendingType> scheduleResult,
    Protos.TaskStatus status) {
  SingularityDeployStatisticsBuilder bldr = deployStatistics.toBuilder();
  if (!state.isFailed()) {
    // Rolling average: weight the stored mean by the task count, then fold in the newest runtime
    if (bldr.getAverageRuntimeMillis().isPresent()) {
      long newAvgRuntimeMillis = (bldr.getAverageRuntimeMillis().get() * bldr.getNumTasks() + (timestamp - taskId.getStartedAt())) / (bldr.getNumTasks() + 1);
      bldr.setAverageRuntimeMillis(Optional.of(newAvgRuntimeMillis));
    } else {
      bldr.setAverageRuntimeMillis(Optional.of(timestamp - taskId.getStartedAt()));
    }
  }
  if (task.isPresent()) {
    long dueTime = task.get().getTaskRequest().getPendingTask().getPendingTaskId().getNextRunAt();
    long startedAt = taskId.getStartedAt();
    // Same rolling-average update for the delay between when a task was due and when it started
    if (bldr.getAverageSchedulingDelayMillis().isPresent()) {
      long newAverageSchedulingDelayMillis = (bldr.getAverageSchedulingDelayMillis().get() * bldr.getNumTasks() + (startedAt - dueTime)) / (bldr.getNumTasks() + 1);
      bldr.setAverageSchedulingDelayMillis(Optional.of(newAverageSchedulingDelayMillis));
    } else {
      bldr.setAverageSchedulingDelayMillis(Optional.of(startedAt - dueTime));
    }
  }
  bldr.setNumTasks(bldr.getNumTasks() + 1);
  if (!bldr.getLastFinishAt().isPresent() || timestamp > bldr.getLastFinishAt().get()) {
    bldr.setLastFinishAt(Optional.of(timestamp));
    bldr.setLastTaskState(Optional.of(state));
  }
  // A long-running task is never expected to finish, so a clean TASK_FINISHED counts as an unexpected exit
  if (task.isPresent() && task.get().getTaskRequest().getRequest().isLongRunning() && state == ExtendedTaskState.TASK_FINISHED) {
    bldr.addTaskFailureEvent(new TaskFailureEvent(taskId.getInstanceNo(), timestamp, TaskFailureType.UNEXPECTED_EXIT));
  }
  // Kills triggered because a replacement task never became healthy count as startup failures
  if (state == ExtendedTaskState.TASK_KILLED) {
    if (status.hasMessage()) {
      Optional<TaskCleanupType> maybeCleanupType = getCleanupType(taskId, status.getMessage());
      if (maybeCleanupType.isPresent() && (maybeCleanupType.get() == TaskCleanupType.OVERDUE_NEW_TASK || maybeCleanupType.get() == TaskCleanupType.UNHEALTHY_NEW_TASK)) {
        bldr.addTaskFailureEvent(new TaskFailureEvent(taskId.getInstanceNo(), timestamp, TaskFailureType.STARTUP_FAILURE));
      }
    }
  }
  if (!state.isSuccess()) {
    if (SingularityTaskHistoryUpdate.getUpdate(taskManager.getTaskHistoryUpdates(taskId), ExtendedTaskState.TASK_CLEANING).isPresent()) {
      LOG.debug("{} failed with {} after cleaning - ignoring it for cooldown/crash loop", taskId, state);
    } else {
      if (state.isFailed()) {
        // Classify the failure from the Mesos status message/reason
        if ((status.hasMessage() && status.getMessage().contains("Memory limit exceeded")) || (status.hasReason() && status.getReason() == Reason.REASON_CONTAINER_LIMITATION_MEMORY)) {
          bldr.addTaskFailureEvent(new TaskFailureEvent(taskId.getInstanceNo(), timestamp, TaskFailureType.OOM));
        } else if (status.hasReason() && status.getReason() == Reason.REASON_CONTAINER_LIMITATION_DISK) {
          bldr.addTaskFailureEvent(new TaskFailureEvent(taskId.getInstanceNo(), timestamp, TaskFailureType.OUT_OF_DISK_SPACE));
        } else {
          bldr.addTaskFailureEvent(new TaskFailureEvent(taskId.getInstanceNo(), timestamp, TaskFailureType.BAD_EXIT_CODE));
        }
      }
      if (state == ExtendedTaskState.TASK_LOST && status.hasReason()) {
        if (isMesosError(status.getReason())) {
          bldr.addTaskFailureEvent(new TaskFailureEvent(taskId.getInstanceNo(), timestamp, TaskFailureType.MESOS_ERROR));
        } else if (isLostAgent(status.getReason())) {
          bldr.addTaskFailureEvent(new TaskFailureEvent(taskId.getInstanceNo(), timestamp, TaskFailureType.LOST_SLAVE));
        }
      }
      bldr.setNumSuccess(0);
      bldr.setNumFailures(bldr.getNumFailures() + 1);
    }
  } else {
    bldr.setNumSuccess(bldr.getNumSuccess() + 1);
    bldr.setNumFailures(0);
  }
  if (scheduleResult.isPresent() && scheduleResult.get() == PendingType.RETRY) {
    bldr.setNumSequentialRetries(bldr.getNumSequentialRetries() + 1);
  } else {
    bldr.setNumSequentialRetries(0);
  }
  // Keep only the most recent failure events so the stored statistics stay bounded
  bldr.trimTaskFailureEvents(50);
  final SingularityDeployStatistics newStatistics = bldr.build();
  LOG.trace("Saving new deploy statistics {}", newStatistics);
  deployManager.saveDeployStatistics(newStatistics);
}
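Both averages above use the same constant-space update: the stored mean is weighted by the current task count and the newest sample is folded in, newAvg = (avg * n + sample) / (n + 1), so no per-task history needs to be kept. A standalone illustration of that update (class and method names here are ours, not Singularity's):

class RollingAverage {
  private long average = 0;
  private long count = 0;

  // Fold one new sample into the mean without storing past samples,
  // mirroring the builder updates in updateDeployStatistics above.
  void add(long sample) {
    average = (average * count + sample) / (count + 1);
    count++;
  }

  public static void main(String[] args) {
    RollingAverage runtime = new RollingAverage();
    runtime.add(1000);
    runtime.add(2000);
    runtime.add(6000);
    System.out.println(runtime.average); // 3000; integer division, as in the builder code
  }
}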