Usage of com.hubspot.singularity.SingularityDeployStatistics in the HubSpot Singularity project:
the SingularityCrashLoopTest class, method itDetectsFastFailureLoopsForNonLongRunning.
// Verifies that a burst of recent bad-exit failures on a non-long-running
// (ON_DEMAND) request is reported as exactly one FAST_FAILURE_LOOP crash loop.
@Test
public void itDetectsFastFailureLoopsForNonLongRunning() {
  initRequestWithType(RequestType.ON_DEMAND, false);
  initFirstDeploy();
  long referenceTime = System.currentTimeMillis();
  // Five BAD_EXIT_CODE failures on instance 1, all within the last 45 seconds.
  for (long millisAgo : new long[] {1000, 10000, 20000, 30000, 45000}) {
    createTaskFailure(1, referenceTime - millisAgo, TaskFailureType.BAD_EXIT_CODE);
  }
  SingularityDeployStatistics stats = deployManager
    .getDeployStatistics(requestId, firstDeployId)
    .get();
  List<CrashLoopInfo> detected = crashLoops.getActiveCrashLoops(stats);
  Assertions.assertEquals(1, detected.size());
  Assertions.assertEquals(
    CrashLoopType.FAST_FAILURE_LOOP,
    Iterables.getOnlyElement(detected).getType()
  );
}
Usage of com.hubspot.singularity.SingularityDeployStatistics in the HubSpot Singularity project:
the SingularityCrashLoopTest class, method itDoesNotTriggerWhenFailuresAreNotRecentEnough.
// Verifies that failures which satisfy the count threshold but are all too far
// in the past do not activate any crash loop.
@Test
public void itDoesNotTriggerWhenFailuresAreNotRecentEnough() {
  initRequestWithType(RequestType.WORKER, false);
  initFirstDeploy();
  long referenceTime = System.currentTimeMillis();
  // Three failures meet the threshold, but the single-instance fast failure
  // loop also requires the most recent failure to be < ~8 minutes old; all of
  // these are at least 10 minutes stale.
  for (long minutesAgo : new long[] {10, 15, 20}) {
    createTaskFailure(
      1,
      referenceTime - TimeUnit.MINUTES.toMillis(minutesAgo),
      TaskFailureType.BAD_EXIT_CODE
    );
  }
  SingularityDeployStatistics stats = deployManager
    .getDeployStatistics(requestId, firstDeployId)
    .get();
  Assertions.assertTrue(crashLoops.getActiveCrashLoops(stats).isEmpty());
}
Usage of com.hubspot.singularity.SingularityDeployStatistics in the HubSpot Singularity project:
the SingularityCrashLoopTest class, method itDetectsTooManyMultiInstanceFailures.
// Verifies that failures spread across many distinct instances within a short
// window are reported as exactly one MULTI_INSTANCE_FAILURE crash loop.
@Test
public void itDetectsTooManyMultiInstanceFailures() {
  initRequestWithType(RequestType.WORKER, false);
  initFirstDeploy();
  long referenceTime = System.currentTimeMillis();
  // Seven failures across six distinct instances in the last ~16 minutes,
  // mixing bad exits, OOMs, and out-of-disk failures.
  int[] instanceIds = {1, 2, 6, 3, 4, 1, 5};
  long[] minutesAgo = {1, 4, 5, 7, 10, 12, 16};
  TaskFailureType[] failureTypes = {
    TaskFailureType.BAD_EXIT_CODE,
    TaskFailureType.OOM,
    TaskFailureType.OUT_OF_DISK_SPACE,
    TaskFailureType.OUT_OF_DISK_SPACE,
    TaskFailureType.OOM,
    TaskFailureType.OUT_OF_DISK_SPACE,
    TaskFailureType.BAD_EXIT_CODE
  };
  for (int i = 0; i < instanceIds.length; i++) {
    createTaskFailure(
      instanceIds[i],
      referenceTime - TimeUnit.MINUTES.toMillis(minutesAgo[i]),
      failureTypes[i]
    );
  }
  SingularityDeployStatistics stats = deployManager
    .getDeployStatistics(requestId, firstDeployId)
    .get();
  List<CrashLoopInfo> detected = crashLoops.getActiveCrashLoops(stats);
  Assertions.assertEquals(1, detected.size());
  Assertions.assertEquals(
    CrashLoopType.MULTI_INSTANCE_FAILURE,
    Iterables.getOnlyElement(detected).getType()
  );
}
Usage of com.hubspot.singularity.SingularityDeployStatistics in the HubSpot Singularity project:
the SingularityMesosOfferScheduler class, method checkOffers.
// Matches due pending tasks against the current set of Mesos offers, accepting
// the best-scoring offer per task. Returns the offer holders (with any accepted
// tasks attached) for the caller to launch or decline.
//
// Parameters:
//   offers - resource offers keyed by offer id
//   start  - timestamp (ms) when offer processing began, used for timing logs
Collection<SingularityOfferHolder> checkOffers(final Map<String, Offer> offers, long start) {
if (offers.isEmpty()) {
LOG.debug("No offers to check");
return Collections.emptyList();
}
final List<SingularityTaskRequestHolder> sortedTaskRequestHolders = getSortedDueTaskRequests();
final int numDueTasks = sortedTaskRequestHolders.size();
// Group offers by agent id so each agent is represented by a single holder
// built from that agent's first offer (rack, hostname, attributes).
final Map<String, SingularityOfferHolder> offerHolders = offers.values().stream().collect(Collectors.groupingBy(o -> o.getAgentId().getValue())).entrySet().stream().filter(e -> e.getValue().size() > 0).map(e -> {
List<Offer> offersList = e.getValue();
String agentId = e.getKey();
return new SingularityOfferHolder(offersList, numDueTasks, agentAndRackHelper.getRackIdOrDefault(offersList.get(0)), agentId, offersList.get(0).getHostname(), agentAndRackHelper.getTextAttributes(offersList.get(0)), agentAndRackHelper.getReservedAgentAttributes(offersList.get(0)));
}).collect(Collectors.toMap(SingularityOfferHolder::getAgentId, Function.identity()));
// Nothing due: hand the (empty) holders back unchanged so offers can be declined/cached.
if (sortedTaskRequestHolders.isEmpty()) {
return offerHolders.values();
}
final AtomicInteger tasksScheduled = new AtomicInteger(0);
Map<String, RequestUtilization> requestUtilizations = usageManager.getRequestUtilizations(false);
List<SingularityTaskId> activeTaskIds = taskManager.getActiveTaskIds();
Map<String, SingularityAgentUsageWithId> currentUsages = usageManager.getAllCurrentAgentUsage();
// Optionally re-check live agent metrics in parallel. Agents whose stored
// usage looks stale (many tasks started since the snapshot) and whose load is
// over threshold are removed from currentUsages so they are skipped below.
List<CompletableFuture<Void>> currentUsagesFutures = new ArrayList<>();
for (SingularityOfferHolder offerHolder : offerHolders.values()) {
currentUsagesFutures.add(runAsync(() -> {
String agentId = offerHolder.getAgentId();
Optional<SingularityAgentUsageWithId> maybeUsage = Optional.ofNullable(currentUsages.get(agentId));
if (configuration.isReCheckMetricsForLargeNewTaskCount() && maybeUsage.isPresent()) {
long newTaskCount = taskManager.getActiveTaskIds().stream().filter(t -> t.getStartedAt() > maybeUsage.get().getTimestamp() && t.getSanitizedHost().equals(offerHolder.getSanitizedHost())).count();
if (newTaskCount >= maybeUsage.get().getNumTasks() / 2) {
try {
MesosAgentMetricsSnapshotObject metricsSnapshot = usageHelper.getMetricsSnapshot(offerHolder.getHostname());
// NOTE(review): load5 is compared to the "Load1" threshold and load1 to
// the "Load5" threshold here — possibly intentional, but worth confirming.
if (metricsSnapshot.getSystemLoad5Min() / metricsSnapshot.getSystemCpusTotal() > mesosConfiguration.getRecheckMetricsLoad1Threshold() || metricsSnapshot.getSystemLoad1Min() / metricsSnapshot.getSystemCpusTotal() > mesosConfiguration.getRecheckMetricsLoad5Threshold()) {
// Come back to this agent after we have collected more metrics
LOG.info("Skipping evaluation of {} until new metrics are collected. Current load is load1: {}, load5: {}", offerHolder.getHostname(), metricsSnapshot.getSystemLoad1Min(), metricsSnapshot.getSystemLoad5Min());
currentUsages.remove(agentId);
}
} catch (Throwable t) {
// Best-effort: if metrics cannot be fetched, skip the agent this cycle
// rather than scheduling onto it with unknown load.
LOG.warn("Could not check metrics for host {}, skipping", offerHolder.getHostname());
currentUsages.remove(agentId);
}
}
}
}));
}
CompletableFutures.allOf(currentUsagesFutures).join();
// Compute per-agent scores in parallel for agents that still have usage data
// and an offer; results go into a concurrent map keyed by agent id.
List<CompletableFuture<Void>> usagesWithScoresFutures = new ArrayList<>();
Map<String, SingularityAgentUsageWithCalculatedScores> currentUsagesById = new ConcurrentHashMap<>();
for (SingularityAgentUsageWithId usage : currentUsages.values()) {
if (offerHolders.containsKey(usage.getAgentId())) {
usagesWithScoresFutures.add(runAsync(() -> currentUsagesById.put(usage.getAgentId(), new SingularityAgentUsageWithCalculatedScores(usage, mesosConfiguration.getScoreUsingSystemLoad(), getMaxProbableUsageForAgent(activeTaskIds, requestUtilizations, offerHolders.get(usage.getAgentId()).getSanitizedHost()), mesosConfiguration.getLoad5OverloadedThreshold(), mesosConfiguration.getLoad1OverloadedThreshold(), usage.getTimestamp()))));
}
}
CompletableFutures.allOf(usagesWithScoresFutures).join();
long startCheck = System.currentTimeMillis();
LOG.debug("Found agent usages and scores after {}ms", startCheck - start);
Map<SingularityDeployKey, Optional<SingularityDeployStatistics>> deployStatsCache = new ConcurrentHashMap<>();
Set<String> overloadedHosts = Sets.newConcurrentHashSet();
AtomicInteger noMatches = new AtomicInteger();
// We spend much of the offer check loop for request level locks. Wait for the locks in parallel, but ensure that actual offer checks
// are done in serial to not over commit a single offer
ReentrantLock offerCheckTempLock = new ReentrantLock(false);
CompletableFutures.allOf(sortedTaskRequestHolders.stream().collect(Collectors.groupingBy(t -> t.getTaskRequest().getRequest().getId())).entrySet().stream().map(entry -> runAsync(() -> {
lock.tryRunWithRequestLock(() -> {
// Serialize the actual matching under a single lock so two requests cannot
// both consume the same offer's resources.
offerCheckTempLock.lock();
try {
long startRequest = System.currentTimeMillis();
int evaluated = 0;
for (SingularityTaskRequestHolder taskRequestHolder : entry.getValue()) {
long now = System.currentTimeMillis();
// Two circuit breakers: a global budget for the whole offer loop, and a
// per-request budget (only after at least one task has been evaluated).
boolean isOfferLoopTakingTooLong = now - startCheck > mesosConfiguration.getOfferLoopTimeoutMillis();
boolean isRequestInOfferLoopTakingTooLong = (now - startRequest > mesosConfiguration.getOfferLoopRequestTimeoutMillis() && evaluated > 1);
if (isOfferLoopTakingTooLong || isRequestInOfferLoopTakingTooLong) {
LOG.warn("{} is holding the offer lock for too long, skipping remaining {} tasks for scheduling", taskRequestHolder.getTaskRequest().getRequest().getId(), entry.getValue().size() - evaluated);
break;
}
evaluated++;
List<SingularityTaskId> activeTaskIdsForRequest = leaderCache.getActiveTaskIdsForRequest(taskRequestHolder.getTaskRequest().getRequest().getId());
if (isTooManyInstancesForRequest(taskRequestHolder.getTaskRequest(), activeTaskIdsForRequest)) {
LOG.debug("Skipping pending task {}, too many instances already running", taskRequestHolder.getTaskRequest().getPendingTask().getPendingTaskId());
continue;
}
// Score each non-full offer; stop early once a "good enough" score is found.
Map<String, Double> scorePerOffer = new ConcurrentHashMap<>();
for (SingularityOfferHolder offerHolder : offerHolders.values()) {
if (!isOfferFull(offerHolder)) {
if (calculateScore(requestUtilizations, currentUsagesById, taskRequestHolder, scorePerOffer, activeTaskIdsForRequest, offerHolder, deployStatsCache, overloadedHosts) > mesosConfiguration.getGoodEnoughScoreThreshold()) {
break;
}
}
}
if (!scorePerOffer.isEmpty()) {
// Accept the highest-scoring offer and update the agent's usage estimate
// so subsequent tasks in this loop see the committed resources.
SingularityOfferHolder bestOffer = offerHolders.get(Collections.max(scorePerOffer.entrySet(), Map.Entry.comparingByValue()).getKey());
LOG.info("Best offer {}/1 is on {}", scorePerOffer.get(bestOffer.getAgentId()), bestOffer.getSanitizedHost());
acceptTask(bestOffer, taskRequestHolder);
tasksScheduled.getAndIncrement();
updateAgentUsageScores(taskRequestHolder, currentUsagesById, bestOffer.getAgentId(), requestUtilizations);
} else {
noMatches.getAndIncrement();
}
}
} finally {
offerCheckTempLock.unlock();
}
}, entry.getKey(), String.format("%s#%s", getClass().getSimpleName(), "checkOffers"), mesosConfiguration.getOfferLoopRequestTimeoutMillis(), TimeUnit.MILLISECONDS);
})).collect(Collectors.toList())).join();
LOG.info("{} tasks scheduled, {} tasks remaining after examining {} offers ({} overloaded hosts, {} had no offer matches)", tasksScheduled, numDueTasks - tasksScheduled.get(), offers.size(), overloadedHosts.size(), noMatches.get());
return offerHolders.values();
}
Usage of com.hubspot.singularity.SingularityDeployStatistics in the HubSpot Singularity project:
the SingularityScheduler class, method handleCompletedTask.
// Processes a terminal task state: removes the active-task record, queues a
// load-balancer cleanup where applicable, resolves an in-progress bounce if
// this was its last cleaning task, reschedules as needed, and persists updated
// deploy statistics.
//
// Parameters:
//   task      - full task record, if still available
//   taskId    - id of the completed task
//   wasActive - whether the task was still in the active set
//   timestamp - time (ms) the terminal state was observed
//   state     - the terminal Mesos task state
//   taskHistoryUpdateCreateResult - EXISTED means this update was already
//       processed, so rescheduling/statistics work is skipped
//   status    - raw Mesos task status
@Timed
public void handleCompletedTask(Optional<SingularityTask> task, SingularityTaskId taskId, boolean wasActive, long timestamp, ExtendedTaskState state, SingularityCreateResult taskHistoryUpdateCreateResult, Protos.TaskStatus status) {
final SingularityDeployStatistics deployStatistics = getDeployStatistics(taskId.getRequestId(), taskId.getDeployId());
if (wasActive) {
taskManager.deleteActiveTask(taskId.getId());
}
// Queue an LB cleanup when the request is load balanced; when the task record
// is missing we cannot tell, so a cleanup is queued conservatively —
// NOTE(review): presumably the LB cleanup is a safe no-op for non-LB tasks; confirm.
if (!task.isPresent() || task.get().getTaskRequest().getRequest().isLoadBalanced()) {
taskManager.createLBCleanupTask(taskId);
}
if (requestManager.isBouncing(taskId.getRequestId())) {
List<SingularityTaskId> activeTaskIds = taskManager.getActiveTaskIdsForRequest(taskId.getRequestId());
// A bounce is still in progress if any active task is being cleaned for a
// BOUNCE-related reason (checked in its current or previous cleaning updates).
boolean foundBouncingTask = false;
for (SingularityTaskId activeTaskId : activeTaskIds) {
Optional<SingularityTaskHistoryUpdate> maybeCleaningUpdate = taskManager.getTaskHistoryUpdate(activeTaskId, ExtendedTaskState.TASK_CLEANING);
if (maybeCleaningUpdate.isPresent()) {
if (maybeCleaningUpdate.get().getStatusReason().or("").contains("BOUNCE")) {
// TaskCleanupType enum is included in status message
LOG.debug("Found task {} still waiting for bounce to complete", activeTaskId);
foundBouncingTask = true;
break;
} else if (!maybeCleaningUpdate.get().getPrevious().isEmpty()) {
for (SingularityTaskHistoryUpdate previousUpdate : maybeCleaningUpdate.get().getPrevious()) {
if (previousUpdate.getStatusMessage().or("").contains("BOUNCE")) {
LOG.debug("Found task {} still waiting for bounce to complete", activeTaskId);
foundBouncingTask = true;
break;
}
}
}
}
}
if (!foundBouncingTask) {
// No cleaning task remains for the bounce: clear any expiring-bounce record
// for this deploy and mark the bounce complete.
LOG.info("Bounce completed for request {}, no cleaning tasks due to bounce found", taskId.getRequestId());
Optional<SingularityExpiringBounce> expiringBounce = requestManager.getExpiringBounce(taskId.getRequestId());
if (expiringBounce.isPresent() && expiringBounce.get().getDeployId().equals(taskId.getDeployId())) {
requestManager.deleteExpiringObject(SingularityExpiringBounce.class, taskId.getRequestId());
}
requestManager.markBounceComplete(taskId.getRequestId());
}
}
final Optional<PendingType> scheduleResult = handleCompletedTaskWithStatistics(task, taskId, timestamp, state, deployStatistics, taskHistoryUpdateCreateResult, status);
// A duplicate history update means this completion was already handled; do not
// double-count it in the deploy statistics.
if (taskHistoryUpdateCreateResult == SingularityCreateResult.EXISTED) {
return;
}
updateDeployStatistics(deployStatistics, taskId, task, timestamp, state, scheduleResult);
}
End of aggregated usage examples.