Use of com.hubspot.singularity.SingularityPendingDeploy in project Singularity by HubSpot.
From the class SingularityDeployChecker, method updatePendingDeploy.
/**
 * Saves a replacement pending-deploy record for the given one.
 *
 * <p>The deploy marker and the updated request are carried over from {@code pendingDeploy};
 * the load balancer update, deploy state and deploy progress are taken from the arguments.
 *
 * @param pendingDeploy  the pending deploy whose marker and updated request are reused
 * @param lbUpdate       load balancer update to store on the new record
 * @param deployState    deploy state to store on the new record
 * @param deployProgress deploy progress to store on the new record
 */
private void updatePendingDeploy(SingularityPendingDeploy pendingDeploy, Optional<SingularityLoadBalancerUpdate> lbUpdate, DeployState deployState, Optional<SingularityDeployProgress> deployProgress) {
  deployManager.savePendingDeploy(
    new SingularityPendingDeploy(
      pendingDeploy.getDeployMarker(),
      lbUpdate,
      deployState,
      deployProgress,
      pendingDeploy.getUpdatedRequest()
    )
  );
}
Use of com.hubspot.singularity.SingularityPendingDeploy in project Singularity by HubSpot.
From the class SingularityDeployAcceptanceTest, method testLbRevertsAfterFailedAcceptanceStepOnNonCanary.
/**
 * Checks that a load-balanced deploy with canary deploys disabled is failed when its acceptance
 * hook reports FAILED, and that the pending load balancer update recorded in the deploy progress
 * re-adds the task of the first deploy and removes the task of the second deploy.
 */
@Test
public void testLbRevertsAfterFailedAcceptanceStepOnNonCanary() {
// The test relies on the first registered acceptance hook being a NoopDeployAcceptanceHook (see the cast).
NoopDeployAcceptanceHook hook = (NoopDeployAcceptanceHook) acceptanceHooks.iterator().next();
// Make the hook report a failure carrying the message asserted on at the end of the test.
hook.setNextResult(new DeployAcceptanceResult(DeployAcceptanceState.FAILED, "ruh-roh"));
initLoadBalancedRequest();
initFirstDeploy();
// Task of the first deploy, started in TASK_RUNNING; it is the one expected to be re-added to the LB later.
SingularityTask firstTask = launchTask(request, firstDeploy, 1, TaskState.TASK_RUNNING);
// Second deploy: acceptance mode CHECKS, canary deploy disabled, with an LB base path and group.
SingularityDeployBuilder builder = new SingularityDeployBuilder(requestId, secondDeployId);
builder.setCommand(Optional.of("sleep 1")).setCanaryDeploySettings(CanaryDeploySettings.newbuilder().setAcceptanceMode(DeployAcceptanceMode.CHECKS).setEnableCanaryDeploy(false).build()).setServiceBasePath(Optional.of("/basepath")).setLoadBalancerGroups(Optional.of(Collections.singleton("group")));
deployResource.deploy(new SingularityDeployRequest(builder.build(), Optional.of(false), Optional.empty()), singularityUser);
deployChecker.checkDeploys();
scheduler.drainPendingQueue();
// Exactly one task is expected to be pending after the deploy has been picked up.
Assertions.assertEquals(1, taskManager.getPendingTaskIds().size());
resourceOffers();
// After resource offers the second deploy should have exactly one active task.
Assertions.assertEquals(1, taskManager.getActiveTaskIdsForDeploy(requestId, secondDeployId).size());
SingularityTaskId firstNewTaskId = taskManager.getActiveTaskIdsForDeploy(requestId, secondDeployId).get(0);
statusUpdate(taskManager.getTask(firstNewTaskId).get(), TaskState.TASK_RUNNING);
deployChecker.checkDeploys();
SingularityPendingDeploy pendingDeploy = deployManager.getPendingDeploy(requestId).get();
Assertions.assertEquals(DeployState.WAITING, pendingDeploy.getCurrentDeployState());
// While the testing LB client reports WAITING, the pending deploy is expected to stay in WAITING.
testingLbClient.setNextRequestState(LoadBalancerRequestState.WAITING);
deployChecker.checkDeploys();
pendingDeploy = deployManager.getPendingDeploy(requestId).get();
Assertions.assertEquals(DeployState.WAITING, pendingDeploy.getCurrentDeployState());
// Let the LB request succeed on the next check.
testingLbClient.setNextRequestState(LoadBalancerRequestState.SUCCESS);
deployChecker.checkDeploys();
// Acceptance checks fail
testingLbClient.setNextRequestState(LoadBalancerRequestState.WAITING);
deployChecker.checkDeploys();
pendingDeploy = deployManager.getPendingDeploy(requestId).get();
// The deploy is still pending (WAITING), but the first recorded step acceptance result is FAILED.
Assertions.assertEquals(DeployState.WAITING, pendingDeploy.getCurrentDeployState());
Assertions.assertEquals(DeployAcceptanceState.FAILED, pendingDeploy.getDeployProgress().getStepAcceptanceResults().entrySet().iterator().next().getValue());
SingularityDeployProgress deployProgress = pendingDeploy.getDeployProgress();
// Look up the LB update holder stored under the id of the currently pending LB update.
DeployProgressLbUpdateHolder lbUpdateHolder = deployProgress.getLbUpdates().get(deployProgress.getPendingLbUpdate().get().getLoadBalancerRequestId().toString());
// The revert adds the first deploy's task and removes the second deploy's task.
Assertions.assertTrue(lbUpdateHolder.getAdded().contains(firstTask.getTaskId()));
Assertions.assertTrue(lbUpdateHolder.getRemoved().contains(firstNewTaskId));
// Once the LB reports SUCCESS for the next check, a FAILED deploy result carrying the hook's message is expected.
testingLbClient.setNextRequestState(LoadBalancerRequestState.SUCCESS);
deployChecker.checkDeploys();
SingularityDeployResult deployResult = deployManager.getDeployResult(requestId, secondDeployId).get();
Assertions.assertEquals(DeployState.FAILED, deployResult.getDeployState());
Assertions.assertTrue(deployResult.getMessage().get().contains("ruh-roh"));
}
Use of com.hubspot.singularity.SingularityPendingDeploy in project Singularity by HubSpot.
From the class SingularitySchedulerTestBase, method startDeploy.
/**
 * Saves a pending deploy in state {@code WAITING} for the given marker.
 *
 * <p>The deploy progress is created through {@code SingularityDeployProgress.forNewDeploy(1, timestamp, false)}
 * and the last constructor argument of the pending deploy is left empty.
 *
 * @param deployMarker marker identifying the deploy to register as pending
 * @param timestamp    timestamp handed to the new deploy progress
 */
protected void startDeploy(SingularityDeployMarker deployMarker, long timestamp) {
  SingularityPendingDeploy pendingDeploy = new SingularityPendingDeploy(
    deployMarker,
    DeployState.WAITING,
    SingularityDeployProgress.forNewDeploy(1, timestamp, false),
    Optional.empty()
  );
  deployManager.savePendingDeploy(pendingDeploy);
}
Use of com.hubspot.singularity.SingularityPendingDeploy in project Singularity by HubSpot.
From the class RequestResource, method submitRequest.
/**
 * Validates a request create/update and hands it to {@code requestHelper.updateRequest}.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>Reject requests without an id, requests that have a cleanup in progress, and requests whose
 *       pending deploy carries an updated request.</li>
 *   <li>Run authorization checks and the create/update action-enabled check.</li>
 *   <li>For the spread-to-all-agents placements, overwrite the instance count with the number of
 *       ACTIVE agents.</li>
 *   <li>Validate scale when the request is new or its instance count changed.</li>
 *   <li>On scale down, create SCALING_DOWN cleanups for active tasks of the active deploy whose
 *       instance number exceeds the new count, then optionally rebalance racks / attributes.</li>
 *   <li>When skipHealthchecks is switched on by this update, save a healthcheck result for each
 *       active task of the request.</li>
 * </ol>
 *
 * @param request             the request being created or updated
 * @param oldRequestWithState the existing request and its state, empty when creating
 * @param historyType         passed through to {@code requestHelper.updateRequest}
 * @param skipHealthchecks    passed through to {@code requestHelper.updateRequest}
 * @param message             message attached to created task cleanups and passed through to the update
 * @param maybeBounceRequest  passed through to {@code requestHelper.updateRequest}
 * @param user                the user performing the change
 */
private void submitRequest(SingularityRequest request, Optional<SingularityRequestWithState> oldRequestWithState, Optional<RequestHistoryType> historyType, Optional<Boolean> skipHealthchecks, Optional<String> message, Optional<SingularityBounceRequest> maybeBounceRequest, SingularityUser user) {
  checkNotNullBadRequest(request.getId(), "Request must have an id");
  checkConflict(!requestManager.cleanupRequestExists(request.getId()), "Request %s is currently cleaning. Try again after a few moments", request.getId());

  Optional<SingularityPendingDeploy> pendingDeploy = deployManager.getPendingDeploy(request.getId());
  boolean pendingDeployChangesRequest = pendingDeploy.isPresent() && pendingDeploy.get().getUpdatedRequest().isPresent();
  checkConflict(!pendingDeployChangesRequest, "Request %s has a pending deploy that may change the request data. Try again when the deploy has finished", request.getId());

  Optional<SingularityRequest> oldRequest;
  if (oldRequestWithState.isPresent()) {
    oldRequest = Optional.of(oldRequestWithState.get().getRequest());
  } else {
    oldRequest = Optional.empty();
  }

  authorizationHelper.checkForAuthorizedChanges(request, oldRequest, user);
  if (oldRequest.isPresent()) {
    // Updating: the caller also needs WRITE access to the request as it currently exists.
    authorizationHelper.checkForAuthorization(oldRequest.get(), user, SingularityAuthorizationScope.WRITE);
    validator.checkActionEnabled(SingularityAction.UPDATE_REQUEST);
  } else {
    validator.checkActionEnabled(SingularityAction.CREATE_REQUEST);
  }

  boolean spreadsToAllAgents =
    request.getAgentPlacement().isPresent() &&
    (request.getAgentPlacement().get() == AgentPlacement.SPREAD_ALL_SLAVES || request.getAgentPlacement().get() == AgentPlacement.SPREAD_ALL_AGENTS);
  if (spreadsToAllAgents) {
    checkBadRequest(validator.isSpreadAllAgentsEnabled(), "You must enabled spread to all agents in order to use the SPREAD_ALL_AGENTS request type");
    // The instance count is replaced by the number of agents currently in the ACTIVE state.
    int activeAgents = agentManager.getNumObjectsAtState(MachineState.ACTIVE);
    request = request.toBuilder().setInstances(Optional.of(activeAgents)).build();
  }

  if (!oldRequest.isPresent() || oldRequest.get().getInstancesSafe() != request.getInstancesSafe()) {
    validator.checkScale(request, Optional.empty());
  }

  authorizationHelper.checkForAuthorization(request, user, SingularityAuthorizationScope.WRITE);

  // New requests start ACTIVE; updates keep whatever state the request already had.
  RequestState requestState = oldRequestWithState.isPresent() ? oldRequestWithState.get().getState() : RequestState.ACTIVE;

  if (oldRequest.isPresent() && request.getInstancesSafe() < oldRequest.get().getInstancesSafe()) {
    // Trigger cleanups for scale down
    int newInstances = request.getInstancesSafe();
    Optional<SingularityRequestDeployState> deployState = deployManager.getRequestDeployState(request.getId());
    if (deployState.isPresent() && deployState.get().getActiveDeploy().isPresent()) {
      List<SingularityTaskId> remainingActiveTasks = new ArrayList<>();
      for (SingularityTaskId taskId : taskManager.getActiveTaskIdsForDeploy(request.getId(), deployState.get().getActiveDeploy().get().getDeployId())) {
        if (taskId.getInstanceNo() <= newInstances) {
          remainingActiveTasks.add(taskId);
          continue;
        }
        // Instance number is above the new instance count: schedule it for cleanup.
        taskManager.createTaskCleanup(new SingularityTaskCleanup(Optional.of(user.getId()), TaskCleanupType.SCALING_DOWN, System.currentTimeMillis(), taskId, message, Optional.of(UUID.randomUUID().toString()), Optional.empty()));
      }

      int racksWithCapacity = agentAndRackManager.getActiveRacksWithCapacityCount();
      if (oldRequest.get().getInstancesSafe() > racksWithCapacity && request.isRackSensitive() && configuration.isRebalanceRacksOnScaleDown()) {
        rebalancingHelper.rebalanceRacks(request, remainingActiveTasks, user.getEmail());
      }

      if (request.getAgentAttributeMinimums().isPresent()) {
        Set<SingularityTaskId> cleanedByRebalance = rebalancingHelper.rebalanceAttributeDistribution(request, user.getEmail(), remainingActiveTasks);
        remainingActiveTasks.removeAll(cleanedByRebalance);
      }
    }
  }

  boolean skipHealthchecksTurnedOn =
    oldRequest.isPresent() &&
    !oldRequest.get().getSkipHealthchecks().orElse(false) &&
    request.getSkipHealthchecks().orElse(false);
  if (skipHealthchecksTurnedOn) {
    LOG.info("Marking pending tasks as healthy for skipHealthchecks on {}", request.getId());
    // Will only be saved if async healthchecks have not already finished
    taskManager.getActiveTaskIdsForRequest(request.getId()).forEach(t ->
      taskManager.saveHealthcheckResult(new SingularityTaskHealthcheckResult(Optional.of(200), Optional.empty(), System.currentTimeMillis(), Optional.of(String.format("Healthchecks skipped by %s", user.getId())), Optional.empty(), t, Optional.empty()))
    );
  }

  requestHelper.updateRequest(request, oldRequest, requestState, historyType, user.getEmail(), skipHealthchecks, message, maybeBounceRequest);
}
Use of com.hubspot.singularity.SingularityPendingDeploy in project Singularity by HubSpot.
From the class SingularityCrashLoops, method getActiveCrashLoops.
/**
 * Evaluates the recorded task failures of one deploy and returns every crash loop detected for it.
 *
 * <p>An empty list is returned early when the deploy has no task failure events, when the deploy is
 * the request's current pending deploy, or when the request cannot be found. Otherwise the checks
 * run in this order: fast failures (cooldown), startup failures, OOMs, single-instance failures,
 * multi-instance failures and, for long-running requests only, slow failures and unexpected exits.
 *
 * @param deployStatistics statistics, including task failure events, for one request/deploy pair
 * @return the detected crash loops; possibly empty, never {@code null}
 */
List<CrashLoopInfo> getActiveCrashLoops(SingularityDeployStatistics deployStatistics) {
  List<CrashLoopInfo> active = new ArrayList<>();
  if (deployStatistics.getTaskFailureEvents().isEmpty()) {
    return active;
  }
  Optional<SingularityPendingDeploy> maybePending = deployManager.getPendingDeploy(deployStatistics.getRequestId());
  if (maybePending.isPresent() && maybePending.get().getDeployMarker().getDeployId().equals(deployStatistics.getDeployId())) {
    LOG.debug("Not checking cooldown for pending deploy {} - {}", deployStatistics.getRequestId(), deployStatistics.getDeployId());
    return active;
  }
  Optional<SingularityRequestWithState> maybeRequest = requestManager.getRequest(deployStatistics.getRequestId());
  if (!maybeRequest.isPresent()) {
    return active;
  }
  long now = System.currentTimeMillis();

  // Check fast failures
  Optional<Long> maybeCooldownStart = cooldownStart(deployStatistics, Optional.empty());
  if (maybeCooldownStart.isPresent()) {
    active.add(new CrashLoopInfo(deployStatistics.getRequestId(), deployStatistics.getDeployId(), maybeCooldownStart.get(), Optional.empty(), CrashLoopType.FAST_FAILURE_LOOP));
  }

  /*
   * Startup failure loop
   * a) small count of failures but instance num matches one that is in cleaning state waiting for a replacement
   */
  // Latest cleanup timestamp per instance number, restricted to cleanups of this request/deploy.
  Map<Integer, Long> taskCleanStartTimes = taskManager
    .getCleanupTasks()
    .stream()
    .filter(t -> t.getTaskId().getRequestId().equals(deployStatistics.getRequestId()) && t.getTaskId().getDeployId().equals(deployStatistics.getDeployId()))
    .collect(Collectors.toMap(t -> t.getTaskId().getInstanceNo(), SingularityTaskCleanup::getTimestamp, Math::max));
  // Startup failure timestamps per instance; only instances that have a cleanup entry above are kept.
  Map<Integer, List<Long>> recentStartupFailures = deployStatistics
    .getTaskFailureEvents()
    .stream()
    .filter(e -> e.getType() == TaskFailureType.STARTUP_FAILURE && taskCleanStartTimes.containsKey(e.getInstance()))
    .collect(Collectors.groupingBy(TaskFailureEvent::getInstance, Collectors.mapping(TaskFailureEvent::getTimestamp, Collectors.toList())));
  boolean hasStartupFailure = false;
  for (Map.Entry<Integer, List<Long>> entry : recentStartupFailures.entrySet()) {
    if (taskCleanStartTimes.containsKey(entry.getKey())) {
      // More than two startup failures recorded after the instance's cleanup started.
      if (entry.getValue().stream().filter(t -> t > taskCleanStartTimes.get(entry.getKey())).count() > 2) {
        active.add(new CrashLoopInfo(deployStatistics.getRequestId(), deployStatistics.getDeployId(), entry.getValue().stream().min(Comparator.comparingLong(Long::longValue)).get(), Optional.empty(), CrashLoopType.STARTUP_FAILURE_LOOP));
        hasStartupFailure = true;
        break;
      }
    }
  }

  /*
   * Startup failure loop
   * b) multiple instances failing healthchecks too many times in X minutes
   */
  // NOTE(review): this branch only runs when case (a) above has already added a STARTUP_FAILURE_LOOP,
  // so it can add a second entry of the same type but never reports case (b) on its own.
  // Confirm whether the condition was meant to be !hasStartupFailure; behavior is left unchanged here.
  if (hasStartupFailure) {
    long startupFailThreshold = now - TimeUnit.MINUTES.toMillis(configuration.getEvaluateStartupLoopOverMinutes());
    List<Long> recentStartupFailTimestamps = recentStartupFailures.values().stream().flatMap(List::stream).filter(t -> t > startupFailThreshold).collect(Collectors.toList());
    if (recentStartupFailTimestamps.size() > configuration.getStartupFailureThreshold()) {
      active.add(new CrashLoopInfo(deployStatistics.getRequestId(), deployStatistics.getDeployId(), recentStartupFailTimestamps.stream().min(Comparator.comparingLong(Long::longValue)).get(), Optional.empty(), CrashLoopType.STARTUP_FAILURE_LOOP));
    }
  }

  /*
   * OOM Danger. At least X OOMs in Y minutes across all instances (the comparison is >=)
   */
  long thresholdOomTime = now - TimeUnit.MINUTES.toMillis(configuration.getEvaluateOomsOverMinutes());
  List<Long> oomFailures = deployStatistics.getTaskFailureEvents().stream().filter(e -> e.getType() == TaskFailureType.OOM && e.getTimestamp() > thresholdOomTime).map(TaskFailureEvent::getTimestamp).collect(Collectors.toList());
  // The non-empty guard matters because of ">=": with a threshold of zero or less, an empty list would
  // pass the size check and min(...).get() would throw NoSuchElementException.
  if (!oomFailures.isEmpty() && oomFailures.size() >= configuration.getOomFailureThreshold()) {
    active.add(new CrashLoopInfo(deployStatistics.getRequestId(), deployStatistics.getDeployId(), oomFailures.stream().min(Comparator.comparingLong(Long::longValue)).get(), Optional.empty(), CrashLoopType.OOM));
  }

  /*
   * Single instance failure. > X failures with same instance no in X minutes, bucketed to avoid counting fast failure as one of these
   * Multi instance failure. > X% of instances failing within Y minutes
   */
  // Timestamps of OOM, BAD_EXIT_CODE and OUT_OF_DISK_SPACE failures, grouped by instance number.
  Map<Integer, List<Long>> recentFailuresByInstance = deployStatistics
    .getTaskFailureEvents()
    .stream()
    .filter(e -> e.getType() == TaskFailureType.OOM || e.getType() == TaskFailureType.BAD_EXIT_CODE || e.getType() == TaskFailureType.OUT_OF_DISK_SPACE)
    .collect(Collectors.groupingBy(TaskFailureEvent::getInstance, Collectors.mapping(TaskFailureEvent::getTimestamp, Collectors.toList())));
  for (Map.Entry<Integer, List<Long>> entry : recentFailuresByInstance.entrySet()) {
    Optional<Long> maybeCrashStart = getStartForFailuresInBuckets(now, entry.getValue(), TimeUnit.MINUTES.toMillis(configuration.getSingleInstanceFailureBucketSizeMinutes()), configuration.getSingleInstanceFailureBuckets(), configuration.getSingleInstanceFailureThreshold(), configuration.getSingleInstanceMinBucketIndexPercent());
    if (maybeCrashStart.isPresent()) {
      // Only the first matching instance is reported.
      active.add(new CrashLoopInfo(deployStatistics.getRequestId(), deployStatistics.getDeployId(), maybeCrashStart.get(), Optional.empty(), CrashLoopType.SINGLE_INSTANCE_FAILURE_LOOP));
      break;
    }
  }
  Optional<Long> maybeMultiCrashStart = getStartForFailuresInBuckets(now, recentFailuresByInstance.values().stream().flatMap(List::stream).collect(Collectors.toList()), TimeUnit.MINUTES.toMillis(configuration.getMultiInstanceFailureBucketSizeMinutes()), configuration.getMultiInstanceFailureBuckets(), configuration.getMultiInstanceFailureThreshold(), configuration.getMultiInstanceMinBucketIndexPercent());
  // A multi-instance failure needs failures on more than one distinct instance.
  if (recentFailuresByInstance.size() > 1 && maybeMultiCrashStart.isPresent()) {
    active.add(new CrashLoopInfo(deployStatistics.getRequestId(), deployStatistics.getDeployId(), maybeMultiCrashStart.get(), Optional.empty(), CrashLoopType.MULTI_INSTANCE_FAILURE));
  }

  if (maybeRequest.get().getRequest().isLongRunning()) {
    /*
     * Slow failures. Occasional failures, count on order of hours, looking for consistency in non-zero count each hour
     */
    getStartForFailuresInBuckets(now, recentFailuresByInstance, TimeUnit.MINUTES.toMillis(configuration.getSlowFailureBucketSizeMinutes()), configuration.getSlowFailureBuckets(), configuration.getSlowFailureThreshold(), configuration.getSlowFailureMinBucketIndexPercent())
      .ifPresent(start -> active.add(new CrashLoopInfo(deployStatistics.getRequestId(), deployStatistics.getDeployId(), start, Optional.empty(), CrashLoopType.SLOW_FAILURES)));
    getUnexpectedExitLoop(now, deployStatistics).ifPresent(active::add);
  }
  return active;
}
Aggregations