Search in sources :

Example 1 with SingularityPendingDeploy

use of com.hubspot.singularity.SingularityPendingDeploy in project Singularity by HubSpot.

the class SingularityDeployChecker method updatePendingDeploy.

/**
 * Builds a replacement {@link SingularityPendingDeploy} and hands it to
 * {@code deployManager.savePendingDeploy}.
 *
 * <p>The deploy marker and the updated request are carried over from {@code pendingDeploy};
 * the load balancer update, deploy state and deploy progress are taken from the arguments.
 *
 * @param pendingDeploy  existing pending deploy whose marker and updated request are reused
 * @param lbUpdate       load balancer update to store on the new record
 * @param deployState    deploy state to store on the new record
 * @param deployProgress deploy progress to store on the new record
 */
private void updatePendingDeploy(SingularityPendingDeploy pendingDeploy, Optional<SingularityLoadBalancerUpdate> lbUpdate, DeployState deployState, Optional<SingularityDeployProgress> deployProgress) {
    deployManager.savePendingDeploy(
        new SingularityPendingDeploy(
            pendingDeploy.getDeployMarker(),
            lbUpdate,
            deployState,
            deployProgress,
            pendingDeploy.getUpdatedRequest()
        )
    );
}
Also used : SingularityPendingDeploy(com.hubspot.singularity.SingularityPendingDeploy)

Example 2 with SingularityPendingDeploy

use of com.hubspot.singularity.SingularityPendingDeploy in project Singularity by HubSpot.

the class SingularityDeployAcceptanceTest method testLbRevertsAfterFailedAcceptanceStepOnNonCanary.

/**
 * Runs a second, load-balanced deploy with canary deploys disabled and acceptance mode CHECKS
 * while the acceptance hook is configured to return FAILED, then asserts that:
 * <ul>
 *   <li>the recorded step acceptance result is FAILED,</li>
 *   <li>the pending load balancer update adds the first deploy's task back and removes the
 *       new deploy's task,</li>
 *   <li>the final deploy result is FAILED and its message contains the hook's message.</li>
 * </ul>
 */
@Test
public void testLbRevertsAfterFailedAcceptanceStepOnNonCanary() {
    // Configure the first registered acceptance hook so its next result is FAILED ("ruh-roh").
    NoopDeployAcceptanceHook hook = (NoopDeployAcceptanceHook) acceptanceHooks.iterator().next();
    hook.setNextResult(new DeployAcceptanceResult(DeployAcceptanceState.FAILED, "ruh-roh"));
    // Baseline: request + first deploy with one task launched in TASK_RUNNING.
    initLoadBalancedRequest();
    initFirstDeploy();
    SingularityTask firstTask = launchTask(request, firstDeploy, 1, TaskState.TASK_RUNNING);
    // Second deploy: canary disabled, acceptance mode CHECKS, with a service base path and LB group.
    SingularityDeployBuilder builder = new SingularityDeployBuilder(requestId, secondDeployId);
    builder.setCommand(Optional.of("sleep 1")).setCanaryDeploySettings(CanaryDeploySettings.newbuilder().setAcceptanceMode(DeployAcceptanceMode.CHECKS).setEnableCanaryDeploy(false).build()).setServiceBasePath(Optional.of("/basepath")).setLoadBalancerGroups(Optional.of(Collections.singleton("group")));
    deployResource.deploy(new SingularityDeployRequest(builder.build(), Optional.of(false), Optional.empty()), singularityUser);
    deployChecker.checkDeploys();
    scheduler.drainPendingQueue();
    // Exactly one task is pending for the new deploy; after offers it becomes the single active task.
    Assertions.assertEquals(1, taskManager.getPendingTaskIds().size());
    resourceOffers();
    Assertions.assertEquals(1, taskManager.getActiveTaskIdsForDeploy(requestId, secondDeployId).size());
    SingularityTaskId firstNewTaskId = taskManager.getActiveTaskIdsForDeploy(requestId, secondDeployId).get(0);
    statusUpdate(taskManager.getTask(firstNewTaskId).get(), TaskState.TASK_RUNNING);
    // With the new task running, the deploy is still expected to be WAITING after a check.
    deployChecker.checkDeploys();
    SingularityPendingDeploy pendingDeploy = deployManager.getPendingDeploy(requestId).get();
    Assertions.assertEquals(DeployState.WAITING, pendingDeploy.getCurrentDeployState());
    // While the testing LB client reports WAITING, the deploy stays WAITING.
    testingLbClient.setNextRequestState(LoadBalancerRequestState.WAITING);
    deployChecker.checkDeploys();
    pendingDeploy = deployManager.getPendingDeploy(requestId).get();
    Assertions.assertEquals(DeployState.WAITING, pendingDeploy.getCurrentDeployState());
    // Let the LB request report SUCCESS for the next check.
    testingLbClient.setNextRequestState(LoadBalancerRequestState.SUCCESS);
    deployChecker.checkDeploys();
    // Acceptance checks fail
    testingLbClient.setNextRequestState(LoadBalancerRequestState.WAITING);
    deployChecker.checkDeploys();
    pendingDeploy = deployManager.getPendingDeploy(requestId).get();
    // Deploy is still WAITING, but the first recorded step acceptance result is FAILED.
    Assertions.assertEquals(DeployState.WAITING, pendingDeploy.getCurrentDeployState());
    Assertions.assertEquals(DeployAcceptanceState.FAILED, pendingDeploy.getDeployProgress().getStepAcceptanceResults().entrySet().iterator().next().getValue());
    // Look up the LB update holder keyed by the pending LB update's request id.
    SingularityDeployProgress deployProgress = pendingDeploy.getDeployProgress();
    DeployProgressLbUpdateHolder lbUpdateHolder = deployProgress.getLbUpdates().get(deployProgress.getPendingLbUpdate().get().getLoadBalancerRequestId().toString());
    // The revert adds the old deploy's task and removes the new deploy's task.
    Assertions.assertTrue(lbUpdateHolder.getAdded().contains(firstTask.getTaskId()));
    Assertions.assertTrue(lbUpdateHolder.getRemoved().contains(firstNewTaskId));
    // Once the LB request reports SUCCESS, the deploy result is FAILED with the hook's message.
    testingLbClient.setNextRequestState(LoadBalancerRequestState.SUCCESS);
    deployChecker.checkDeploys();
    SingularityDeployResult deployResult = deployManager.getDeployResult(requestId, secondDeployId).get();
    Assertions.assertEquals(DeployState.FAILED, deployResult.getDeployState());
    Assertions.assertTrue(deployResult.getMessage().get().contains("ruh-roh"));
}
Also used : SingularityDeployRequest(com.hubspot.singularity.api.SingularityDeployRequest) SingularityTask(com.hubspot.singularity.SingularityTask) DeployProgressLbUpdateHolder(com.hubspot.singularity.DeployProgressLbUpdateHolder) SingularityDeployResult(com.hubspot.singularity.SingularityDeployResult) SingularityPendingDeploy(com.hubspot.singularity.SingularityPendingDeploy) DeployAcceptanceResult(com.hubspot.singularity.DeployAcceptanceResult) SingularityDeployBuilder(com.hubspot.singularity.SingularityDeployBuilder) SingularityDeployProgress(com.hubspot.singularity.SingularityDeployProgress) SingularityTaskId(com.hubspot.singularity.SingularityTaskId) Test(org.junit.jupiter.api.Test)

Example 3 with SingularityPendingDeploy

use of com.hubspot.singularity.SingularityPendingDeploy in project Singularity by HubSpot.

the class SingularitySchedulerTestBase method startDeploy.

/**
 * Saves a pending deploy in {@link DeployState#WAITING} for the given marker.
 *
 * <p>The deploy progress comes from
 * {@code SingularityDeployProgress.forNewDeploy(1, timestamp, false)}, and the last
 * constructor argument is passed as {@code Optional.empty()}.
 *
 * @param deployMarker marker identifying the deploy to register as pending
 * @param timestamp    timestamp forwarded to {@code forNewDeploy}
 */
protected void startDeploy(SingularityDeployMarker deployMarker, long timestamp) {
    SingularityDeployProgress initialProgress = SingularityDeployProgress.forNewDeploy(1, timestamp, false);
    SingularityPendingDeploy waitingDeploy = new SingularityPendingDeploy(
        deployMarker,
        DeployState.WAITING,
        initialProgress,
        Optional.empty()
    );
    deployManager.savePendingDeploy(waitingDeploy);
}
Also used : SingularityPendingDeploy(com.hubspot.singularity.SingularityPendingDeploy) SingularityDeployProgress(com.hubspot.singularity.SingularityDeployProgress)

Example 4 with SingularityPendingDeploy

use of com.hubspot.singularity.SingularityPendingDeploy in project Singularity by HubSpot.

the class RequestResource method submitRequest.

/**
 * Validates a request create/update and, if every check passes, forwards it to
 * {@code requestHelper.updateRequest}.
 *
 * <p>Steps, in order: reject requests without an id, requests that are currently being
 * cleaned, and requests with a pending deploy that carries an updated request; run
 * authorization and action-enabled checks; override the instance count for spread-to-all
 * agent placements; validate scale when the instance count is new or changed; on scale down,
 * create cleanups for surplus tasks and optionally rebalance; when skipHealthchecks is newly
 * turned on, save a healthcheck result for every active task of the request.
 *
 * @param request             the new request data (may be rebuilt with a different instance count)
 * @param oldRequestWithState the existing request and its state, empty when creating
 * @param historyType         forwarded unchanged to {@code requestHelper.updateRequest}
 * @param skipHealthchecks    forwarded unchanged to {@code requestHelper.updateRequest}
 * @param message             attached to scale-down cleanups and forwarded to the update
 * @param maybeBounceRequest  forwarded unchanged to {@code requestHelper.updateRequest}
 * @param user                the acting user, used for authorization and attribution
 */
private void submitRequest(SingularityRequest request, Optional<SingularityRequestWithState> oldRequestWithState, Optional<RequestHistoryType> historyType, Optional<Boolean> skipHealthchecks, Optional<String> message, Optional<SingularityBounceRequest> maybeBounceRequest, SingularityUser user) {
    checkNotNullBadRequest(request.getId(), "Request must have an id");
    checkConflict(!requestManager.cleanupRequestExists(request.getId()), "Request %s is currently cleaning. Try again after a few moments", request.getId());
    // A pending deploy that carries its own updated request would conflict with this change.
    Optional<SingularityPendingDeploy> maybePendingDeploy = deployManager.getPendingDeploy(request.getId());
    checkConflict(!(maybePendingDeploy.isPresent() && maybePendingDeploy.get().getUpdatedRequest().isPresent()), "Request %s has a pending deploy that may change the request data. Try again when the deploy has finished", request.getId());
    Optional<SingularityRequest> oldRequest = oldRequestWithState.isPresent() ? Optional.of(oldRequestWithState.get().getRequest()) : Optional.<SingularityRequest>empty();
    authorizationHelper.checkForAuthorizedChanges(request, oldRequest, user);
    // Updates require WRITE on the existing request and UPDATE_REQUEST enabled; creates require CREATE_REQUEST enabled.
    if (oldRequest.isPresent()) {
        authorizationHelper.checkForAuthorization(oldRequest.get(), user, SingularityAuthorizationScope.WRITE);
        validator.checkActionEnabled(SingularityAction.UPDATE_REQUEST);
    } else {
        validator.checkActionEnabled(SingularityAction.CREATE_REQUEST);
    }
    // For spread-to-all placements the instance count is replaced by the number of agents in ACTIVE state.
    if (request.getAgentPlacement().isPresent() && (request.getAgentPlacement().get() == AgentPlacement.SPREAD_ALL_SLAVES || request.getAgentPlacement().get() == AgentPlacement.SPREAD_ALL_AGENTS)) {
        checkBadRequest(validator.isSpreadAllAgentsEnabled(), "You must enabled spread to all agents in order to use the SPREAD_ALL_AGENTS request type");
        int currentActiveAgentCount = agentManager.getNumObjectsAtState(MachineState.ACTIVE);
        request = request.toBuilder().setInstances(Optional.of(currentActiveAgentCount)).build();
    }
    // Scale is only validated for new requests or when the instance count changed.
    if (!oldRequest.isPresent() || !(oldRequest.get().getInstancesSafe() == request.getInstancesSafe())) {
        validator.checkScale(request, Optional.empty());
    }
    // WRITE authorization is also checked against the new request data.
    authorizationHelper.checkForAuthorization(request, user, SingularityAuthorizationScope.WRITE);
    // Keep the existing request state on update; default to ACTIVE on create.
    RequestState requestState = RequestState.ACTIVE;
    if (oldRequestWithState.isPresent()) {
        requestState = oldRequestWithState.get().getState();
    }
    if (oldRequest.isPresent() && request.getInstancesSafe() < oldRequest.get().getInstancesSafe()) {
        // Trigger cleanups for scale down
        int newInstances = request.getInstancesSafe();
        Optional<SingularityRequestDeployState> maybeDeployState = deployManager.getRequestDeployState(request.getId());
        if (maybeDeployState.isPresent() && maybeDeployState.get().getActiveDeploy().isPresent()) {
            List<SingularityTaskId> remainingActiveTasks = new ArrayList<>();
            // Tasks whose instance number exceeds the new count get a SCALING_DOWN cleanup; the rest are kept.
            taskManager.getActiveTaskIdsForDeploy(request.getId(), maybeDeployState.get().getActiveDeploy().get().getDeployId()).forEach(taskId -> {
                if (taskId.getInstanceNo() > newInstances) {
                    taskManager.createTaskCleanup(new SingularityTaskCleanup(Optional.of(user.getId()), TaskCleanupType.SCALING_DOWN, System.currentTimeMillis(), taskId, message, Optional.of(UUID.randomUUID().toString()), Optional.empty()));
                } else {
                    remainingActiveTasks.add(taskId);
                }
            });
            // Rack rebalancing is only considered when the old instance count exceeded the
            // number of active racks with capacity, the request is rack sensitive, and the
            // configuration enables rebalancing on scale down.
            int activeRacksWithCapacityCount = agentAndRackManager.getActiveRacksWithCapacityCount();
            if (oldRequest.get().getInstancesSafe() > activeRacksWithCapacityCount) {
                if (request.isRackSensitive() && configuration.isRebalanceRacksOnScaleDown()) {
                    rebalancingHelper.rebalanceRacks(request, remainingActiveTasks, user.getEmail());
                }
            }
            if (request.getAgentAttributeMinimums().isPresent()) {
                Set<SingularityTaskId> cleanedTasks = rebalancingHelper.rebalanceAttributeDistribution(request, user.getEmail(), remainingActiveTasks);
                // NOTE(review): remainingActiveTasks is not read again in this method after this call — confirm whether this removal is still needed.
                remainingActiveTasks.removeAll(cleanedTasks);
            }
        }
    }
    // skipHealthchecks went from off (or unset) to on: record a healthcheck result for every active task.
    if (oldRequest.isPresent() && !oldRequest.get().getSkipHealthchecks().orElse(false) && request.getSkipHealthchecks().orElse(false)) {
        LOG.info("Marking pending tasks as healthy for skipHealthchecks on {}", request.getId());
        taskManager.getActiveTaskIdsForRequest(request.getId()).forEach(t -> {
            // Will only be saved if async healthchecks have not already finished
            taskManager.saveHealthcheckResult(new SingularityTaskHealthcheckResult(Optional.of(200), Optional.empty(), System.currentTimeMillis(), Optional.of(String.format("Healthchecks skipped by %s", user.getId())), Optional.empty(), t, Optional.empty()));
        });
    }
    requestHelper.updateRequest(request, oldRequest, requestState, historyType, user.getEmail(), skipHealthchecks, message, maybeBounceRequest);
}
Also used : SingularityRequest(com.hubspot.singularity.SingularityRequest) ArrayList(java.util.ArrayList) SingularityRequestDeployState(com.hubspot.singularity.SingularityRequestDeployState) RequestState(com.hubspot.singularity.RequestState) SingularityTaskHealthcheckResult(com.hubspot.singularity.SingularityTaskHealthcheckResult) SingularityPendingDeploy(com.hubspot.singularity.SingularityPendingDeploy) SingularityTaskCleanup(com.hubspot.singularity.SingularityTaskCleanup) SingularityTaskId(com.hubspot.singularity.SingularityTaskId)

Example 5 with SingularityPendingDeploy

use of com.hubspot.singularity.SingularityPendingDeploy in project Singularity by HubSpot.

the class SingularityCrashLoops method getActiveCrashLoops.

/**
 * Evaluates the task failure events of a deploy and returns every crash loop detected.
 *
 * <p>Returns an empty list when there are no task failure events, when the deploy is the
 * request's current pending deploy, or when the request cannot be found. Otherwise the
 * following checks run in order, each possibly adding a {@link CrashLoopInfo}: fast failure
 * (cooldown), startup failure (a and b), OOM, single-instance failure, multi-instance
 * failure and, for long-running requests only, slow failures and unexpected exits.
 *
 * @param deployStatistics statistics (request id, deploy id, failure events) of the deploy to evaluate
 * @return a mutable list of detected crash loops, possibly empty
 */
List<CrashLoopInfo> getActiveCrashLoops(SingularityDeployStatistics deployStatistics) {
    List<CrashLoopInfo> active = new ArrayList<>();
    if (deployStatistics.getTaskFailureEvents().isEmpty()) {
        return active;
    }
    // Deploys that are still pending are not evaluated.
    Optional<SingularityPendingDeploy> maybePending = deployManager.getPendingDeploy(deployStatistics.getRequestId());
    if (maybePending.isPresent() && maybePending.get().getDeployMarker().getDeployId().equals(deployStatistics.getDeployId())) {
        LOG.debug("Not checking cooldown for pending deploy {} - {}", deployStatistics.getRequestId(), deployStatistics.getDeployId());
        return active;
    }
    Optional<SingularityRequestWithState> maybeRequest = requestManager.getRequest(deployStatistics.getRequestId());
    if (!maybeRequest.isPresent()) {
        return active;
    }
    // Single "now" shared by every time-window computation below.
    long now = System.currentTimeMillis();
    // Check fast failures
    Optional<Long> maybeCooldownStart = cooldownStart(deployStatistics, Optional.empty());
    if (maybeCooldownStart.isPresent()) {
        active.add(new CrashLoopInfo(deployStatistics.getRequestId(), deployStatistics.getDeployId(), maybeCooldownStart.get(), Optional.empty(), CrashLoopType.FAST_FAILURE_LOOP));
    }
    /*
     * Startup failure loop
     * a) small count of failures but instance num matches one that is in cleaning state waiting for a replacement
     */
    // Latest cleanup timestamp per instance number, restricted to this request and deploy.
    Map<Integer, Long> taskCleanStartTimes = taskManager.getCleanupTasks().stream().filter(t -> t.getTaskId().getRequestId().equals(deployStatistics.getRequestId()) && t.getTaskId().getDeployId().equals(deployStatistics.getDeployId())).collect(Collectors.toMap(t -> t.getTaskId().getInstanceNo(), SingularityTaskCleanup::getTimestamp, Math::max));
    // STARTUP_FAILURE timestamps grouped by instance, only for instances that have a cleanup entry.
    Map<Integer, List<Long>> recentStartupFailures = deployStatistics.getTaskFailureEvents().stream().filter(e -> e.getType() == TaskFailureType.STARTUP_FAILURE && taskCleanStartTimes.containsKey(e.getInstance())).collect(Collectors.groupingBy(TaskFailureEvent::getInstance, Collectors.mapping(TaskFailureEvent::getTimestamp, Collectors.toList())));
    boolean hasStartupFailure = false;
    for (Map.Entry<Integer, List<Long>> entry : recentStartupFailures.entrySet()) {
        // NOTE(review): the filter above already limits keys to those in taskCleanStartTimes, so this check looks redundant.
        if (taskCleanStartTimes.containsKey(entry.getKey())) {
            // More than two startup failures after the instance's cleanup started; stop at the first such instance.
            if (entry.getValue().stream().filter(t -> t > taskCleanStartTimes.get(entry.getKey())).count() > 2) {
                active.add(new CrashLoopInfo(deployStatistics.getRequestId(), deployStatistics.getDeployId(), entry.getValue().stream().min(Comparator.comparingLong(Long::longValue)).get(), Optional.empty(), CrashLoopType.STARTUP_FAILURE_LOOP));
                hasStartupFailure = true;
                break;
            }
        }
    }
    /*
     * Startup failure loop
     * b) multiple instances failing healthchecks too many times in X minutes
     */
    // NOTE(review): (b) only runs when (a) already reported a startup failure loop, so it can add a
    // second STARTUP_FAILURE_LOOP entry — confirm this is intended rather than !hasStartupFailure.
    if (hasStartupFailure) {
        long startupFailThreshold = now - TimeUnit.MINUTES.toMillis(configuration.getEvaluateStartupLoopOverMinutes());
        List<Long> recentStartupFailTimestamps = recentStartupFailures.values().stream().flatMap(List::stream).filter(t -> t > startupFailThreshold).collect(Collectors.toList());
        // NOTE(review): min(...).get() below relies on the list being non-empty, i.e. on a non-negative threshold — confirm.
        if (recentStartupFailTimestamps.size() > configuration.getStartupFailureThreshold()) {
            active.add(new CrashLoopInfo(deployStatistics.getRequestId(), deployStatistics.getDeployId(), recentStartupFailTimestamps.stream().min(Comparator.comparingLong(Long::longValue)).get(), Optional.empty(), CrashLoopType.STARTUP_FAILURE_LOOP));
        }
    }
    /*
     * OOM Danger. > X OOMs in Y minutes across all instances
     */
    long thresholdOomTime = now - TimeUnit.MINUTES.toMillis(configuration.getEvaluateOomsOverMinutes());
    List<Long> oomFailures = deployStatistics.getTaskFailureEvents().stream().filter(e -> e.getType() == TaskFailureType.OOM && e.getTimestamp() > thresholdOomTime).map(TaskFailureEvent::getTimestamp).collect(Collectors.toList());
    // NOTE(review): the comparison is >= although the comment above says "> X"; min(...).get()
    // also assumes the threshold is positive so the list cannot be empty here — confirm.
    if (oomFailures.size() >= configuration.getOomFailureThreshold()) {
        active.add(new CrashLoopInfo(deployStatistics.getRequestId(), deployStatistics.getDeployId(), oomFailures.stream().min(Comparator.comparingLong(Long::longValue)).get(), Optional.empty(), CrashLoopType.OOM));
    }
    /*
     * Single instance failure. > X failures with same instance no in X minutes, bucketed to avoid counting fast failure as one of these
     * Multi instance failure. > X% of instances failing within Y minutes
     */
    // OOM, BAD_EXIT_CODE and OUT_OF_DISK_SPACE timestamps grouped by instance number.
    Map<Integer, List<Long>> recentFailuresByInstance = deployStatistics.getTaskFailureEvents().stream().filter(e -> e.getType() == TaskFailureType.OOM || e.getType() == TaskFailureType.BAD_EXIT_CODE || e.getType() == TaskFailureType.OUT_OF_DISK_SPACE).collect(Collectors.groupingBy(TaskFailureEvent::getInstance, Collectors.mapping(TaskFailureEvent::getTimestamp, Collectors.toList())));
    // At most one SINGLE_INSTANCE_FAILURE_LOOP is reported: the loop stops at the first matching instance.
    for (Map.Entry<Integer, List<Long>> entry : recentFailuresByInstance.entrySet()) {
        Optional<Long> maybeCrashStart = getStartForFailuresInBuckets(now, entry.getValue(), TimeUnit.MINUTES.toMillis(configuration.getSingleInstanceFailureBucketSizeMinutes()), configuration.getSingleInstanceFailureBuckets(), configuration.getSingleInstanceFailureThreshold(), configuration.getSingleInstanceMinBucketIndexPercent());
        if (maybeCrashStart.isPresent()) {
            active.add(new CrashLoopInfo(deployStatistics.getRequestId(), deployStatistics.getDeployId(), maybeCrashStart.get(), Optional.empty(), CrashLoopType.SINGLE_INSTANCE_FAILURE_LOOP));
            break;
        }
    }
    // The multi-instance check pools all instances' timestamps and requires failures on more than one instance.
    Optional<Long> maybeMultiCrashStart = getStartForFailuresInBuckets(now, recentFailuresByInstance.values().stream().flatMap(List::stream).collect(Collectors.toList()), TimeUnit.MINUTES.toMillis(configuration.getMultiInstanceFailureBucketSizeMinutes()), configuration.getMultiInstanceFailureBuckets(), configuration.getMultiInstanceFailureThreshold(), configuration.getMultiInstanceMinBucketIndexPercent());
    if (recentFailuresByInstance.size() > 1 && maybeMultiCrashStart.isPresent()) {
        active.add(new CrashLoopInfo(deployStatistics.getRequestId(), deployStatistics.getDeployId(), maybeMultiCrashStart.get(), Optional.empty(), CrashLoopType.MULTI_INSTANCE_FAILURE));
    }
    // The remaining checks apply to long-running requests only.
    if (maybeRequest.get().getRequest().isLongRunning()) {
        /*
       * Slow failures. Occasional failures, count on order of hours, looking for consistency in non-zero count each hour
       */
        getStartForFailuresInBuckets(now, recentFailuresByInstance, TimeUnit.MINUTES.toMillis(configuration.getSlowFailureBucketSizeMinutes()), configuration.getSlowFailureBuckets(), configuration.getSlowFailureThreshold(), configuration.getSlowFailureMinBucketIndexPercent()).ifPresent(start -> active.add(new CrashLoopInfo(deployStatistics.getRequestId(), deployStatistics.getDeployId(), start, Optional.empty(), CrashLoopType.SLOW_FAILURES)));
        getUnexpectedExitLoop(now, deployStatistics).ifPresent(active::add);
    }
    return active;
}
Also used : SingularityRequestWithState(com.hubspot.singularity.SingularityRequestWithState) IntStream(java.util.stream.IntStream) CrashLoopType(com.hubspot.singularity.CrashLoopType) SingularityRequest(com.hubspot.singularity.SingularityRequest) DeployManager(com.hubspot.singularity.data.DeployManager) TaskFailureType(com.hubspot.singularity.TaskFailureType) Inject(com.google.inject.Inject) RequestManager(com.hubspot.singularity.data.RequestManager) LoggerFactory(org.slf4j.LoggerFactory) Singleton(javax.inject.Singleton) SingularityDeployStatistics(com.hubspot.singularity.SingularityDeployStatistics) ArrayList(java.util.ArrayList) TaskFailureEvent(com.hubspot.singularity.TaskFailureEvent) Map(java.util.Map) TaskManager(com.hubspot.singularity.data.TaskManager) SingularityConfiguration(com.hubspot.singularity.config.SingularityConfiguration) SingularityTaskCleanup(com.hubspot.singularity.SingularityTaskCleanup) CrashLoopConfiguration(com.hubspot.singularity.config.CrashLoopConfiguration) Logger(org.slf4j.Logger) SingularityPendingDeploy(com.hubspot.singularity.SingularityPendingDeploy) Collectors(java.util.stream.Collectors) RequestState(com.hubspot.singularity.RequestState) TimeUnit(java.util.concurrent.TimeUnit) List(java.util.List) CrashLoopInfo(com.hubspot.singularity.CrashLoopInfo) Optional(java.util.Optional) Comparator(java.util.Comparator) CrashLoopInfo(com.hubspot.singularity.CrashLoopInfo) ArrayList(java.util.ArrayList) SingularityPendingDeploy(com.hubspot.singularity.SingularityPendingDeploy) SingularityRequestWithState(com.hubspot.singularity.SingularityRequestWithState) ArrayList(java.util.ArrayList) List(java.util.List) Map(java.util.Map) TaskFailureEvent(com.hubspot.singularity.TaskFailureEvent)

Aggregations

SingularityPendingDeploy (com.hubspot.singularity.SingularityPendingDeploy)19 SingularityRequestWithState (com.hubspot.singularity.SingularityRequestWithState)13 SingularityRequest (com.hubspot.singularity.SingularityRequest)11 SingularityTaskId (com.hubspot.singularity.SingularityTaskId)10 ArrayList (java.util.ArrayList)10 RequestState (com.hubspot.singularity.RequestState)8 SingularityPendingRequest (com.hubspot.singularity.SingularityPendingRequest)8 SingularityTask (com.hubspot.singularity.SingularityTask)8 List (java.util.List)8 SingularityRequestDeployState (com.hubspot.singularity.SingularityRequestDeployState)7 Optional (java.util.Optional)7 SingularityDeploy (com.hubspot.singularity.SingularityDeploy)6 SingularityDeployKey (com.hubspot.singularity.SingularityDeployKey)6 SingularityDeployProgress (com.hubspot.singularity.SingularityDeployProgress)6 Map (java.util.Map)6 Inject (com.google.inject.Inject)5 RequestType (com.hubspot.singularity.RequestType)5 SingularityDeployMarker (com.hubspot.singularity.SingularityDeployMarker)5 PendingType (com.hubspot.singularity.SingularityPendingRequest.PendingType)5 SingularityTaskCleanup (com.hubspot.singularity.SingularityTaskCleanup)5