Search in sources :

Example 1 with SingularityDisasterDataPoint

use of com.hubspot.singularity.SingularityDisasterDataPoint in project Singularity by HubSpot.

the class SingularityDisasterDetectionPoller method tooManyLostTasks.

/**
 * Decides whether the number of recently lost tasks is a critical share of the active tasks.
 *
 * <p>Sums {@code getNumLostTasks()} over every data point whose timestamp lies within
 * {@code disasterConfiguration.getIncludeLostTasksInLastMillis()} of {@code now}, then divides
 * that sum by the active-task count of the first data point (floored at 1 so the division can
 * never be by zero).
 *
 * @param now        reference time the data point timestamps are compared against
 * @param dataPoints collected data points; element 0 supplies the active-task count
 *                   (presumably the most recent sample - confirm against the caller)
 * @return {@code true} when the lost-task portion exceeds
 *         {@code disasterConfiguration.getCriticalLostTaskPortion()}; {@code false} when there
 *         are no data points to evaluate
 */
private boolean tooManyLostTasks(long now, List<SingularityDisasterDataPoint> dataPoints) {
    // Without any data point there is no baseline to compare against; dataPoints.get(0) below
    // would otherwise throw on an empty list.
    if (dataPoints == null || dataPoints.isEmpty()) {
        return false;
    }
    int totalLostTasks = 0;
    for (SingularityDisasterDataPoint dataPoint : dataPoints) {
        // Only count losses that fall inside the configured look-back window.
        if (now - dataPoint.getTimestamp() < disasterConfiguration.getIncludeLostTasksInLastMillis()) {
            totalLostTasks += dataPoint.getNumLostTasks();
        }
    }
    double lostTasksPortion = totalLostTasks / (double) Math.max(dataPoints.get(0).getNumActiveTasks(), 1);
    return lostTasksPortion > disasterConfiguration.getCriticalLostTaskPortion();
}
Also used : SingularityDisasterDataPoint(com.hubspot.singularity.SingularityDisasterDataPoint) SingularityDisasterDataPoint(com.hubspot.singularity.SingularityDisasterDataPoint)

Example 2 with SingularityDisasterDataPoint

use of com.hubspot.singularity.SingularityDisasterDataPoint in project Singularity by HubSpot.

the class SingularityDisasterDetectionPoller method tooManyLostSlaves.

/**
 * Decides whether the number of recently lost slaves is a critical share of the slave pool.
 *
 * <p>Sums {@code getNumLostSlaves()} over every data point whose timestamp lies within
 * {@code disasterConfiguration.getIncludeLostSlavesInLastMillis()} of {@code now}, then divides
 * that sum by the first data point's active-plus-lost slave count (floored at 1 so the division
 * can never be by zero).
 *
 * @param now        reference time the data point timestamps are compared against
 * @param dataPoints collected data points; element 0 supplies the slave-count baseline
 *                   (presumably the most recent sample - confirm against the caller)
 * @return {@code true} when the lost-slave portion exceeds
 *         {@code disasterConfiguration.getCriticalLostSlavePortion()}; {@code false} when there
 *         are no data points to evaluate
 */
private boolean tooManyLostSlaves(long now, List<SingularityDisasterDataPoint> dataPoints) {
    // Without any data point there is no baseline to compare against; dataPoints.get(0) below
    // would otherwise throw on an empty list.
    if (dataPoints == null || dataPoints.isEmpty()) {
        return false;
    }
    int totalLostSlaves = 0;
    for (SingularityDisasterDataPoint dataPoint : dataPoints) {
        // Only count losses that fall inside the configured look-back window.
        if (now - dataPoint.getTimestamp() < disasterConfiguration.getIncludeLostSlavesInLastMillis()) {
            totalLostSlaves += dataPoint.getNumLostSlaves();
        }
    }
    // Baseline is the slaves still active plus the ones the first data point reports as lost.
    double lostSlavesPortion = totalLostSlaves / (double) (Math.max(dataPoints.get(0).getNumActiveSlaves() + dataPoints.get(0).getNumLostSlaves(), 1));
    return lostSlavesPortion > disasterConfiguration.getCriticalLostSlavePortion();
}
Also used : SingularityDisasterDataPoint(com.hubspot.singularity.SingularityDisasterDataPoint) SingularityDisasterDataPoint(com.hubspot.singularity.SingularityDisasterDataPoint)

Example 3 with SingularityDisasterDataPoint

use of com.hubspot.singularity.SingularityDisasterDataPoint in project Singularity by HubSpot.

the class SingularityDisasterDetectionPoller method runActionOnPoll.

/**
 * Runs one disaster-detection cycle.
 *
 * <p>Steps, in order: clear expired disabled actions, read the currently active disasters and
 * the stored data points, collect a fresh data point and put it at the head of the list, trim
 * the list to the configured history size, evaluate the data points, then persist both the
 * active-disaster set and the updated history. When disasters are active, disabled actions are
 * added (unless automated disabled actions are switched off) and an email is queued if any of
 * the active disasters was not already active before this poll. When no disaster is active,
 * system-generated disabled actions are cleared.
 */
@Override
public void runActionOnPoll() {
    LOG.trace("Starting disaster detection");
    clearExpiredDisabledActions();
    List<SingularityDisasterType> previouslyActiveDisasters = disasterManager.getActiveDisasters();
    List<SingularityDisasterDataPoint> dataPoints = disasterManager.getDisasterStats().getDataPoints();
    SingularityDisasterDataPoint newStats = collectDisasterStats();
    // Newest data point always lives at index 0.
    dataPoints.add(0, newStats);
    // Trim in a loop: a single removal per poll would never shrink a history that exceeds the
    // limit by more than one entry (e.g. after the configured size was lowered). The limit is
    // floored at 1 so the data point just collected is always retained for evaluation.
    while (dataPoints.size() > Math.max(disasterConfiguration.getStatsHistorySize(), 1)) {
        dataPoints.remove(dataPoints.size() - 1);
    }
    LOG.debug("Collected new disaster detection dataPoints: {}", newStats);
    List<SingularityDisasterType> newActiveDisasters = checkDataPoints(dataPoints);
    if (!newActiveDisasters.isEmpty()) {
        LOG.warn("Detected new active disasters: {}", newActiveDisasters);
    }
    disasterManager.updateActiveDisasters(previouslyActiveDisasters, newActiveDisasters);
    disasterManager.saveDisasterStats(new SingularityDisasterDataPoints(dataPoints));
    if (!newActiveDisasters.isEmpty()) {
        if (!disasterManager.isAutomatedDisabledActionsDisabled()) {
            disasterManager.addDisabledActionsForDisasters(newActiveDisasters);
        }
        // Only notify when at least one disaster was not already active on the previous poll.
        if (!previouslyActiveDisasters.containsAll(newActiveDisasters)) {
            queueDisasterEmail(dataPoints, newActiveDisasters);
        }
    } else {
        disasterManager.clearSystemGeneratedDisabledActions();
    }
}
Also used : SingularityDisasterDataPoints(com.hubspot.singularity.SingularityDisasterDataPoints) SingularityDisasterType(com.hubspot.singularity.SingularityDisasterType) SingularityDisasterDataPoint(com.hubspot.singularity.SingularityDisasterDataPoint)

Example 4 with SingularityDisasterDataPoint

use of com.hubspot.singularity.SingularityDisasterDataPoint in project Singularity by HubSpot.

the class SingularityDisasterDetectionPoller method collectDisasterStats.

/**
 * Builds a data point describing the current task and slave state.
 *
 * <p>The data point carries: the current wall-clock time, the active and pending task counts,
 * the number of pending tasks whose lag exceeds
 * {@code configuration.getDeltaAfterWhichTasksAreLateMillis()}, the average lag over pending
 * tasks that are past their next-run time, the number of lost tasks recorded for the configured
 * lost-task reasons, the number of slaves that are neither DEAD nor MISSING_ON_STARTUP, and the
 * lost-slave counter.
 *
 * <p>Side effects: the lost-slave counter is reset to 0 and the recorded lost-task reasons are
 * cleared.
 *
 * @return the newly collected data point
 */
private SingularityDisasterDataPoint collectDisasterStats() {
    final long now = System.currentTimeMillis();
    final int numActiveTasks = taskManager.getNumActiveTasks();
    final List<SingularityPendingTaskId> pendingTaskIds = taskManager.getPendingTaskIds();
    final int numPendingTasks = pendingTaskIds.size();

    int lateCount = 0;
    int pastDueCount = 0;
    long lagSumMillis = 0;
    for (SingularityPendingTaskId pendingTaskId : pendingTaskIds) {
        final long lagMillis = now - pendingTaskId.getNextRunAt();
        if (lagMillis <= 0) {
            // Not yet due: contributes neither to the average lag nor to the late count.
            continue;
        }
        pastDueCount++;
        lagSumMillis += lagMillis;
        if (lagMillis > configuration.getDeltaAfterWhichTasksAreLateMillis()) {
            lateCount++;
        }
    }
    // Floor the divisor at 1 so an empty past-due set yields an average of 0.
    final long avgLagMillis = lagSumMillis / Math.max(pastDueCount, 1);

    int runningSlaves = 0;
    for (SingularitySlave slave : slaveManager.getObjects()) {
        if (slave.getCurrentState().getState() == MachineState.DEAD || slave.getCurrentState().getState() == MachineState.MISSING_ON_STARTUP) {
            continue;
        }
        runningSlaves++;
    }

    // Read-and-reset so every lost slave is reported in exactly one data point.
    final int lostSlaves = activeSlavesLost.getAndSet(0);

    int lostTasks = 0;
    for (Reason reason : disasterConfiguration.getLostTaskReasons()) {
        lostTasks += taskLostReasons.count(reason);
    }
    taskLostReasons.clear();

    return new SingularityDisasterDataPoint(now, numActiveTasks, numPendingTasks, lateCount, avgLagMillis, lostTasks, runningSlaves, lostSlaves);
}
Also used : SingularitySlave(com.hubspot.singularity.SingularitySlave) SingularityPendingTaskId(com.hubspot.singularity.SingularityPendingTaskId) SingularityDisasterDataPoint(com.hubspot.singularity.SingularityDisasterDataPoint) SingularityDisasterDataPoint(com.hubspot.singularity.SingularityDisasterDataPoint) Reason(org.apache.mesos.v1.Protos.TaskStatus.Reason)

Example 5 with SingularityDisasterDataPoint

use of com.hubspot.singularity.SingularityDisasterDataPoint in project Singularity by HubSpot.

the class SingularityDisasterDetectionPoller method tooMuchTaskLag.

/**
 * Decides whether task lag has stayed over the configured thresholds for long enough.
 *
 * <p>Walks the data points in list order and, for each one, evaluates four conditions: overdue
 * task portion above the critical / warning portion, and average task lag above the critical /
 * warning lag (the critical lag condition additionally requires the warning portion condition).
 * Each condition that holds records that data point's timestamp, overwriting any timestamp
 * recorded from an earlier list element. The walk stops at the first data point for which none
 * of the four conditions hold.
 *
 * @param now        reference time the recorded timestamps are compared against
 * @param dataPoints data points to inspect, in iteration order
 * @return {@code true} if either critical condition, or both warning conditions together, have
 *         a recorded timestamp more than
 *         {@code disasterConfiguration.getTriggerAfterMillisOverTaskLagThreshold()} before
 *         {@code now}
 */
private boolean tooMuchTaskLag(long now, List<SingularityDisasterDataPoint> dataPoints) {
    Optional<Long> criticalLagSince = Optional.absent();
    Optional<Long> warningLagSince = Optional.absent();
    Optional<Long> criticalPortionSince = Optional.absent();
    Optional<Long> warningPortionSince = Optional.absent();

    for (SingularityDisasterDataPoint point : dataPoints) {
        double latePortion = point.getNumLateTasks() / (double) Math.max((point.getNumActiveTasks() + point.getNumPendingTasks()), 1);
        boolean portionCritical = latePortion > disasterConfiguration.getCriticalOverdueTaskPortion();
        boolean portionWarning = latePortion > disasterConfiguration.getWarningOverdueTaskPortion();
        boolean lagCritical = point.getAvgTaskLagMillis() > disasterConfiguration.getCriticalAvgTaskLagMillis() && portionWarning;
        boolean lagWarning = point.getAvgTaskLagMillis() > disasterConfiguration.getWarningAvgTaskLagMillis();

        // Nothing is over threshold at this data point: stop looking further down the list.
        if (!(portionCritical || portionWarning || lagCritical || lagWarning)) {
            break;
        }

        Optional<Long> observedAt = Optional.of(point.getTimestamp());
        if (portionCritical) {
            criticalPortionSince = observedAt;
        }
        if (portionWarning) {
            warningPortionSince = observedAt;
        }
        if (lagCritical) {
            criticalLagSince = observedAt;
        }
        if (lagWarning) {
            warningLagSince = observedAt;
        }
    }

    // 'true' if either critical condition is met
    if (criticalLagSince.isPresent() && now - criticalLagSince.get() > disasterConfiguration.getTriggerAfterMillisOverTaskLagThreshold()) {
        return true;
    }
    if (criticalPortionSince.isPresent() && now - criticalPortionSince.get() > disasterConfiguration.getTriggerAfterMillisOverTaskLagThreshold()) {
        return true;
    }

    // 'true' if both warning conditions are met
    if (!warningLagSince.isPresent()) {
        return false;
    }
    if (!(now - warningLagSince.get() > disasterConfiguration.getTriggerAfterMillisOverTaskLagThreshold())) {
        return false;
    }
    if (!warningPortionSince.isPresent()) {
        return false;
    }
    return now - warningPortionSince.get() > disasterConfiguration.getTriggerAfterMillisOverTaskLagThreshold();
}
Also used : SingularityDisasterDataPoint(com.hubspot.singularity.SingularityDisasterDataPoint)

Aggregations

SingularityDisasterDataPoint (com.hubspot.singularity.SingularityDisasterDataPoint): 5, SingularityDisasterDataPoints (com.hubspot.singularity.SingularityDisasterDataPoints): 1, SingularityDisasterType (com.hubspot.singularity.SingularityDisasterType): 1, SingularityPendingTaskId (com.hubspot.singularity.SingularityPendingTaskId): 1, SingularitySlave (com.hubspot.singularity.SingularitySlave): 1, Reason (org.apache.mesos.v1.Protos.TaskStatus.Reason): 1