Search in sources :

Example 16 with SingularityTaskHistoryUpdate

use of com.hubspot.singularity.SingularityTaskHistoryUpdate in project Singularity by HubSpot.

the class SingularityScheduler method handleCompletedTask.

@Timed
public void handleCompletedTask(Optional<SingularityTask> task, SingularityTaskId taskId, boolean wasActive, long timestamp, ExtendedTaskState state, SingularityCreateResult taskHistoryUpdateCreateResult, Protos.TaskStatus status) {
    final SingularityDeployStatistics deployStatistics = getDeployStatistics(taskId.getRequestId(), taskId.getDeployId());
    if (wasActive) {
        taskManager.deleteActiveTask(taskId.getId());
    }
    if (!task.isPresent() || task.get().getTaskRequest().getRequest().isLoadBalanced()) {
        taskManager.createLBCleanupTask(taskId);
    }
    if (requestManager.isBouncing(taskId.getRequestId())) {
        List<SingularityTaskId> activeTaskIds = taskManager.getActiveTaskIdsForRequest(taskId.getRequestId());
        boolean foundBouncingTask = false;
        for (SingularityTaskId activeTaskId : activeTaskIds) {
            Optional<SingularityTaskHistoryUpdate> maybeCleaningUpdate = taskManager.getTaskHistoryUpdate(activeTaskId, ExtendedTaskState.TASK_CLEANING);
            if (maybeCleaningUpdate.isPresent()) {
                if (maybeCleaningUpdate.get().getStatusReason().or("").contains("BOUNCE")) {
                    // TaskCleanupType enum is included in status message
                    LOG.debug("Found task {} still waiting for bounce to complete", activeTaskId);
                    foundBouncingTask = true;
                    break;
                } else if (!maybeCleaningUpdate.get().getPrevious().isEmpty()) {
                    for (SingularityTaskHistoryUpdate previousUpdate : maybeCleaningUpdate.get().getPrevious()) {
                        if (previousUpdate.getStatusMessage().or("").contains("BOUNCE")) {
                            LOG.debug("Found task {} still waiting for bounce to complete", activeTaskId);
                            foundBouncingTask = true;
                            break;
                        }
                    }
                }
            }
        }
        if (!foundBouncingTask) {
            LOG.info("Bounce completed for request {}, no cleaning tasks due to bounce found", taskId.getRequestId());
            Optional<SingularityExpiringBounce> expiringBounce = requestManager.getExpiringBounce(taskId.getRequestId());
            if (expiringBounce.isPresent() && expiringBounce.get().getDeployId().equals(taskId.getDeployId())) {
                requestManager.deleteExpiringObject(SingularityExpiringBounce.class, taskId.getRequestId());
            }
            requestManager.markBounceComplete(taskId.getRequestId());
        }
    }
    final Optional<PendingType> scheduleResult = handleCompletedTaskWithStatistics(task, taskId, timestamp, state, deployStatistics, taskHistoryUpdateCreateResult, status);
    if (taskHistoryUpdateCreateResult == SingularityCreateResult.EXISTED) {
        return;
    }
    updateDeployStatistics(deployStatistics, taskId, task, timestamp, state, scheduleResult);
}
Also used : PendingType(com.hubspot.singularity.SingularityPendingRequest.PendingType) SingularityTaskHistoryUpdate(com.hubspot.singularity.SingularityTaskHistoryUpdate) SingularityExpiringBounce(com.hubspot.singularity.expiring.SingularityExpiringBounce) SingularityDeployStatistics(com.hubspot.singularity.SingularityDeployStatistics) SingularityTaskId(com.hubspot.singularity.SingularityTaskId) Timed(com.codahale.metrics.annotation.Timed)

Example 17 with SingularityTaskHistoryUpdate

use of com.hubspot.singularity.SingularityTaskHistoryUpdate in project Singularity by HubSpot.

the class SingularityUsagePoller method collectSlaveUage.

private void collectSlaveUage(SingularitySlave slave, long now, Map<String, RequestUtilization> utilizationPerRequestId, Map<SingularitySlaveUsage, List<TaskIdWithUsage>> overLoadedHosts, AtomicLong totalMemBytesUsed, AtomicLong totalMemBytesAvailable, AtomicDouble totalCpuUsed, AtomicDouble totalCpuAvailable, AtomicLong totalDiskBytesUsed, AtomicLong totalDiskBytesAvailable) {
    Map<ResourceUsageType, Number> longRunningTasksUsage = new HashMap<>();
    longRunningTasksUsage.put(ResourceUsageType.MEMORY_BYTES_USED, 0);
    longRunningTasksUsage.put(ResourceUsageType.CPU_USED, 0);
    longRunningTasksUsage.put(ResourceUsageType.DISK_BYTES_USED, 0);
    Optional<Long> memoryMbTotal = Optional.absent();
    Optional<Double> cpusTotal = Optional.absent();
    Optional<Long> diskMbTotal = Optional.absent();
    long memoryMbReservedOnSlave = 0;
    double cpuReservedOnSlave = 0;
    long diskMbReservedOnSlave = 0;
    long memoryBytesUsedOnSlave = 0;
    double cpusUsedOnSlave = 0;
    long diskMbUsedOnSlave = 0;
    try {
        List<MesosTaskMonitorObject> allTaskUsage = mesosClient.getSlaveResourceUsage(slave.getHost());
        MesosSlaveMetricsSnapshotObject slaveMetricsSnapshot = mesosClient.getSlaveMetricsSnapshot(slave.getHost());
        double systemMemTotalBytes = 0;
        double systemMemFreeBytes = 0;
        double systemLoad1Min = 0;
        double systemLoad5Min = 0;
        double systemLoad15Min = 0;
        double slaveDiskUsed = 0;
        double slaveDiskTotal = 0;
        double systemCpusTotal = 0;
        if (slaveMetricsSnapshot != null) {
            systemMemTotalBytes = slaveMetricsSnapshot.getSystemMemTotalBytes();
            systemMemFreeBytes = slaveMetricsSnapshot.getSystemMemFreeBytes();
            systemLoad1Min = slaveMetricsSnapshot.getSystemLoad1Min();
            systemLoad5Min = slaveMetricsSnapshot.getSystemLoad5Min();
            systemLoad15Min = slaveMetricsSnapshot.getSystemLoad15Min();
            slaveDiskUsed = slaveMetricsSnapshot.getSlaveDiskUsed();
            slaveDiskTotal = slaveMetricsSnapshot.getSlaveDiskTotal();
            systemCpusTotal = slaveMetricsSnapshot.getSystemCpusTotal();
        }
        double systemLoad;
        switch(configuration.getMesosConfiguration().getScoreUsingSystemLoad()) {
            case LOAD_1:
                systemLoad = systemLoad1Min;
                break;
            case LOAD_15:
                systemLoad = systemLoad15Min;
                break;
            case LOAD_5:
            default:
                systemLoad = systemLoad5Min;
                break;
        }
        boolean slaveOverloaded = systemCpusTotal > 0 && systemLoad / systemCpusTotal > 1.0;
        List<TaskIdWithUsage> possibleTasksToShuffle = new ArrayList<>();
        for (MesosTaskMonitorObject taskUsage : allTaskUsage) {
            String taskId = taskUsage.getSource();
            SingularityTaskId task;
            try {
                task = SingularityTaskId.valueOf(taskId);
            } catch (InvalidSingularityTaskIdException e) {
                LOG.error("Couldn't get SingularityTaskId for {}", taskUsage);
                continue;
            }
            SingularityTaskUsage latestUsage = getUsage(taskUsage);
            List<SingularityTaskUsage> pastTaskUsages = usageManager.getTaskUsage(taskId);
            clearOldUsage(taskId);
            usageManager.saveSpecificTaskUsage(taskId, latestUsage);
            Optional<SingularityTask> maybeTask = taskManager.getTask(task);
            Optional<Resources> maybeResources = Optional.absent();
            if (maybeTask.isPresent()) {
                maybeResources = maybeTask.get().getTaskRequest().getPendingTask().getResources().or(maybeTask.get().getTaskRequest().getDeploy().getResources());
                if (maybeResources.isPresent()) {
                    Resources taskResources = maybeResources.get();
                    double memoryMbReservedForTask = taskResources.getMemoryMb();
                    double cpuReservedForTask = taskResources.getCpus();
                    double diskMbReservedForTask = taskResources.getDiskMb();
                    memoryMbReservedOnSlave += memoryMbReservedForTask;
                    cpuReservedOnSlave += cpuReservedForTask;
                    diskMbReservedOnSlave += diskMbReservedForTask;
                    updateRequestUtilization(utilizationPerRequestId, pastTaskUsages, latestUsage, task, memoryMbReservedForTask, cpuReservedForTask, diskMbReservedForTask);
                }
            }
            memoryBytesUsedOnSlave += latestUsage.getMemoryTotalBytes();
            diskMbUsedOnSlave += latestUsage.getDiskTotalBytes();
            SingularityTaskCurrentUsage currentUsage = null;
            if (pastTaskUsages.isEmpty()) {
                Optional<SingularityTaskHistoryUpdate> maybeStartingUpdate = taskManager.getTaskHistoryUpdate(task, ExtendedTaskState.TASK_STARTING);
                if (maybeStartingUpdate.isPresent()) {
                    long startTimestampSeconds = TimeUnit.MILLISECONDS.toSeconds(maybeStartingUpdate.get().getTimestamp());
                    double usedCpusSinceStart = latestUsage.getCpuSeconds() / (latestUsage.getTimestamp() - startTimestampSeconds);
                    if (isLongRunning(task) || isConsideredLongRunning(task)) {
                        updateLongRunningTasksUsage(longRunningTasksUsage, latestUsage.getMemoryTotalBytes(), usedCpusSinceStart, latestUsage.getDiskTotalBytes());
                    }
                    currentUsage = new SingularityTaskCurrentUsage(latestUsage.getMemoryTotalBytes(), now, usedCpusSinceStart, latestUsage.getDiskTotalBytes());
                    usageManager.saveCurrentTaskUsage(taskId, currentUsage);
                    cpusUsedOnSlave += usedCpusSinceStart;
                }
            } else {
                SingularityTaskUsage lastUsage = pastTaskUsages.get(pastTaskUsages.size() - 1);
                double taskCpusUsed = ((latestUsage.getCpuSeconds() - lastUsage.getCpuSeconds()) / (latestUsage.getTimestamp() - lastUsage.getTimestamp()));
                if (isLongRunning(task) || isConsideredLongRunning(task)) {
                    updateLongRunningTasksUsage(longRunningTasksUsage, latestUsage.getMemoryTotalBytes(), taskCpusUsed, latestUsage.getDiskTotalBytes());
                }
                currentUsage = new SingularityTaskCurrentUsage(latestUsage.getMemoryTotalBytes(), now, taskCpusUsed, latestUsage.getDiskTotalBytes());
                usageManager.saveCurrentTaskUsage(taskId, currentUsage);
                cpusUsedOnSlave += taskCpusUsed;
            }
            if (configuration.isShuffleTasksForOverloadedSlaves() && currentUsage != null && currentUsage.getCpusUsed() > 0) {
                if (isLongRunning(task) && !configuration.getDoNotShuffleRequests().contains(task.getRequestId())) {
                    Optional<SingularityTaskHistoryUpdate> maybeCleanupUpdate = taskManager.getTaskHistoryUpdate(task, ExtendedTaskState.TASK_CLEANING);
                    if (maybeCleanupUpdate.isPresent() && isTaskAlreadyCleanedUpForShuffle(maybeCleanupUpdate.get())) {
                        LOG.trace("Task {} already being cleaned up to spread cpu usage, skipping", taskId);
                    } else {
                        if (maybeResources.isPresent()) {
                            possibleTasksToShuffle.add(new TaskIdWithUsage(task, maybeResources.get(), currentUsage));
                        }
                    }
                }
            }
        }
        if (!slave.getResources().isPresent() || !slave.getResources().get().getMemoryMegaBytes().isPresent() || !slave.getResources().get().getNumCpus().isPresent()) {
            LOG.debug("Could not find slave or resources for slave {}", slave.getId());
        } else {
            memoryMbTotal = Optional.of(slave.getResources().get().getMemoryMegaBytes().get().longValue());
            cpusTotal = Optional.of(slave.getResources().get().getNumCpus().get().doubleValue());
            diskMbTotal = Optional.of(slave.getResources().get().getDiskSpace().get());
        }
        SingularitySlaveUsage slaveUsage = new SingularitySlaveUsage(cpusUsedOnSlave, cpuReservedOnSlave, cpusTotal, memoryBytesUsedOnSlave, memoryMbReservedOnSlave, memoryMbTotal, diskMbUsedOnSlave, diskMbReservedOnSlave, diskMbTotal, longRunningTasksUsage, allTaskUsage.size(), now, systemMemTotalBytes, systemMemFreeBytes, systemCpusTotal, systemLoad1Min, systemLoad5Min, systemLoad15Min, slaveDiskUsed, slaveDiskTotal);
        if (slaveOverloaded) {
            overLoadedHosts.put(slaveUsage, possibleTasksToShuffle);
        }
        List<Long> slaveTimestamps = usageManager.getSlaveUsageTimestamps(slave.getId());
        if (slaveTimestamps.size() + 1 > configuration.getNumUsageToKeep()) {
            usageManager.deleteSpecificSlaveUsage(slave.getId(), slaveTimestamps.get(0));
        }
        if (slaveUsage.getMemoryBytesTotal().isPresent() && slaveUsage.getCpusTotal().isPresent()) {
            totalMemBytesUsed.getAndAdd(slaveUsage.getMemoryBytesUsed());
            totalCpuUsed.getAndAdd(slaveUsage.getCpusUsed());
            totalDiskBytesUsed.getAndAdd(slaveUsage.getDiskBytesUsed());
            totalMemBytesAvailable.getAndAdd(slaveUsage.getMemoryBytesTotal().get());
            totalCpuAvailable.getAndAdd(slaveUsage.getCpusTotal().get());
            totalDiskBytesAvailable.getAndAdd(slaveUsage.getDiskBytesTotal().get());
        }
        LOG.debug("Saving slave {} usage {}", slave.getHost(), slaveUsage);
        usageManager.saveSpecificSlaveUsageAndSetCurrent(slave.getId(), slaveUsage);
    } catch (Throwable t) {
        String message = String.format("Could not get slave usage for host %s", slave.getHost());
        LOG.error(message, t);
        exceptionNotifier.notify(message, t);
    }
}
Also used : ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) MesosTaskMonitorObject(com.hubspot.mesos.json.MesosTaskMonitorObject) SingularityTaskHistoryUpdate(com.hubspot.singularity.SingularityTaskHistoryUpdate) ResourceUsageType(com.hubspot.singularity.SingularitySlaveUsage.ResourceUsageType) SingularityTaskId(com.hubspot.singularity.SingularityTaskId) SingularityTaskUsage(com.hubspot.singularity.SingularityTaskUsage) SingularityTaskCurrentUsage(com.hubspot.singularity.SingularityTaskCurrentUsage) SingularitySlaveUsage(com.hubspot.singularity.SingularitySlaveUsage) MesosSlaveMetricsSnapshotObject(com.hubspot.mesos.json.MesosSlaveMetricsSnapshotObject) AtomicDouble(com.google.common.util.concurrent.AtomicDouble) InvalidSingularityTaskIdException(com.hubspot.singularity.InvalidSingularityTaskIdException) SingularityTask(com.hubspot.singularity.SingularityTask) AtomicLong(java.util.concurrent.atomic.AtomicLong) Resources(com.hubspot.mesos.Resources)

Example 18 with SingularityTaskHistoryUpdate

use of com.hubspot.singularity.SingularityTaskHistoryUpdate in project Singularity by HubSpot.

the class SingularityDeployHealthHelper method getNoHealthcheckHealthyTasks.

private List<SingularityTaskId> getNoHealthcheckHealthyTasks(final Optional<SingularityDeploy> deploy, final Collection<SingularityTaskId> matchingActiveTasks) {
    final Map<SingularityTaskId, List<SingularityTaskHistoryUpdate>> taskUpdates = taskManager.getTaskHistoryUpdates(matchingActiveTasks);
    final List<SingularityTaskId> healthyTaskIds = Lists.newArrayListWithCapacity(matchingActiveTasks.size());
    for (SingularityTaskId taskId : matchingActiveTasks) {
        Collection<SingularityTaskHistoryUpdate> updates = taskUpdates.get(taskId);
        SimplifiedTaskState currentState = SingularityTaskHistoryUpdate.getCurrentState(updates);
        if (currentState == SimplifiedTaskState.RUNNING && isRunningTaskHealthy(deploy, updates, taskId)) {
            healthyTaskIds.add(taskId);
        }
    }
    return healthyTaskIds;
}
Also used : SingularityTaskHistoryUpdate(com.hubspot.singularity.SingularityTaskHistoryUpdate) ArrayList(java.util.ArrayList) List(java.util.List) SimplifiedTaskState(com.hubspot.singularity.SingularityTaskHistoryUpdate.SimplifiedTaskState) SingularityTaskId(com.hubspot.singularity.SingularityTaskId)

Example 19 with SingularityTaskHistoryUpdate

use of com.hubspot.singularity.SingularityTaskHistoryUpdate in project Singularity by HubSpot.

the class SingularityDeployHealthHelper method getNonHealthcheckedTaskFailure.

private Optional<SingularityDeployFailure> getNonHealthcheckedTaskFailure(Map<SingularityTaskId, List<SingularityTaskHistoryUpdate>> taskUpdates, SingularityTaskId taskId) {
    List<SingularityTaskHistoryUpdate> updates = taskUpdates.get(taskId);
    SingularityTaskHistoryUpdate lastUpdate = Iterables.getLast(updates);
    if (lastUpdate.getTaskState().isSuccess()) {
        return Optional.of(new SingularityDeployFailure(SingularityDeployFailureReason.TASK_EXPECTED_RUNNING_FINISHED, Optional.of(taskId), Optional.of(String.format("Task was expected to maintain TASK_RUNNING state but finished. (%s)", lastUpdate.getStatusMessage().or("")))));
    } else if (lastUpdate.getTaskState().isDone()) {
        return Optional.of(new SingularityDeployFailure(SingularityDeployFailureReason.TASK_FAILED_ON_STARTUP, Optional.of(taskId), lastUpdate.getStatusMessage()));
    } else if (SingularityTaskHistoryUpdate.getCurrentState(updates) == SimplifiedTaskState.WAITING) {
        return Optional.of(new SingularityDeployFailure(SingularityDeployFailureReason.TASK_NEVER_ENTERED_RUNNING, Optional.of(taskId), Optional.of(String.format("Task never entered running state, last state was %s (%s)", lastUpdate.getTaskState().getDisplayName(), lastUpdate.getStatusMessage().or("")))));
    }
    return Optional.absent();
}
Also used : SingularityDeployFailure(com.hubspot.singularity.SingularityDeployFailure) SingularityTaskHistoryUpdate(com.hubspot.singularity.SingularityTaskHistoryUpdate)

Example 20 with SingularityTaskHistoryUpdate

use of com.hubspot.singularity.SingularityTaskHistoryUpdate in project Singularity by HubSpot.

the class SingularityDeployHealthHelper method getHealthcheckedTaskFailure.

private Optional<SingularityDeployFailure> getHealthcheckedTaskFailure(SingularityDeploy deploy, Map<SingularityTaskId, List<SingularityTaskHistoryUpdate>> taskUpdates, SingularityTaskHealthcheckResult healthcheckResult, SingularityTaskId taskId) {
    Collection<SingularityTaskHistoryUpdate> updates = taskUpdates.get(taskId);
    if (!healthcheckResult.isFailed()) {
        return Optional.absent();
    }
    SingularityTaskHistoryUpdate lastUpdate = Iterables.getLast(updates);
    if (lastUpdate.getTaskState().isDone()) {
        if (lastUpdate.getTaskState().isSuccess()) {
            return Optional.of(new SingularityDeployFailure(SingularityDeployFailureReason.TASK_EXPECTED_RUNNING_FINISHED, Optional.of(taskId), Optional.of(String.format("Task was expected to maintain TASK_RUNNING state but finished. (%s)", lastUpdate.getStatusMessage().or("")))));
        } else {
            return Optional.of(new SingularityDeployFailure(SingularityDeployFailureReason.TASK_FAILED_ON_STARTUP, Optional.of(taskId), lastUpdate.getStatusMessage()));
        }
    }
    final Optional<Integer> healthcheckMaxRetries = deploy.getHealthcheck().isPresent() ? deploy.getHealthcheck().get().getMaxRetries().or(configuration.getHealthcheckMaxRetries()) : configuration.getHealthcheckMaxRetries();
    if (healthcheckMaxRetries.isPresent() && taskManager.getNumNonstartupHealthchecks(taskId) > healthcheckMaxRetries.get()) {
        String message = String.format("Instance %s failed %s healthchecks, the max for the deploy.", taskId.getInstanceNo(), healthcheckMaxRetries.get() + 1);
        if (healthcheckResult.getStatusCode().isPresent()) {
            message = String.format("%s Last check returned with status code %s", message, healthcheckResult.getStatusCode().get());
        }
        return Optional.of(new SingularityDeployFailure(SingularityDeployFailureReason.TASK_FAILED_HEALTH_CHECKS, Optional.of(taskId), Optional.of(message)));
    }
    Optional<Long> runningAt = getRunningAt(updates);
    if (runningAt.isPresent()) {
        final long durationSinceRunning = System.currentTimeMillis() - runningAt.get();
        if (healthcheckResult.isStartup() && deploy.getHealthcheck().isPresent() && durationSinceRunning > deploy.getHealthcheck().get().getStartupTimeoutSeconds().or(configuration.getStartupTimeoutSeconds())) {
            String message = String.format("Instance %s has not responded to healthchecks after running for %s", taskId.getInstanceNo(), JavaUtils.durationFromMillis(durationSinceRunning));
            return Optional.of(new SingularityDeployFailure(SingularityDeployFailureReason.TASK_FAILED_HEALTH_CHECKS, Optional.of(taskId), Optional.of(message)));
        }
        if (isRunningLongerThanThreshold(deploy, durationSinceRunning)) {
            String message = String.format("Instance %s has been running for %s and has yet to pass healthchecks.", taskId.getInstanceNo(), JavaUtils.durationFromMillis(durationSinceRunning));
            if (healthcheckResult.getStatusCode().isPresent()) {
                message = String.format("%s Last check returned with status code %s", message, healthcheckResult.getStatusCode().get());
            }
            return Optional.of(new SingularityDeployFailure(SingularityDeployFailureReason.TASK_FAILED_HEALTH_CHECKS, Optional.of(taskId), Optional.of(message)));
        }
    }
    return Optional.absent();
}
Also used : SingularityDeployFailure(com.hubspot.singularity.SingularityDeployFailure) SingularityTaskHistoryUpdate(com.hubspot.singularity.SingularityTaskHistoryUpdate)

Aggregations

SingularityTaskHistoryUpdate (com.hubspot.singularity.SingularityTaskHistoryUpdate)23 SingularityTaskId (com.hubspot.singularity.SingularityTaskId)12 SingularityTask (com.hubspot.singularity.SingularityTask)7 Test (org.junit.Test)5 ArrayList (java.util.ArrayList)4 SimplifiedTaskState (com.hubspot.singularity.SingularityTaskHistoryUpdate.SimplifiedTaskState)3 SingularityTaskStatusHolder (com.hubspot.singularity.SingularityTaskStatusHolder)3 SingularityBounceRequest (com.hubspot.singularity.api.SingularityBounceRequest)3 List (java.util.List)3 Timed (com.codahale.metrics.annotation.Timed)2 SingularityDeployFailure (com.hubspot.singularity.SingularityDeployFailure)2 SingularityTaskIdHistory (com.hubspot.singularity.SingularityTaskIdHistory)2 SingularityTaskMetadata (com.hubspot.singularity.SingularityTaskMetadata)2 AtomicDouble (com.google.common.util.concurrent.AtomicDouble)1 Resources (com.hubspot.mesos.Resources)1 MesosSlaveMetricsSnapshotObject (com.hubspot.mesos.json.MesosSlaveMetricsSnapshotObject)1 MesosTaskMonitorObject (com.hubspot.mesos.json.MesosTaskMonitorObject)1 MesosTaskStatusObject (com.hubspot.mesos.protos.MesosTaskStatusObject)1 ExtendedTaskState (com.hubspot.singularity.ExtendedTaskState)1 InvalidSingularityTaskIdException (com.hubspot.singularity.InvalidSingularityTaskIdException)1