Search in sources :

Example 1 with MesosSlaveMetricsSnapshotObject

use of com.hubspot.mesos.json.MesosSlaveMetricsSnapshotObject in project Singularity by HubSpot.

the class SingularityUsageTest method itCreatesTaskCleanupsWhenAMachineIsOverloaded.

/**
 * Verifies that, with task shuffling enabled, polling usage for an overloaded host creates a
 * {@code REBALANCE_CPU_USAGE} cleanup for the first task and leaves the second task of the same
 * request untouched. The shuffle flag is always reset in the {@code finally} block so other
 * tests sharing {@code configuration} are unaffected.
 */
@Test
public void itCreatesTaskCleanupsWhenAMachineIsOverloaded() {
    try {
        configuration.setShuffleTasksForOverloadedSlaves(true);
        initRequest();
        initFirstDeployWithResources(configuration.getMesosConfiguration().getDefaultCpus(), configuration.getMesosConfiguration().getDefaultMemory());
        saveAndSchedule(requestManager.getRequest(requestId).get().getRequest().toBuilder().setInstances(Optional.of(3)));
        resourceOffers(1);
        SingularitySlaveUsage highUsage = new SingularitySlaveUsage(15, 10, Optional.of(10.0), 1, 1, Optional.of(30L), 1, 1, Optional.of(1024L), Collections.emptyMap(), 1, System.currentTimeMillis(), 1, 30000, 10, 15, 15, 15, 0, 107374182);
        usageManager.saveSpecificSlaveUsageAndSetCurrent("host1", highUsage);
        SingularityTaskId taskId1 = taskManager.getActiveTaskIds().get(0);
        String t1 = taskId1.getId();
        SingularityTaskId taskId2 = taskManager.getActiveTaskIds().get(1);
        String t2 = taskId2.getId();
        SingularityTaskId taskId3 = taskManager.getActiveTaskIds().get(2);
        String t3 = taskId3.getId();
        // The poller looks up the TASK_STARTING update to compute cpu usage for tasks with no prior samples
        statusUpdate(taskManager.getTask(taskId1).get(), TaskState.TASK_STARTING, Optional.of(taskId1.getStartedAt()));
        statusUpdate(taskManager.getTask(taskId2).get(), TaskState.TASK_STARTING, Optional.of(taskId2.getStartedAt()));
        statusUpdate(taskManager.getTask(taskId3).get(), TaskState.TASK_STARTING, Optional.of(taskId3.getStartedAt()));
        // Each sample is stamped 5 seconds after the task's start time.
        // task 1 using 3 cpus
        MesosTaskMonitorObject t1u1 = getTaskMonitor(t1, 15, TimeUnit.MILLISECONDS.toSeconds(taskId1.getStartedAt()) + 5, 1024);
        // task 2 using 2 cpus
        MesosTaskMonitorObject t2u1 = getTaskMonitor(t2, 10, TimeUnit.MILLISECONDS.toSeconds(taskId2.getStartedAt()) + 5, 1024);
        // task 3 using 1 cpus
        MesosTaskMonitorObject t3u1 = getTaskMonitor(t3, 5, TimeUnit.MILLISECONDS.toSeconds(taskId3.getStartedAt()) + 5, 1024);
        mesosClient.setSlaveResourceUsage("host1", Arrays.asList(t1u1, t2u1, t3u1));
        // NOTE(review): positional metrics arguments; presumably the 10.0 and 15 values are the cpu total and load averages that mark the host as overloaded - confirm against the constructor
        mesosClient.setSlaveMetricsSnapshot("host1", new MesosSlaveMetricsSnapshotObject(0, 0, 0, 10.0, 0, 0, 0, 0, 0, 0, 0, 0, 10.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 0, 0, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 0, 0, 0, 0, 0, 0, 15, 0, 0, 0, 0));
        usagePoller.runActionOnPoll();
        // First task is cleaned up (expected value goes first so a failure message reads correctly)
        Assert.assertEquals(TaskCleanupType.REBALANCE_CPU_USAGE, taskManager.getTaskCleanup(taskId1.getId()).get().getCleanupType());
        // Second task is not cleaned up because it is from the same request as task 1
        Assert.assertFalse(taskManager.getTaskCleanup(taskId2.getId()).isPresent());
    } finally {
        configuration.setShuffleTasksForOverloadedSlaves(false);
    }
}
Also used : SingularitySlaveUsage(com.hubspot.singularity.SingularitySlaveUsage) MesosSlaveMetricsSnapshotObject(com.hubspot.mesos.json.MesosSlaveMetricsSnapshotObject) SingularityTaskId(com.hubspot.singularity.SingularityTaskId) MesosTaskMonitorObject(com.hubspot.mesos.json.MesosTaskMonitorObject) Test(org.junit.Test)

Example 2 with MesosSlaveMetricsSnapshotObject

use of com.hubspot.mesos.json.MesosSlaveMetricsSnapshotObject in project Singularity by HubSpot.

the class SingularityUsagePoller method collectSlaveUage.

/**
 * Collects the current resource usage of one slave and records it.
 *
 * <p>Fetches per-task usage and the metrics snapshot for {@code slave.getHost()} from the mesos
 * client, then for every task: stores the latest usage sample, accumulates the resources reserved
 * for the task, derives a cpu usage rate (since the TASK_STARTING update when no earlier sample
 * exists, otherwise since the previous sample), stores the current usage and, when shuffling is
 * enabled, gathers the task as a shuffle candidate. Finally a {@link SingularitySlaveUsage} is
 * built, registered in {@code overLoadedHosts} when the selected load average exceeds the system
 * cpu count, added to the cluster-wide totals and saved.
 *
 * <p>Any {@link Throwable} raised while collecting is logged and passed to the exception notifier;
 * it is never rethrown.
 *
 * @param slave the slave to collect usage for
 * @param now timestamp stored on the usage objects created here
 * @param utilizationPerRequestId per-request utilization, updated via {@code updateRequestUtilization}
 * @param overLoadedHosts receives this slave's usage and shuffle candidates when it is overloaded
 * @param totalMemBytesUsed running total, incremented by this slave's memory bytes used
 * @param totalMemBytesAvailable running total, incremented by this slave's memory total
 * @param totalCpuUsed running total, incremented by this slave's cpus used
 * @param totalCpuAvailable running total, incremented by this slave's cpu total
 * @param totalDiskBytesUsed running total, incremented by this slave's disk bytes used
 * @param totalDiskBytesAvailable running total, incremented by this slave's disk total
 */
private void collectSlaveUage(SingularitySlave slave, long now, Map<String, RequestUtilization> utilizationPerRequestId, Map<SingularitySlaveUsage, List<TaskIdWithUsage>> overLoadedHosts, AtomicLong totalMemBytesUsed, AtomicLong totalMemBytesAvailable, AtomicDouble totalCpuUsed, AtomicDouble totalCpuAvailable, AtomicLong totalDiskBytesUsed, AtomicLong totalDiskBytesAvailable) {
    Map<ResourceUsageType, Number> longRunningTasksUsage = new HashMap<>();
    longRunningTasksUsage.put(ResourceUsageType.MEMORY_BYTES_USED, 0);
    longRunningTasksUsage.put(ResourceUsageType.CPU_USED, 0);
    longRunningTasksUsage.put(ResourceUsageType.DISK_BYTES_USED, 0);
    Optional<Long> memoryMbTotal = Optional.absent();
    Optional<Double> cpusTotal = Optional.absent();
    Optional<Long> diskMbTotal = Optional.absent();
    long memoryMbReservedOnSlave = 0;
    double cpuReservedOnSlave = 0;
    long diskMbReservedOnSlave = 0;
    long memoryBytesUsedOnSlave = 0;
    double cpusUsedOnSlave = 0;
    // Accumulates getDiskTotalBytes() per task, so this value is in bytes
    long diskBytesUsedOnSlave = 0;
    try {
        List<MesosTaskMonitorObject> allTaskUsage = mesosClient.getSlaveResourceUsage(slave.getHost());
        MesosSlaveMetricsSnapshotObject slaveMetricsSnapshot = mesosClient.getSlaveMetricsSnapshot(slave.getHost());
        // System metrics stay at 0 when no snapshot is available
        double systemMemTotalBytes = 0;
        double systemMemFreeBytes = 0;
        double systemLoad1Min = 0;
        double systemLoad5Min = 0;
        double systemLoad15Min = 0;
        double slaveDiskUsed = 0;
        double slaveDiskTotal = 0;
        double systemCpusTotal = 0;
        if (slaveMetricsSnapshot != null) {
            systemMemTotalBytes = slaveMetricsSnapshot.getSystemMemTotalBytes();
            systemMemFreeBytes = slaveMetricsSnapshot.getSystemMemFreeBytes();
            systemLoad1Min = slaveMetricsSnapshot.getSystemLoad1Min();
            systemLoad5Min = slaveMetricsSnapshot.getSystemLoad5Min();
            systemLoad15Min = slaveMetricsSnapshot.getSystemLoad15Min();
            slaveDiskUsed = slaveMetricsSnapshot.getSlaveDiskUsed();
            slaveDiskTotal = slaveMetricsSnapshot.getSlaveDiskTotal();
            systemCpusTotal = slaveMetricsSnapshot.getSystemCpusTotal();
        }
        // Pick the configured load average; the 5 minute average is the fallback
        double systemLoad;
        switch(configuration.getMesosConfiguration().getScoreUsingSystemLoad()) {
            case LOAD_1:
                systemLoad = systemLoad1Min;
                break;
            case LOAD_15:
                systemLoad = systemLoad15Min;
                break;
            case LOAD_5:
            default:
                systemLoad = systemLoad5Min;
                break;
        }
        // Overloaded when the load average exceeds the cpu count; an unknown (0) cpu count is never overloaded
        boolean slaveOverloaded = systemCpusTotal > 0 && systemLoad / systemCpusTotal > 1.0;
        List<TaskIdWithUsage> possibleTasksToShuffle = new ArrayList<>();
        for (MesosTaskMonitorObject taskUsage : allTaskUsage) {
            String taskId = taskUsage.getSource();
            SingularityTaskId task;
            try {
                task = SingularityTaskId.valueOf(taskId);
            } catch (InvalidSingularityTaskIdException e) {
                // Pass the exception as the final argument so the cause is logged too
                LOG.error("Couldn't get SingularityTaskId for {}", taskUsage, e);
                continue;
            }
            SingularityTaskUsage latestUsage = getUsage(taskUsage);
            // Read the history before clearing old entries and saving the new sample
            List<SingularityTaskUsage> pastTaskUsages = usageManager.getTaskUsage(taskId);
            clearOldUsage(taskId);
            usageManager.saveSpecificTaskUsage(taskId, latestUsage);
            Optional<SingularityTask> maybeTask = taskManager.getTask(task);
            Optional<Resources> maybeResources = Optional.absent();
            if (maybeTask.isPresent()) {
                // Resources on the pending task take precedence over the deploy's resources
                maybeResources = maybeTask.get().getTaskRequest().getPendingTask().getResources().or(maybeTask.get().getTaskRequest().getDeploy().getResources());
                if (maybeResources.isPresent()) {
                    Resources taskResources = maybeResources.get();
                    double memoryMbReservedForTask = taskResources.getMemoryMb();
                    double cpuReservedForTask = taskResources.getCpus();
                    double diskMbReservedForTask = taskResources.getDiskMb();
                    memoryMbReservedOnSlave += memoryMbReservedForTask;
                    cpuReservedOnSlave += cpuReservedForTask;
                    diskMbReservedOnSlave += diskMbReservedForTask;
                    updateRequestUtilization(utilizationPerRequestId, pastTaskUsages, latestUsage, task, memoryMbReservedForTask, cpuReservedForTask, diskMbReservedForTask);
                }
            }
            memoryBytesUsedOnSlave += latestUsage.getMemoryTotalBytes();
            diskBytesUsedOnSlave += latestUsage.getDiskTotalBytes();
            SingularityTaskCurrentUsage currentUsage = null;
            if (pastTaskUsages.isEmpty()) {
                // No earlier sample: average the cpu seconds over the time since the task started
                Optional<SingularityTaskHistoryUpdate> maybeStartingUpdate = taskManager.getTaskHistoryUpdate(task, ExtendedTaskState.TASK_STARTING);
                if (maybeStartingUpdate.isPresent()) {
                    long startTimestampSeconds = TimeUnit.MILLISECONDS.toSeconds(maybeStartingUpdate.get().getTimestamp());
                    // NOTE(review): the divisor is zero when the sample timestamp equals the start time - confirm this cannot happen
                    double usedCpusSinceStart = latestUsage.getCpuSeconds() / (latestUsage.getTimestamp() - startTimestampSeconds);
                    if (isLongRunning(task) || isConsideredLongRunning(task)) {
                        updateLongRunningTasksUsage(longRunningTasksUsage, latestUsage.getMemoryTotalBytes(), usedCpusSinceStart, latestUsage.getDiskTotalBytes());
                    }
                    currentUsage = new SingularityTaskCurrentUsage(latestUsage.getMemoryTotalBytes(), now, usedCpusSinceStart, latestUsage.getDiskTotalBytes());
                    usageManager.saveCurrentTaskUsage(taskId, currentUsage);
                    cpusUsedOnSlave += usedCpusSinceStart;
                }
            } else {
                // Earlier samples exist: use the cpu seconds consumed since the most recent one
                SingularityTaskUsage lastUsage = pastTaskUsages.get(pastTaskUsages.size() - 1);
                // NOTE(review): the divisor is zero when both samples carry the same timestamp - confirm this cannot happen
                double taskCpusUsed = ((latestUsage.getCpuSeconds() - lastUsage.getCpuSeconds()) / (latestUsage.getTimestamp() - lastUsage.getTimestamp()));
                if (isLongRunning(task) || isConsideredLongRunning(task)) {
                    updateLongRunningTasksUsage(longRunningTasksUsage, latestUsage.getMemoryTotalBytes(), taskCpusUsed, latestUsage.getDiskTotalBytes());
                }
                currentUsage = new SingularityTaskCurrentUsage(latestUsage.getMemoryTotalBytes(), now, taskCpusUsed, latestUsage.getDiskTotalBytes());
                usageManager.saveCurrentTaskUsage(taskId, currentUsage);
                cpusUsedOnSlave += taskCpusUsed;
            }
            // Only tasks with measurable cpu usage are candidates for shuffling
            if (configuration.isShuffleTasksForOverloadedSlaves() && currentUsage != null && currentUsage.getCpusUsed() > 0) {
                if (isLongRunning(task) && !configuration.getDoNotShuffleRequests().contains(task.getRequestId())) {
                    Optional<SingularityTaskHistoryUpdate> maybeCleanupUpdate = taskManager.getTaskHistoryUpdate(task, ExtendedTaskState.TASK_CLEANING);
                    if (maybeCleanupUpdate.isPresent() && isTaskAlreadyCleanedUpForShuffle(maybeCleanupUpdate.get())) {
                        LOG.trace("Task {} already being cleaned up to spread cpu usage, skipping", taskId);
                    } else {
                        if (maybeResources.isPresent()) {
                            possibleTasksToShuffle.add(new TaskIdWithUsage(task, maybeResources.get(), currentUsage));
                        }
                    }
                }
            }
        }
        // Disk space is checked along with memory and cpus: it is unwrapped below and again when
        // adding to the cluster totals, so a missing value would otherwise throw and drop the whole sample
        if (!slave.getResources().isPresent() || !slave.getResources().get().getMemoryMegaBytes().isPresent() || !slave.getResources().get().getNumCpus().isPresent() || !slave.getResources().get().getDiskSpace().isPresent()) {
            LOG.debug("Could not find slave or resources for slave {}", slave.getId());
        } else {
            memoryMbTotal = Optional.of(slave.getResources().get().getMemoryMegaBytes().get().longValue());
            cpusTotal = Optional.of(slave.getResources().get().getNumCpus().get().doubleValue());
            diskMbTotal = Optional.of(slave.getResources().get().getDiskSpace().get());
        }
        SingularitySlaveUsage slaveUsage = new SingularitySlaveUsage(cpusUsedOnSlave, cpuReservedOnSlave, cpusTotal, memoryBytesUsedOnSlave, memoryMbReservedOnSlave, memoryMbTotal, diskBytesUsedOnSlave, diskMbReservedOnSlave, diskMbTotal, longRunningTasksUsage, allTaskUsage.size(), now, systemMemTotalBytes, systemMemFreeBytes, systemCpusTotal, systemLoad1Min, systemLoad5Min, systemLoad15Min, slaveDiskUsed, slaveDiskTotal);
        if (slaveOverloaded) {
            overLoadedHosts.put(slaveUsage, possibleTasksToShuffle);
        }
        // Drop the first stored timestamp when saving this sample would exceed the retention limit
        List<Long> slaveTimestamps = usageManager.getSlaveUsageTimestamps(slave.getId());
        if (slaveTimestamps.size() + 1 > configuration.getNumUsageToKeep()) {
            usageManager.deleteSpecificSlaveUsage(slave.getId(), slaveTimestamps.get(0));
        }
        // Totals are set together above, so memory and cpu being present implies disk is present too
        if (slaveUsage.getMemoryBytesTotal().isPresent() && slaveUsage.getCpusTotal().isPresent()) {
            totalMemBytesUsed.getAndAdd(slaveUsage.getMemoryBytesUsed());
            totalCpuUsed.getAndAdd(slaveUsage.getCpusUsed());
            totalDiskBytesUsed.getAndAdd(slaveUsage.getDiskBytesUsed());
            totalMemBytesAvailable.getAndAdd(slaveUsage.getMemoryBytesTotal().get());
            totalCpuAvailable.getAndAdd(slaveUsage.getCpusTotal().get());
            totalDiskBytesAvailable.getAndAdd(slaveUsage.getDiskBytesTotal().get());
        }
        LOG.debug("Saving slave {} usage {}", slave.getHost(), slaveUsage);
        usageManager.saveSpecificSlaveUsageAndSetCurrent(slave.getId(), slaveUsage);
    } catch (Throwable t) {
        // Best effort: a failure for one slave is reported but must not propagate to the caller
        String message = String.format("Could not get slave usage for host %s", slave.getHost());
        LOG.error(message, t);
        exceptionNotifier.notify(message, t);
    }
}
Also used : ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) MesosTaskMonitorObject(com.hubspot.mesos.json.MesosTaskMonitorObject) SingularityTaskHistoryUpdate(com.hubspot.singularity.SingularityTaskHistoryUpdate) ResourceUsageType(com.hubspot.singularity.SingularitySlaveUsage.ResourceUsageType) SingularityTaskId(com.hubspot.singularity.SingularityTaskId) SingularityTaskUsage(com.hubspot.singularity.SingularityTaskUsage) SingularityTaskCurrentUsage(com.hubspot.singularity.SingularityTaskCurrentUsage) SingularitySlaveUsage(com.hubspot.singularity.SingularitySlaveUsage) MesosSlaveMetricsSnapshotObject(com.hubspot.mesos.json.MesosSlaveMetricsSnapshotObject) AtomicDouble(com.google.common.util.concurrent.AtomicDouble) InvalidSingularityTaskIdException(com.hubspot.singularity.InvalidSingularityTaskIdException) SingularityTask(com.hubspot.singularity.SingularityTask) AtomicLong(java.util.concurrent.atomic.AtomicLong) Resources(com.hubspot.mesos.Resources)

Example 3 with MesosSlaveMetricsSnapshotObject

use of com.hubspot.mesos.json.MesosSlaveMetricsSnapshotObject in project Singularity by HubSpot.

the class SingularityUsageTest method itLimitsTheNumberOfTaskCleanupsToCreate.

/**
 * Verifies that, with task shuffling enabled and the total shuffle limit set to 1, polling usage
 * for an overloaded host creates a {@code REBALANCE_CPU_USAGE} cleanup for the first task only.
 * Both configuration values changed here are restored in the {@code finally} block.
 */
@Test
public void itLimitsTheNumberOfTaskCleanupsToCreate() {
    try {
        configuration.setShuffleTasksForOverloadedSlaves(true);
        configuration.setMaxTasksToShuffleTotal(1);
        initRequest();
        initFirstDeployWithResources(configuration.getMesosConfiguration().getDefaultCpus(), configuration.getMesosConfiguration().getDefaultMemory());
        saveAndSchedule(requestManager.getRequest(requestId).get().getRequest().toBuilder().setInstances(Optional.of(3)));
        resourceOffers(1);
        SingularitySlaveUsage highUsage = new SingularitySlaveUsage(15, 10, Optional.of(10.0), 1, 1, Optional.of(30L), 1, 1, Optional.of(1024L), Collections.emptyMap(), 1, System.currentTimeMillis(), 1, 30000, 10, 15, 15, 15, 0, 107374182);
        usageManager.saveSpecificSlaveUsageAndSetCurrent("host1", highUsage);
        SingularityTaskId taskId1 = taskManager.getActiveTaskIds().get(0);
        String t1 = taskId1.getId();
        SingularityTaskId taskId2 = taskManager.getActiveTaskIds().get(1);
        String t2 = taskId2.getId();
        // The poller looks up the TASK_STARTING update to compute cpu usage for tasks with no prior samples
        statusUpdate(taskManager.getTask(taskId1).get(), TaskState.TASK_STARTING, Optional.of(taskId1.getStartedAt()));
        statusUpdate(taskManager.getTask(taskId2).get(), TaskState.TASK_STARTING, Optional.of(taskId2.getStartedAt()));
        // Each sample is stamped 5 seconds after the task's start time.
        // task 1 using 3 cpus
        MesosTaskMonitorObject t1u1 = getTaskMonitor(t1, 15, TimeUnit.MILLISECONDS.toSeconds(taskId1.getStartedAt()) + 5, 1024);
        // task 2 using 2 cpus
        MesosTaskMonitorObject t2u1 = getTaskMonitor(t2, 10, TimeUnit.MILLISECONDS.toSeconds(taskId2.getStartedAt()) + 5, 1024);
        mesosClient.setSlaveResourceUsage("host1", Arrays.asList(t1u1, t2u1));
        // NOTE(review): positional metrics arguments; presumably the 10.0 and 15 values are the cpu total and load averages that mark the host as overloaded - confirm against the constructor
        mesosClient.setSlaveMetricsSnapshot("host1", new MesosSlaveMetricsSnapshotObject(0, 0, 0, 10.0, 0, 0, 0, 0, 0, 0, 0, 0, 10.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 0, 0, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 0, 0, 0, 0, 0, 0, 15, 0, 0, 0, 0));
        usagePoller.runActionOnPoll();
        // First task is cleaned up (expected value goes first so a failure message reads correctly)
        Assert.assertEquals(TaskCleanupType.REBALANCE_CPU_USAGE, taskManager.getTaskCleanup(taskId1.getId()).get().getCleanupType());
        // Second task doesn't get cleaned up due to cluster wide limit
        Assert.assertFalse(taskManager.getTaskCleanup(taskId2.getId()).isPresent());
    } finally {
        configuration.setShuffleTasksForOverloadedSlaves(false);
        configuration.setMaxTasksToShuffleTotal(6);
    }
}
Also used : SingularitySlaveUsage(com.hubspot.singularity.SingularitySlaveUsage) MesosSlaveMetricsSnapshotObject(com.hubspot.mesos.json.MesosSlaveMetricsSnapshotObject) SingularityTaskId(com.hubspot.singularity.SingularityTaskId) MesosTaskMonitorObject(com.hubspot.mesos.json.MesosTaskMonitorObject) Test(org.junit.Test)

Aggregations

MesosSlaveMetricsSnapshotObject (com.hubspot.mesos.json.MesosSlaveMetricsSnapshotObject)3 MesosTaskMonitorObject (com.hubspot.mesos.json.MesosTaskMonitorObject)3 SingularitySlaveUsage (com.hubspot.singularity.SingularitySlaveUsage)3 SingularityTaskId (com.hubspot.singularity.SingularityTaskId)3 Test (org.junit.Test)2 AtomicDouble (com.google.common.util.concurrent.AtomicDouble)1 Resources (com.hubspot.mesos.Resources)1 InvalidSingularityTaskIdException (com.hubspot.singularity.InvalidSingularityTaskIdException)1 ResourceUsageType (com.hubspot.singularity.SingularitySlaveUsage.ResourceUsageType)1 SingularityTask (com.hubspot.singularity.SingularityTask)1 SingularityTaskCurrentUsage (com.hubspot.singularity.SingularityTaskCurrentUsage)1 SingularityTaskHistoryUpdate (com.hubspot.singularity.SingularityTaskHistoryUpdate)1 SingularityTaskUsage (com.hubspot.singularity.SingularityTaskUsage)1 ArrayList (java.util.ArrayList)1 HashMap (java.util.HashMap)1 ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap)1 AtomicLong (java.util.concurrent.atomic.AtomicLong)1