Search in sources:

Example 1 with SingularityAgent

use of com.hubspot.singularity.SingularityAgent in project Singularity by HubSpot.

the class SingularityDisasterDetectionPoller method collectDisasterStats.

/**
 * Takes a snapshot of the scheduler state used for disaster detection.
 *
 * <p>Gathers the active and pending task counts, the number of late tasks and the average
 * lag of past-due pending tasks, the number of agents that are neither DEAD nor
 * MISSING_ON_STARTUP, and the lost agent / lost task counters. The lost-agent counter is
 * reset to zero and the lost-task reasons are cleared as part of the read.
 *
 * @return a data point stamped with the time at which collection started
 */
private SingularityDisasterDataPoint collectDisasterStats() {
    final long now = System.currentTimeMillis();
    final int activeTaskCount = taskManager.getNumActiveTasks();
    final List<SingularityPendingTaskId> pendingTaskIds = taskManager.getPendingTaskIds();
    final int pendingTaskCount = pendingTaskIds.size();

    int lateTaskCount = 0;
    int pastDueTaskCount = 0;
    long lagSumMillis = 0;
    for (SingularityPendingTaskId pendingTaskId : pendingTaskIds) {
        final long lagMillis = now - pendingTaskId.getNextRunAt();
        if (lagMillis <= 0) {
            // Not yet due, so it contributes nothing to the lag statistics.
            continue;
        }
        pastDueTaskCount++;
        lagSumMillis += lagMillis;
        if (lagMillis > configuration.getDeltaAfterWhichTasksAreLateMillis()) {
            lateTaskCount++;
        }
    }
    // With no past-due tasks the sum is zero, so the average is zero as well.
    final long averageLagMillis = pastDueTaskCount > 0 ? lagSumMillis / pastDueTaskCount : 0;

    int runningAgentCount = 0;
    for (SingularityAgent agent : agentManager.getObjects()) {
        final MachineState agentState = agent.getCurrentState().getState();
        final boolean notRunning = agentState == MachineState.DEAD || agentState == MachineState.MISSING_ON_STARTUP;
        if (!notRunning) {
            runningAgentCount++;
        }
    }

    // Read-and-reset: the counter starts again from zero for the next collection.
    final int lostAgentCount = activeSlavesLost.getAndSet(0);

    int lostTaskCount = 0;
    for (Reason reason : disasterConfiguration.getLostTaskReasons()) {
        lostTaskCount += taskLostReasons.count(reason);
    }
    taskLostReasons.clear();

    return new SingularityDisasterDataPoint(now, activeTaskCount, pendingTaskCount, lateTaskCount, averageLagMillis, lostTaskCount, runningAgentCount, lostAgentCount);
}
Also used : SingularityPendingTaskId(com.hubspot.singularity.SingularityPendingTaskId) SingularityAgent(com.hubspot.singularity.SingularityAgent) SingularityDisasterDataPoint(com.hubspot.singularity.SingularityDisasterDataPoint) Reason(org.apache.mesos.v1.Protos.TaskStatus.Reason)

Example 2 with SingularityAgent

use of com.hubspot.singularity.SingularityAgent in project Singularity by HubSpot.

the class SingularityAgentReconciliationPoller method checkInactiveAgents.

/**
 * Deletes agents that have been DEAD or MISSING_ON_STARTUP for longer than the configured
 * number of hours, and removes their hosts from the inactive agent list.
 */
private void checkInactiveAgents() {
    final long start = System.currentTimeMillis();

    // Both dead agents and agents missing on startup are candidates for cleanup.
    final List<SingularityAgent> dead = agentManager.getObjectsFiltered(MachineState.DEAD);
    LOG.debug("Found {} dead agents", dead.size());
    final List<SingularityAgent> missingOnStartup = agentManager.getObjectsFiltered(MachineState.MISSING_ON_STARTUP);
    LOG.debug("Found {} agents missing on startup", missingOnStartup.size());

    final List<SingularityAgent> candidates = new ArrayList<>(dead);
    candidates.addAll(missingOnStartup);
    if (candidates.isEmpty()) {
        LOG.trace("No inactive agents");
        return;
    }

    final long maxDuration = TimeUnit.HOURS.toMillis(configuration.getDeleteDeadAgentsAfterHours());
    int deleted = 0;
    for (SingularityAgent candidate : candidates) {
        final long duration = System.currentTimeMillis() - candidate.getCurrentState().getTimestamp();
        if (duration <= maxDuration) {
            // Has not been inactive long enough yet; keep it.
            continue;
        }
        final SingularityDeleteResult result = agentManager.deleteObject(candidate.getId());
        // The host is dropped from the inactive list as well.
        inactiveAgentManager.cleanInactiveAgent(candidate.getHost());
        deleted++;
        LOG.info("Removing inactive agent {} ({}) after {} (max {})", candidate.getId(), result, JavaUtils.durationFromMillis(duration), JavaUtils.durationFromMillis(maxDuration));
    }

    LOG.debug("Checked {} inactive agents, deleted {} in {}", candidates.size(), deleted, JavaUtils.duration(start));
}
Also used : ArrayList(java.util.ArrayList) SingularityAgent(com.hubspot.singularity.SingularityAgent) SingularityDeleteResult(com.hubspot.singularity.SingularityDeleteResult)

Example 3 with SingularityAgent

use of com.hubspot.singularity.SingularityAgent in project Singularity by HubSpot.

the class SingularityAgentAndRackManager method checkRackAfterAgentLoss.

/**
 * Marks the lost agent's rack as DEAD when no ACTIVE agent remains in that rack.
 *
 * @param lostAgent the agent that was lost; its rack id selects the rack to check
 */
private void checkRackAfterAgentLoss(SingularityAgent lostAgent) {
    final int remainingInRack = (int) agentManager
        .getObjectsFiltered(MachineState.ACTIVE)
        .stream()
        .filter(activeAgent -> activeAgent.getRackId().equals(lostAgent.getRackId()))
        .count();
    LOG.info("Found {} agents left in rack {}", remainingInRack, lostAgent.getRackId());
    if (remainingInRack != 0) {
        return;
    }
    rackManager.changeState(lostAgent.getRackId(), MachineState.DEAD, Optional.empty(), Optional.empty());
}
Also used : SingularityAgent(com.hubspot.singularity.SingularityAgent)

Example 4 with SingularityAgent

use of com.hubspot.singularity.SingularityAgent in project Singularity by HubSpot.

the class SingularityAgentAndRackManager method checkDecommissionedAgentsFromMaster.

/**
 * Transitions DECOMMISSIONED agents that are absent from the given master state.
 *
 * @param state     master state whose agent list is compared against the decommissioned agents
 * @param isStartup when true the leftover agents become MISSING_ON_STARTUP, otherwise DEAD
 */
public void checkDecommissionedAgentsFromMaster(MesosMasterStateObject state, boolean isStartup) {
    final Map<String, SingularityAgent> decommissionedById = agentManager.getObjectsByIdForState(MachineState.DECOMMISSIONED);
    // Drop every agent the master still reports; whatever remains is missing from the master.
    for (MesosMasterAgentObject masterAgent : state.getAgents()) {
        decommissionedById.remove(masterAgent.getId());
    }
    final MachineState targetState;
    if (isStartup) {
        targetState = MachineState.MISSING_ON_STARTUP;
    } else {
        targetState = MachineState.DEAD;
    }
    for (SingularityAgent orphanedAgent : decommissionedById.values()) {
        LOG.info("Marking decommissioned agent without mesos resources as {}", targetState);
        agentManager.changeState(orphanedAgent, targetState, Optional.empty(), Optional.empty());
    }
}
Also used : SingularityAgent(com.hubspot.singularity.SingularityAgent) MesosMasterAgentObject(com.hubspot.mesos.json.MesosMasterAgentObject) MachineState(com.hubspot.singularity.MachineState)

Example 5 with SingularityAgent

use of com.hubspot.singularity.SingularityAgent in project Singularity by HubSpot.

the class SingularityAgentAndRackManager method doesOfferMatch.

/**
 * Decides whether the agent behind an offer is an acceptable place to launch a pending task.
 *
 * <p>Checks run in a fixed order and the first one that fails determines the result:
 * agent known to the agent manager, agent state, rack state, rack affinity, attribute
 * matching, rack saturation (for rack-sensitive requests), and finally the agent placement
 * strategy. A request that is not rack sensitive and uses GREEDY placement returns early,
 * before any per-agent or per-rack counting is done.
 *
 * @param offerHolder             the offer being evaluated; supplies hostname, rack id and agent id
 * @param taskRequest             the pending task together with its request and deploy
 * @param activeTaskIdsForRequest active task ids for the request, used for the per-rack and per-host counts
 * @param isPreemptibleTask       passed through to the attribute match check
 * @param requestUtilization      passed through to the preferred-agent check
 * @return the reason the offer was rejected, or PREFERRED_AGENT / OK when it is acceptable
 */
AgentMatchState doesOfferMatch(SingularityOfferHolder offerHolder, SingularityTaskRequest taskRequest, List<SingularityTaskId> activeTaskIdsForRequest, boolean isPreemptibleTask, RequestUtilization requestUtilization) {
    final String host = offerHolder.getHostname();
    final String rackId = offerHolder.getRackId();
    final String agentId = offerHolder.getAgentId();
    Optional<SingularityAgent> maybeAgent = agentManager.getObject(agentId);
    // An agent the agent manager does not know about is reported as a resource mismatch.
    if (!maybeAgent.isPresent()) {
        return AgentMatchState.RESOURCES_DO_NOT_MATCH;
    }
    final MachineState currentState = maybeAgent.get().getCurrentState().getState();
    if (currentState == MachineState.FROZEN) {
        return AgentMatchState.AGENT_FROZEN;
    }
    if (currentState.isDecommissioning()) {
        return AgentMatchState.AGENT_DECOMMISSIONING;
    }
    // NOTE(review): unlike the agent lookup above, the rack lookup result is unwrapped with
    // get() without a presence check - confirm a rack entry always exists for an offered agent.
    final MachineState currentRackState = rackManager.getObject(rackId).get().getCurrentState().getState();
    if (currentRackState == MachineState.FROZEN) {
        return AgentMatchState.RACK_FROZEN;
    }
    if (currentRackState.isDecommissioning()) {
        return AgentMatchState.RACK_DECOMMISSIONING;
    }
    // Rack affinity only restricts placement when a non-empty list is configured on the request.
    if (!taskRequest.getRequest().getRackAffinity().orElse(Collections.emptyList()).isEmpty()) {
        if (!taskRequest.getRequest().getRackAffinity().get().contains(rackId)) {
            LOG.trace("Task {} requires a rack in {} (current rack {})", taskRequest.getPendingTask().getPendingTaskId(), taskRequest.getRequest().getRackAffinity().get(), rackId);
            return AgentMatchState.RACK_AFFINITY_NOT_MATCHING;
        }
    }
    // Both attribute checks map to the same result state.
    if (!isAttributesMatch(offerHolder, taskRequest, isPreemptibleTask)) {
        return AgentMatchState.AGENT_ATTRIBUTES_DO_NOT_MATCH;
    } else if (!areAttributeMinimumsFeasible(offerHolder, taskRequest, activeTaskIdsForRequest)) {
        return AgentMatchState.AGENT_ATTRIBUTES_DO_NOT_MATCH;
    }
    // The request's placement (or the configured default) may be replaced by maybeOverrideAgentPlacement.
    final AgentPlacement agentPlacement = maybeOverrideAgentPlacement(taskRequest.getRequest().getAgentPlacement().orElse(configuration.getDefaultAgentPlacement()));
    // Early exit: nothing below applies when the request is neither rack sensitive nor placement constrained.
    if (!taskRequest.getRequest().isRackSensitive() && agentPlacement == AgentPlacement.GREEDY) {
        // todo: account for this or let this behavior continue?
        return AgentMatchState.NOT_RACK_OR_AGENT_PARTICULAR;
    }
    final int numDesiredInstances = taskRequest.getRequest().getInstancesSafe();
    boolean allowBounceToSameHost = isAllowBounceToSameHost(taskRequest.getRequest());
    int activeRacksWithCapacityCount = getActiveRacksWithCapacityCount();
    Multiset<String> countPerRack = HashMultiset.create(activeRacksWithCapacityCount);
    // Counters for tasks of this request found on the offered host (see the loop below).
    double numOnAgent = 0;
    double numCleaningOnAgent = 0;
    double numFromSameBounceOnAgent = 0;
    double numOtherDeploysOnAgent = 0;
    // True only for a BOUNCE pending task that carries an action id.
    boolean taskLaunchedFromBounceWithActionId = taskRequest.getPendingTask().getPendingTaskId().getPendingType() == PendingType.BOUNCE && taskRequest.getPendingTask().getActionId().isPresent();
    final String sanitizedHost = offerHolder.getSanitizedHost();
    final String sanitizedRackId = offerHolder.getSanitizedRackId();
    Collection<SingularityTaskId> cleaningTasks = leaderCache.getCleanupTaskIds();
    for (SingularityTaskId taskId : activeTaskIdsForRequest) {
        // Per-rack count: only tasks of the current deploy that are neither cleaning nor killed.
        if (!cleaningTasks.contains(taskId) && !taskManager.isKilledTask(taskId) && taskRequest.getDeploy().getId().equals(taskId.getDeployId())) {
            countPerRack.add(taskId.getSanitizedRackId());
        }
        // Everything below counts tasks on the offered host only.
        if (!taskId.getSanitizedHost().equals(sanitizedHost)) {
            continue;
        }
        if (taskRequest.getDeploy().getId().equals(taskId.getDeployId())) {
            if (cleaningTasks.contains(taskId)) {
                numCleaningOnAgent++;
            } else {
                numOnAgent++;
            }
            if (taskLaunchedFromBounceWithActionId) {
                // Look up the existing task to see whether it was launched by the same bounce action.
                Optional<SingularityTask> maybeTask = taskManager.getTask(taskId);
                boolean errorInTaskData = false;
                if (maybeTask.isPresent()) {
                    SingularityPendingTask pendingTask = maybeTask.get().getTaskRequest().getPendingTask();
                    if (pendingTask.getPendingTaskId().getPendingType() == PendingType.BOUNCE) {
                        if (pendingTask.getActionId().isPresent()) {
                            if (pendingTask.getActionId().get().equals(taskRequest.getPendingTask().getActionId().get())) {
                                numFromSameBounceOnAgent++;
                            }
                        } else {
                            // No actionId present on bounce, fall back to more restrictive placement strategy
                            errorInTaskData = true;
                        }
                    }
                } else {
                    // Could not find appropriate task data, fall back to more restrictive placement strategy
                    errorInTaskData = true;
                }
                if (errorInTaskData) {
                    allowBounceToSameHost = false;
                }
            }
        } else {
            // Same request and host, but a different deploy.
            numOtherDeploysOnAgent++;
        }
    }
    // Rack saturation is only evaluated when the overrides allow it and the request is rack sensitive.
    if (overrides.isAllowRackSensitivity() && taskRequest.getRequest().isRackSensitive()) {
        final boolean isRackOk = isRackOk(countPerRack, sanitizedRackId, numDesiredInstances, taskRequest.getRequest().getId(), agentId, host, numCleaningOnAgent);
        if (!isRackOk) {
            return AgentMatchState.RACK_SATURATED;
        }
    }
    switch(agentPlacement) {
        // The four cases below intentionally share one body.
        case SEPARATE:
        case SEPARATE_BY_DEPLOY:
        case SPREAD_ALL_SLAVES:
        case SPREAD_ALL_AGENTS:
            if (allowBounceToSameHost && taskLaunchedFromBounceWithActionId) {
                // Relaxed rule for bounces: only tasks from the same bounce action block this host.
                if (numFromSameBounceOnAgent > 0) {
                    LOG.trace("Rejecting SEPARATE task {} from agent {} ({}) due to numFromSameBounceOnAgent {}", taskRequest.getRequest().getId(), agentId, host, numFromSameBounceOnAgent);
                    return AgentMatchState.AGENT_SATURATED;
                }
            } else {
                // Strict rule: any task of this deploy on the host, running or cleaning, blocks it.
                if (numOnAgent > 0 || numCleaningOnAgent > 0) {
                    LOG.trace("Rejecting {} task {} from agent {} ({}) due to numOnAgent {} numCleaningOnAgent {}", agentPlacement.name(), taskRequest.getRequest().getId(), agentId, host, numOnAgent, numCleaningOnAgent);
                    return AgentMatchState.AGENT_SATURATED;
                }
            }
            break;
        case SEPARATE_BY_REQUEST:
            // Tasks from other deploys of the same request also block the host.
            if (numOnAgent > 0 || numCleaningOnAgent > 0 || numOtherDeploysOnAgent > 0) {
                LOG.trace("Rejecting SEPARATE_BY_REQUEST task {} from agent {} ({}) due to numOnAgent {} numCleaningOnAgent {} numOtherDeploysOnAgent {}", taskRequest.getRequest().getId(), agentId, host, numOnAgent, numCleaningOnAgent, numOtherDeploysOnAgent);
                return AgentMatchState.AGENT_SATURATED;
            }
            break;
        case OPTIMISTIC:
            // If no tasks are active for this request yet, we can fall back to greedy.
            if (activeTaskIdsForRequest.size() > 0) {
                Collection<SingularityPendingTaskId> pendingTasksForRequestClusterwide = leaderCache.getPendingTaskIdsForRequest(taskRequest.getRequest().getId());
                Set<String> currentHostsForRequest = activeTaskIdsForRequest.stream().map(SingularityTaskId::getSanitizedHost).collect(Collectors.toSet());
                // Average active tasks per host currently used by the request.
                final double numPerAgent = activeTaskIdsForRequest.size() / (double) currentHostsForRequest.size();
                final double leniencyCoefficient = configuration.getPlacementLeniency();
                // The allowed per-host count grows with the number of pending tasks for the request.
                final double threshold = numPerAgent * (1 + (pendingTasksForRequestClusterwide.size() * leniencyCoefficient));
                final boolean isOk = numOnAgent <= threshold;
                if (!isOk) {
                    LOG.trace("Rejecting OPTIMISTIC task {} from agent {} ({}) because numOnAgent {} violates threshold {} (based on active tasks for request {}, current hosts for request {}, pending tasks for request {})", taskRequest.getRequest().getId(), agentId, host, numOnAgent, threshold, activeTaskIdsForRequest.size(), currentHostsForRequest.size(), pendingTasksForRequestClusterwide.size());
                    return AgentMatchState.AGENT_SATURATED;
                }
            }
            break;
        // GREEDY imposes no per-agent restriction here.
        case GREEDY:
    }
    if (isPreferred(offerHolder, taskRequest, requestUtilization)) {
        LOG.debug("Agent {} is preferred", offerHolder.getHostname());
        return AgentMatchState.PREFERRED_AGENT;
    }
    return AgentMatchState.OK;
}
Also used : SingularityPendingTaskId(com.hubspot.singularity.SingularityPendingTaskId) SingularityTask(com.hubspot.singularity.SingularityTask) SingularityPendingTask(com.hubspot.singularity.SingularityPendingTask) SingularityAgent(com.hubspot.singularity.SingularityAgent) AgentPlacement(com.hubspot.singularity.AgentPlacement) SingularityTaskId(com.hubspot.singularity.SingularityTaskId) MachineState(com.hubspot.singularity.MachineState)

Aggregations

SingularityAgent (com.hubspot.singularity.SingularityAgent)26 Test (org.junit.jupiter.api.Test)13 SingularityMachineChangeRequest (com.hubspot.singularity.api.SingularityMachineChangeRequest)6 MesosMasterStateObject (com.hubspot.mesos.json.MesosMasterStateObject)4 MachineState (com.hubspot.singularity.MachineState)4 SingularityRack (com.hubspot.singularity.SingularityRack)4 ArrayList (java.util.ArrayList)4 SingularityTaskId (com.hubspot.singularity.SingularityTaskId)3 MesosMasterAgentObject (com.hubspot.mesos.json.MesosMasterAgentObject)2 SingularityMachineStateHistoryUpdate (com.hubspot.singularity.SingularityMachineStateHistoryUpdate)2 SingularityPendingTaskId (com.hubspot.singularity.SingularityPendingTaskId)2 SingularityTask (com.hubspot.singularity.SingularityTask)2 HashSet (java.util.HashSet)2 Optional (java.util.Optional)2 Timed (com.codahale.metrics.annotation.Timed)1 HashMultiset (com.google.common.collect.HashMultiset)1 Multiset (com.google.common.collect.Multiset)1 Inject (com.google.inject.Inject)1 Singleton (com.google.inject.Singleton)1 AgentPlacement (com.hubspot.singularity.AgentPlacement)1