Search in sources :

Example 1 with TaskQueueException

Use of com.netflix.fenzo.queues.TaskQueueException in project mantis by Netflix.

The method schedulingResultHandler of the class SchedulingService.

/**
 * Processes the result of one scheduling iteration.
 *
 * <p>In order, this method:
 * <ol>
 *   <li>logs and counts every exception carried by the result,</li>
 *   <li>launches the tasks assigned to each VM and records per-worker timing metrics,</li>
 *   <li>routes a {@code WorkerUnscheduleable} event for every request that failed to be assigned,</li>
 *   <li>updates the scheduler counters and gauges,</li>
 *   <li>requests the current VM states from the task scheduling service, and</li>
 *   <li>publishes the job manager and Fenzo worker metrics.</li>
 * </ol>
 *
 * <p>Any exception raised while doing the above is caught here, logged, and counted in
 * {@code schedulingCallbackExceptions}; it is not propagated to the caller.
 *
 * @param schedulingResult the scheduling result to process
 */
private void schedulingResultHandler(SchedulingResult schedulingResult) {
    try {
        lastSchedulingResultCallback.set(System.currentTimeMillis());
        final List<Exception> exceptions = schedulingResult.getExceptions();
        for (Exception exc : exceptions) {
            logger.error("Scheduling result got exception: {}", exc.getMessage(), exc);
            schedulingResultExceptions.increment();
        }
        int workersLaunched = 0;
        SchedulerCounters.getInstance().incrementResourceAllocationTrials(schedulingResult.getNumAllocations());
        Map<String, VMAssignmentResult> assignmentResultMap = schedulingResult.getResultMap();
        final int assignmentResultSize;
        if (assignmentResultMap != null) {
            assignmentResultSize = assignmentResultMap.size();
            // A single timestamp, taken before any launch, is used for all timing metrics below.
            long now = System.currentTimeMillis();
            // Only the per-VM results are needed; the map keys are not used.
            for (VMAssignmentResult vmResult : assignmentResultMap.values()) {
                launchTasks(vmResult.getTasksAssigned(), vmResult.getLeasesUsed());
                for (TaskAssignmentResult r : vmResult.getTasksAssigned()) {
                    final ScheduleRequest request = (ScheduleRequest) r.getRequest();
                    final Optional<Long> acceptedAt = workerRegistry.getAcceptedAt(request.getWorkerId());
                    // The accepted-to-launched time is recorded only when the registry knows the accept time.
                    acceptedAt.ifPresent(acceptedAtTime -> workerAcceptedToLaunchedDistMs.recordValue(now - acceptedAtTime));
                    perWorkerSchedulingTimeMs.increment(now - request.getReadyAt());
                }
                workersLaunched += vmResult.getTasksAssigned().size();
            }
        } else {
            // No result map means no VM received an assignment in this iteration.
            assignmentResultSize = 0;
        }
        // for workers that didn't get scheduled, rate limit them
        for (Map.Entry<TaskRequest, List<TaskAssignmentResult>> entry : schedulingResult.getFailures().entrySet()) {
            final ScheduleRequest req = (ScheduleRequest) entry.getKey();
            boolean success = jobMessageRouter.routeWorkerEvent(new WorkerUnscheduleable(req.getWorkerId(), req.getStageNum()));
            if (!success) {
                logger.warn("Failed to route {} WorkerUnscheduleable event", req.getWorkerId());
                if (logger.isTraceEnabled()) {
                    logger.trace("Unscheduleable worker {} assignmentresults {}", req.getWorkerId(), entry.getValue());
                }
            }
        }
        numWorkersLaunched.increment(workersLaunched);
        numResourceOffersReceived.increment(schedulingResult.getLeasesAdded());
        numResourceAllocations.increment(schedulingResult.getNumAllocations());
        numResourceOffersRejected.increment(schedulingResult.getLeasesRejected());
        // Requests that failed to be assigned are reported as pending; launched + pending = requested.
        final int unscheduledWorkers = schedulingResult.getFailures().size();
        final int requestedWorkers = workersLaunched + unscheduledWorkers;
        workersToLaunch.set(requestedWorkers);
        pendingWorkers.set(unscheduledWorkers);
        schedulerRunMillis.set(schedulingResult.getRuntime());
        totalActiveAgents.set(schedulingResult.getTotalVMsCount());
        numAgentsUsed.increment(assignmentResultSize);
        final int idleVMsCount = schedulingResult.getIdleVMsCount();
        idleAgents.set(idleVMsCount);
        SchedulerCounters.getInstance().endIteration(requestedWorkers, workersLaunched, assignmentResultSize, schedulingResult.getLeasesRejected());
        // Log the iteration summary on every 10th iteration, and only if some worker was requested.
        if (requestedWorkers > 0 && SchedulerCounters.getInstance().getCounter().getIterationNumber() % 10 == 0) {
            logger.info("Scheduling iteration result: {}", SchedulerCounters.getInstance().toJsonString());
        }
        // Log the idle machine count only when it differs from the last recorded value.
        if (idleVMsCount != idleMachinesCount.get()) {
            logger.info("Idle machines: {}", idleVMsCount);
            idleMachinesCount.set(idleVMsCount);
        }
        try {
            taskSchedulingService.requestVmCurrentStates(vmCurrentStates -> {
                // Skip the update unless the last completed check is older than vmCurrentStatesCheckInterval.
                if (lastVmCurrentStatesCheckDone.get() < (System.currentTimeMillis() - vmCurrentStatesCheckInterval)) {
                    schedulingState.setVMCurrentState(vmCurrentStates);
                    verifyAndReportResUsageMetrics(vmCurrentStates);
                    lastVmCurrentStatesCheckDone.set(System.currentTimeMillis());
                }
            });
        } catch (final TaskQueueException e) {
            // A failed VM state request is logged and does not prevent the metrics publish below.
            logger.warn("got exception requesting VM states from Fenzo", e);
        }
        publishJobManagerAndFenzoWorkerMetrics();
    } catch (final Exception e) {
        // Callback boundary: log and count the failure instead of propagating it.
        logger.error("unexpected exception in scheduling result callback", e);
        schedulingCallbackExceptions.increment();
    }
}
Also used : ScheduleRequest(io.mantisrx.server.master.scheduler.ScheduleRequest) TaskRequest(com.netflix.fenzo.TaskRequest) LaunchTaskRequest(io.mantisrx.server.master.scheduler.LaunchTaskRequest) TaskQueueException(com.netflix.fenzo.queues.TaskQueueException) AtomicLong(java.util.concurrent.atomic.AtomicLong) TaskAssignmentResult(com.netflix.fenzo.TaskAssignmentResult) List(java.util.List) ArrayList(java.util.ArrayList) VMAssignmentResult(com.netflix.fenzo.VMAssignmentResult) WorkerUnscheduleable(io.mantisrx.server.master.scheduler.WorkerUnscheduleable) Map(java.util.Map) TaskQueueException(com.netflix.fenzo.queues.TaskQueueException)

Aggregations

TaskAssignmentResult (com.netflix.fenzo.TaskAssignmentResult)1 TaskRequest (com.netflix.fenzo.TaskRequest)1 VMAssignmentResult (com.netflix.fenzo.VMAssignmentResult)1 TaskQueueException (com.netflix.fenzo.queues.TaskQueueException)1 LaunchTaskRequest (io.mantisrx.server.master.scheduler.LaunchTaskRequest)1 ScheduleRequest (io.mantisrx.server.master.scheduler.ScheduleRequest)1 WorkerUnscheduleable (io.mantisrx.server.master.scheduler.WorkerUnscheduleable)1 ArrayList (java.util.ArrayList)1 List (java.util.List)1 Map (java.util.Map)1 AtomicLong (java.util.concurrent.atomic.AtomicLong)1