Search in sources:

Example 1 with ScheduleRequest

use of io.mantisrx.server.master.scheduler.ScheduleRequest in project mantis by Netflix.

the class TestHelpers method createFakeScheduleRequest.

/**
 * Builds a {@link ScheduleRequest} for the given worker, backed by a minimal job definition:
 * artifact name "jar", a single scheduling-info entry keyed by 0 with one instance and no
 * hard or soft constraints, and a perpetual-duration SLA.
 *
 * @param workerId worker the request is for; its job id string is parsed via {@code JobId.fromId}
 * @param stageNum stage number passed straight through to the request
 * @param numStages stage count passed straight through to the request
 * @param machineDefinition machine definition used for both the stage scheduling info and the request
 * @return the assembled request, or {@code null} when any step throws (the stack trace is printed)
 */
public static ScheduleRequest createFakeScheduleRequest(final WorkerId workerId, final int stageNum, final int numStages, final MachineDefinition machineDefinition) {
    try {
        final StageSchedulingInfo stageInfo = StageSchedulingInfo.builder()
                .numberOfInstances(1)
                .machineDefinition(machineDefinition)
                .hardConstraints(Collections.emptyList())
                .softConstraints(Collections.emptyList())
                .build();
        final SchedulingInfo schedulingInfo = new SchedulingInfo(Collections.singletonMap(0, stageInfo));
        final JobDefinition jobDefinition = new JobDefinition.Builder()
                .withArtifactName("jar")
                .withSchedulingInfo(schedulingInfo)
                .withJobSla(new JobSla(0, 0, null, MantisJobDurationType.Perpetual, null))
                .build();
        final IMantisJobMetadata masterMetadata = new MantisJobMetadataImpl.Builder()
                .withJobId(JobId.fromId(workerId.getJobId()).get())
                .withJobDefinition(jobDefinition)
                .build();
        // Copy the master-side metadata into the core JobMetadata carried by the request.
        final JobMetadata requestMetadata = new JobMetadata(
                masterMetadata.getJobId().getId(),
                masterMetadata.getJobJarUrl(),
                masterMetadata.getTotalStages(),
                masterMetadata.getUser(),
                masterMetadata.getSchedulingInfo(),
                masterMetadata.getParameters(),
                masterMetadata.getSubscriptionTimeoutSecs(),
                masterMetadata.getMinRuntimeSecs());
        return new ScheduleRequest(
                workerId,
                stageNum,
                numStages,
                requestMetadata,
                masterMetadata.getSla().get().getDurationType(),
                machineDefinition,
                Collections.emptyList(),
                Collections.emptyList(),
                0,
                Optional.empty());
    } catch (Exception e) {
        // Best-effort helper: failures are reported on stderr and signalled with null.
        e.printStackTrace();
        return null;
    }
}
Also used : JobMetadata(io.mantisrx.server.core.domain.JobMetadata) IMantisJobMetadata(io.mantisrx.master.jobcluster.job.IMantisJobMetadata) StageSchedulingInfo(io.mantisrx.runtime.descriptor.StageSchedulingInfo) SchedulingInfo(io.mantisrx.runtime.descriptor.SchedulingInfo) ScheduleRequest(io.mantisrx.server.master.scheduler.ScheduleRequest) IMantisJobMetadata(io.mantisrx.master.jobcluster.job.IMantisJobMetadata) JobSla(io.mantisrx.runtime.JobSla) MantisJobMetadataImpl(io.mantisrx.master.jobcluster.job.MantisJobMetadataImpl) JobDefinition(io.mantisrx.server.master.domain.JobDefinition)

Example 2 with ScheduleRequest

use of io.mantisrx.server.master.scheduler.ScheduleRequest in project mantis by Netflix.

the class SchedulingService method schedulingResultHandler.

/**
 * Callback that processes one {@link SchedulingResult}.
 *
 * In order, it: records the callback time, logs and counts any scheduling exceptions, launches
 * the tasks assigned to each VM, routes a {@link WorkerUnscheduleable} event for every failed
 * request, updates the scheduler gauges/counters, requests the current VM states, and finally
 * publishes job-manager and Fenzo worker metrics. Any exception escaping these steps is logged
 * and counted rather than propagated.
 *
 * @param schedulingResult result of one scheduling iteration
 */
private void schedulingResultHandler(SchedulingResult schedulingResult) {
    try {
        lastSchedulingResultCallback.set(System.currentTimeMillis());
        for (Exception schedulingError : schedulingResult.getExceptions()) {
            logger.error("Scheduling result got exception: {}", schedulingError.getMessage(), schedulingError);
            schedulingResultExceptions.increment();
        }

        int launchedCount = 0;
        SchedulerCounters.getInstance().incrementResourceAllocationTrials(schedulingResult.getNumAllocations());
        final Map<String, VMAssignmentResult> assignmentsByVm = schedulingResult.getResultMap();
        final int vmsWithAssignments = assignmentsByVm == null ? 0 : assignmentsByVm.size();
        if (assignmentsByVm != null) {
            // One timestamp for the whole batch so every worker is measured against the same instant.
            final long launchTime = System.currentTimeMillis();
            for (VMAssignmentResult vmAssignment : assignmentsByVm.values()) {
                launchTasks(vmAssignment.getTasksAssigned(), vmAssignment.getLeasesUsed());
                for (TaskAssignmentResult assigned : vmAssignment.getTasksAssigned()) {
                    final ScheduleRequest launchedRequest = (ScheduleRequest) assigned.getRequest();
                    workerRegistry.getAcceptedAt(launchedRequest.getWorkerId())
                            .ifPresent(acceptedAtTime -> workerAcceptedToLaunchedDistMs.recordValue(launchTime - acceptedAtTime));
                    perWorkerSchedulingTimeMs.increment(launchTime - launchedRequest.getReadyAt());
                }
                launchedCount += vmAssignment.getTasksAssigned().size();
            }
        }

        // for workers that didn't get scheduled, rate limit them
        for (Map.Entry<TaskRequest, List<TaskAssignmentResult>> failedEntry : schedulingResult.getFailures().entrySet()) {
            final ScheduleRequest unscheduled = (ScheduleRequest) failedEntry.getKey();
            final WorkerUnscheduleable event = new WorkerUnscheduleable(unscheduled.getWorkerId(), unscheduled.getStageNum());
            if (jobMessageRouter.routeWorkerEvent(event)) {
                continue;
            }
            logger.warn("Failed to route {} WorkerUnscheduleable event", unscheduled.getWorkerId());
            if (logger.isTraceEnabled()) {
                logger.trace("Unscheduleable worker {} assignmentresults {}", unscheduled.getWorkerId(), failedEntry.getValue());
            }
        }

        numWorkersLaunched.increment(launchedCount);
        numResourceOffersReceived.increment(schedulingResult.getLeasesAdded());
        numResourceAllocations.increment(schedulingResult.getNumAllocations());
        numResourceOffersRejected.increment(schedulingResult.getLeasesRejected());
        // Requested = launched in this iteration + those that failed to be placed.
        final int requestedCount = launchedCount + schedulingResult.getFailures().size();
        workersToLaunch.set(requestedCount);
        pendingWorkers.set(schedulingResult.getFailures().size());
        schedulerRunMillis.set(schedulingResult.getRuntime());
        totalActiveAgents.set(schedulingResult.getTotalVMsCount());
        numAgentsUsed.increment(vmsWithAssignments);
        final int idleNow = schedulingResult.getIdleVMsCount();
        idleAgents.set(idleNow);

        SchedulerCounters.getInstance().endIteration(requestedCount, launchedCount, vmsWithAssignments, schedulingResult.getLeasesRejected());
        // Only every 10th iteration is logged, and only when there was something to schedule.
        final boolean tenthIteration = SchedulerCounters.getInstance().getCounter().getIterationNumber() % 10 == 0;
        if (requestedCount > 0 && tenthIteration) {
            logger.info("Scheduling iteration result: " + SchedulerCounters.getInstance().toJsonString());
        }
        // Log the idle count only when it changes.
        if (idleNow != idleMachinesCount.get()) {
            logger.info("Idle machines: " + idleNow);
            idleMachinesCount.set(idleNow);
        }

        try {
            taskSchedulingService.requestVmCurrentStates(vmCurrentStates -> {
                // Skip the refresh unless the configured check interval has elapsed.
                final long earliestAllowed = System.currentTimeMillis() - vmCurrentStatesCheckInterval;
                if (lastVmCurrentStatesCheckDone.get() < earliestAllowed) {
                    schedulingState.setVMCurrentState(vmCurrentStates);
                    verifyAndReportResUsageMetrics(vmCurrentStates);
                    lastVmCurrentStatesCheckDone.set(System.currentTimeMillis());
                }
            });
        } catch (final TaskQueueException e) {
            logger.warn("got exception requesting VM states from Fenzo", e);
        }
        publishJobManagerAndFenzoWorkerMetrics();
    } catch (final Exception e) {
        logger.error("unexpected exception in scheduling result callback", e);
        schedulingCallbackExceptions.increment();
    }
}
Also used : ScheduleRequest(io.mantisrx.server.master.scheduler.ScheduleRequest) TaskRequest(com.netflix.fenzo.TaskRequest) LaunchTaskRequest(io.mantisrx.server.master.scheduler.LaunchTaskRequest) TaskQueueException(com.netflix.fenzo.queues.TaskQueueException) AtomicLong(java.util.concurrent.atomic.AtomicLong) TaskAssignmentResult(com.netflix.fenzo.TaskAssignmentResult) List(java.util.List) ArrayList(java.util.ArrayList) VMAssignmentResult(com.netflix.fenzo.VMAssignmentResult) WorkerUnscheduleable(io.mantisrx.server.master.scheduler.WorkerUnscheduleable) Map(java.util.Map) TaskQueueException(com.netflix.fenzo.queues.TaskQueueException)

Example 3 with ScheduleRequest

use of io.mantisrx.server.master.scheduler.ScheduleRequest in project mantis by Netflix.

the class SchedulingService method launchTasks.

/**
 * Attempts to launch tasks given some number of leases from Mesos.
 *
 * When a task is launched successfully, the following will happen:
 *
 * 1. Emit a {@link WorkerLaunched} event to be handled by the corresponding actor.
 * 2. Makes a call to the underlying Mesos driver to launch the task.
 *
 * A task can fail to launch if:
 *
 * 1. It doesn't receive enough metadata for {@link WorkerPorts} to pass its preconditions.
 *      - No launch task request will be made for this assignment result.
 *      - Proactively unschedule the worker.
 * 2. It fails to emit a {@link WorkerLaunched} event.
 *      - The worker will get unscheduled for this launch task request.
 * 3. There are no launch tasks for this assignment result.
 *      - All of these leases are rejected.
 *      - Eventually, the underlying Mesos driver will decline offers since there are no launch task requests.
 *
 * Launch errors reported by the virtual machine service are routed back as
 * {@link WorkerLaunchFailed} events; a failure to route one is only logged.
 *
 * The host name, VM id and cluster attribute reported for every worker are taken from the
 * first lease in {@code leases}.
 *
 * @param requests collection of assignment results received by the scheduler.
 * @param leases list of resource offers from Mesos.
 */
private void launchTasks(Collection<TaskAssignmentResult> requests, List<VirtualMachineLease> leases) {
    List<LaunchTaskRequest> launchTaskRequests = new ArrayList<>();
    for (TaskAssignmentResult assignmentResult : requests) {
        ScheduleRequest request = (ScheduleRequest) assignmentResult.getRequest();
        WorkerPorts workerPorts = null;
        try {
            workerPorts = new WorkerPorts(assignmentResult.getAssignedPorts());
        } catch (IllegalArgumentException | IllegalStateException e) {
            // One placeholder only: the exception is passed as the trailing argument so the
            // logger records it as a throwable (with stack trace) instead of leaving an
            // unfilled "{}" in the message or flattening the exception to text.
            logger.error("problem launching tasks for assignment result {}", assignmentResult, e);
            numMissingWorkerPorts.increment();
        }

        final VirtualMachineLease firstLease = leases.get(0);
        boolean launchedEventRouted = false;
        if (workerPorts != null) {
            launchedEventRouted = jobMessageRouter.routeWorkerEvent(new WorkerLaunched(
                    request.getWorkerId(),
                    request.getStageNum(),
                    firstLease.hostname(),
                    firstLease.getVMID(),
                    getAttribute(firstLease, slaveClusterAttributeName),
                    workerPorts));
        }
        if (launchedEventRouted) {
            launchTaskRequests.add(new LaunchTaskRequest(request, workerPorts));
        } else {
            // Either the ports were unusable or the WorkerLaunched event could not be routed.
            unscheduleWorker(request.getWorkerId(), Optional.ofNullable(firstLease.hostname()));
        }
    }

    if (launchTaskRequests.isEmpty()) {
        for (VirtualMachineLease lease : leases) {
            virtualMachineService.rejectLease(lease);
        }
    }

    // Called even when there is nothing to launch, passing the (possibly empty) request list.
    Map<ScheduleRequest, LaunchTaskException> launchErrors = virtualMachineService.launchTasks(launchTaskRequests, leases);
    for (TaskAssignmentResult result : requests) {
        final ScheduleRequest sre = (ScheduleRequest) result.getRequest();
        if (!launchErrors.containsKey(sre)) {
            continue;
        }
        String errorMessage = getWorkerStringPrefix(sre.getStageNum(), sre.getWorkerId()) + " failed due to " + launchErrors.get(sre).getMessage();
        boolean success = jobMessageRouter.routeWorkerEvent(new WorkerLaunchFailed(sre.getWorkerId(), sre.getStageNum(), errorMessage));
        if (!success) {
            logger.warn("Failed to route WorkerLaunchFailed for {} (err {})", sre.getWorkerId(), errorMessage);
        }
    }
}
Also used : ScheduleRequest(io.mantisrx.server.master.scheduler.ScheduleRequest) LaunchTaskRequest(io.mantisrx.server.master.scheduler.LaunchTaskRequest) ArrayList(java.util.ArrayList) VirtualMachineLease(com.netflix.fenzo.VirtualMachineLease) WorkerPorts(io.mantisrx.common.WorkerPorts) TaskAssignmentResult(com.netflix.fenzo.TaskAssignmentResult) WorkerLaunchFailed(io.mantisrx.server.master.scheduler.WorkerLaunchFailed) WorkerLaunched(io.mantisrx.server.master.scheduler.WorkerLaunched)

Example 4 with ScheduleRequest

use of io.mantisrx.server.master.scheduler.ScheduleRequest in project mantis by Netflix.

the class VirtualMachineMasterServiceMesosImpl method createTaskInfo.

/**
 * Builds the Mesos {@link TaskInfo} for one launch request.
 *
 * The task carries scalar "cpus", "mem", "disk" and "network" resources taken from the
 * request's machine definition, one single-port "ports" range per assigned port, the Mantis
 * worker executor, and the serialized {@link ExecuteStageRequest} as its data payload.
 *
 * @param slaveID agent the task is targeted at
 * @param launchTaskRequest schedule request together with its assigned ports
 * @return a mutable single-element collection holding the built task
 * @throws LaunchTaskException if the execute-stage request cannot be serialized
 */
private Collection<TaskInfo> createTaskInfo(Protos.SlaveID slaveID, final LaunchTaskRequest launchTaskRequest) throws LaunchTaskException {
    final ScheduleRequest scheduleRequest = launchTaskRequest.getScheduleRequest();
    final String name = scheduleRequest.getWorkerId().getJobCluster()
            + " (stage: " + scheduleRequest.getStageNum()
            + " of " + scheduleRequest.getJobMetadata().getTotalStages() + ")";
    final TaskID taskId = TaskID.newBuilder().setValue(scheduleRequest.getWorkerId().getId()).build();
    final MachineDefinition machineDefinition = scheduleRequest.getMachineDefinition();
    // grab ports within range
    final List<Integer> ports = launchTaskRequest.getPorts().getAllPorts();
    try {
        final TaskInfo.Builder taskInfoBuilder = TaskInfo.newBuilder();
        // NOTE(review): the min-runtime argument below subtracts (currentTimeMillis - minRuntimeSecs)
        // from minRuntimeSecs, mixing a seconds value with epoch milliseconds. Looks unintended -
        // confirm the expected value before changing; kept exactly as it was.
        final ExecuteStageRequest executeStageRequest = new ExecuteStageRequest(
                scheduleRequest.getWorkerId().getJobCluster(),
                scheduleRequest.getWorkerId().getJobId(),
                scheduleRequest.getWorkerId().getWorkerIndex(),
                scheduleRequest.getWorkerId().getWorkerNum(),
                scheduleRequest.getJobMetadata().getJobJarUrl(),
                scheduleRequest.getStageNum(),
                scheduleRequest.getJobMetadata().getTotalStages(),
                ports,
                getTimeoutSecsToReportStart(),
                launchTaskRequest.getPorts().getMetricsPort(),
                scheduleRequest.getJobMetadata().getParameters(),
                scheduleRequest.getJobMetadata().getSchedulingInfo(),
                scheduleRequest.getDurationType(),
                scheduleRequest.getJobMetadata().getSubscriptionTimeoutSecs(),
                scheduleRequest.getJobMetadata().getMinRuntimeSecs() - (System.currentTimeMillis() - scheduleRequest.getJobMetadata().getMinRuntimeSecs()),
                launchTaskRequest.getPorts());

        taskInfoBuilder.setName(name);
        taskInfoBuilder.setTaskId(taskId);
        taskInfoBuilder.setSlaveId(slaveID);
        taskInfoBuilder.addResources(Resource.newBuilder().setName("cpus").setType(Value.Type.SCALAR)
                .setScalar(Value.Scalar.newBuilder().setValue(machineDefinition.getCpuCores())));
        taskInfoBuilder.addResources(Resource.newBuilder().setName("mem").setType(Value.Type.SCALAR)
                .setScalar(Value.Scalar.newBuilder().setValue(machineDefinition.getMemoryMB())));
        taskInfoBuilder.addResources(Resource.newBuilder().setName("disk").setType(Value.Type.SCALAR)
                .setScalar(Value.Scalar.newBuilder().setValue(machineDefinition.getDiskMB())));
        taskInfoBuilder.addResources(Resource.newBuilder().setName("network").setType(Value.Type.SCALAR)
                .setScalar(Value.Scalar.newBuilder().setValue(machineDefinition.getNetworkMbps())));
        taskInfoBuilder.setExecutor(createMantisWorkerExecutor(executeStageRequest, launchTaskRequest, machineDefinition.getMemoryMB(), machineDefinition.getCpuCores()));
        taskInfoBuilder.setData(ByteString.copyFrom(mapper.writeValueAsBytes(executeStageRequest)));

        // add ports: each assigned port becomes its own single-value range
        for (Integer port : ports) {
            taskInfoBuilder.addResources(Resource.newBuilder().setName("ports").setType(Value.Type.RANGES)
                    .setRanges(Value.Ranges.newBuilder().addRange(Value.Range.newBuilder().setBegin(port).setEnd(port))));
        }

        final List<TaskInfo> tasks = new ArrayList<>(1);
        tasks.add(taskInfoBuilder.build());
        return tasks;
    } catch (JsonProcessingException e) {
        throw new LaunchTaskException("Failed to build a TaskInfo instance: " + e.getMessage(), e);
    }
}
Also used : TaskID(org.apache.mesos.Protos.TaskID) MachineDefinition(io.mantisrx.runtime.MachineDefinition) ScheduleRequest(io.mantisrx.server.master.scheduler.ScheduleRequest) ArrayList(java.util.ArrayList) LaunchTaskException(io.mantisrx.server.master.LaunchTaskException) ByteString(com.google.protobuf.ByteString) ExecuteStageRequest(io.mantisrx.server.core.ExecuteStageRequest) TaskInfo(org.apache.mesos.Protos.TaskInfo) JsonProcessingException(io.mantisrx.shaded.com.fasterxml.jackson.core.JsonProcessingException)

Example 5 with ScheduleRequest

use of io.mantisrx.server.master.scheduler.ScheduleRequest in project mantis by Netflix.

the class VirtualMachineMasterServiceMesosImpl method launchTasks.

// NOTE: All leases are for the same agent.
/**
 * Launches the given requests on the agent identified by the first lease, using the offers
 * of all supplied leases.
 *
 * If this service is not initialized, an error is logged and nothing is launched. If no task
 * could be built from the requests, every offer is declined instead.
 *
 * @param requests launch requests to turn into Mesos tasks
 * @param leases leases whose offers back the launch; the agent id is read from the first one
 * @return map from schedule request to the error raised while building its task; empty when
 *         every task was built or when the service is not initialized
 */
@Override
public Map<ScheduleRequest, LaunchTaskException> launchTasks(List<LaunchTaskRequest> requests, List<VirtualMachineLease> leases) {
    if (!super.getIsInited()) {
        logger.error("Not in leader mode, not launching tasks");
        return new HashMap<>();
    }

    final Protos.SlaveID agentId = leases.get(0).getOffer().getSlaveId();
    final List<Protos.OfferID> offerIds = new ArrayList<>();
    for (VirtualMachineLease lease : leases) {
        offerIds.add(lease.getOffer().getId());
    }

    final Map<ScheduleRequest, LaunchTaskException> buildFailures = new HashMap<>();
    final List<TaskInfo> tasksToLaunch = new ArrayList<>();
    for (LaunchTaskRequest launchRequest : requests) {
        try {
            tasksToLaunch.addAll(createTaskInfo(agentId, launchRequest));
        } catch (LaunchTaskException buildError) {
            // Remember the failure per request and keep going with the remaining ones.
            buildFailures.put(launchRequest.getScheduleRequest(), buildError);
        }
    }

    if (tasksToLaunch.isEmpty()) {
        // reject offers to prevent offer leak, but shouldn't happen
        for (VirtualMachineLease lease : leases) {
            mesosDriver.get().declineOffer(lease.getOffer().getId());
        }
    } else {
        mesosDriver.get().launchTasks(offerIds, tasksToLaunch);
    }
    return buildFailures;
}
Also used : ScheduleRequest(io.mantisrx.server.master.scheduler.ScheduleRequest) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) LaunchTaskRequest(io.mantisrx.server.master.scheduler.LaunchTaskRequest) LaunchTaskException(io.mantisrx.server.master.LaunchTaskException) VirtualMachineLease(com.netflix.fenzo.VirtualMachineLease) TaskInfo(org.apache.mesos.Protos.TaskInfo) Protos(org.apache.mesos.Protos)

Aggregations

ScheduleRequest (io.mantisrx.server.master.scheduler.ScheduleRequest): 7, ArrayList (java.util.ArrayList): 4, TaskAssignmentResult (com.netflix.fenzo.TaskAssignmentResult): 3, LaunchTaskRequest (io.mantisrx.server.master.scheduler.LaunchTaskRequest): 3, TaskRequest (com.netflix.fenzo.TaskRequest): 2, VirtualMachineLease (com.netflix.fenzo.VirtualMachineLease): 2, JobSla (io.mantisrx.runtime.JobSla): 2, MachineDefinition (io.mantisrx.runtime.MachineDefinition): 2, SchedulingInfo (io.mantisrx.runtime.descriptor.SchedulingInfo): 2, StageSchedulingInfo (io.mantisrx.runtime.descriptor.StageSchedulingInfo): 2, JobMetadata (io.mantisrx.server.core.domain.JobMetadata): 2, LaunchTaskException (io.mantisrx.server.master.LaunchTaskException): 2, JobDefinition (io.mantisrx.server.master.domain.JobDefinition): 2, TaskInfo (org.apache.mesos.Protos.TaskInfo): 2, ActorRef (akka.actor.ActorRef): 1, TestKit (akka.testkit.javadsl.TestKit): 1, ByteString (com.google.protobuf.ByteString): 1, VMAssignmentResult (com.netflix.fenzo.VMAssignmentResult): 1, TaskQueueException (com.netflix.fenzo.queues.TaskQueueException): 1, WorkerPorts (io.mantisrx.common.WorkerPorts): 1