use of io.mantisrx.server.master.scheduler.ScheduleRequest in project mantis by Netflix.
the class TestHelpers method createFakeScheduleRequest.
/**
 * Builds a {@link ScheduleRequest} backed by a minimal, single-stage job definition.
 *
 * <p>The job definition uses artifact name {@code "jar"}, one instance of the given machine
 * definition with no hard or soft constraints, and a perpetual-duration SLA.
 *
 * @param workerId worker the request is for; its job id is also used as the job's id.
 * @param stageNum stage number passed through to the request.
 * @param numStages total number of stages passed through to the request.
 * @param machineDefinition machine definition used for both the stage and the request.
 * @return the schedule request, or {@code null} if anything failed while building it
 *         (the stack trace is printed in that case).
 */
public static ScheduleRequest createFakeScheduleRequest(final WorkerId workerId, final int stageNum, final int numStages, final MachineDefinition machineDefinition) {
    try {
        final StageSchedulingInfo stageInfo = StageSchedulingInfo.builder()
                .numberOfInstances(1)
                .machineDefinition(machineDefinition)
                .hardConstraints(Collections.emptyList())
                .softConstraints(Collections.emptyList())
                .build();

        final JobDefinition fakeJobDefinition = new JobDefinition.Builder()
                .withArtifactName("jar")
                .withSchedulingInfo(new SchedulingInfo(Collections.singletonMap(0, stageInfo)))
                .withJobSla(new JobSla(0, 0, null, MantisJobDurationType.Perpetual, null))
                .build();

        final IMantisJobMetadata fakeJob = new MantisJobMetadataImpl.Builder()
                .withJobId(JobId.fromId(workerId.getJobId()).get())
                .withJobDefinition(fakeJobDefinition)
                .build();

        final JobMetadata jobMetadata = new JobMetadata(
                fakeJob.getJobId().getId(),
                fakeJob.getJobJarUrl(),
                fakeJob.getTotalStages(),
                fakeJob.getUser(),
                fakeJob.getSchedulingInfo(),
                fakeJob.getParameters(),
                fakeJob.getSubscriptionTimeoutSecs(),
                fakeJob.getMinRuntimeSecs());

        return new ScheduleRequest(
                workerId,
                stageNum,
                numStages,
                jobMetadata,
                fakeJob.getSla().get().getDurationType(),
                machineDefinition,
                Collections.emptyList(),
                Collections.emptyList(),
                0,
                Optional.empty());
    } catch (Exception e) {
        // Any failure is reported on stderr and signalled to the caller as null.
        e.printStackTrace();
        return null;
    }
}
use of io.mantisrx.server.master.scheduler.ScheduleRequest in project mantis by Netflix.
the class SchedulingService method schedulingResultHandler.
/**
 * Handles the result of a scheduling iteration.
 *
 * <p>In order, this method: records the callback time, logs and counts every exception carried
 * by the result, launches the assigned tasks per assignment result, routes a
 * {@link WorkerUnscheduleable} event for each failed request, updates the scheduler metrics,
 * requests the current VM states, and publishes job manager / Fenzo worker metrics.
 *
 * <p>Any exception raised along the way is logged and counted instead of being propagated.
 *
 * @param schedulingResult outcome of the scheduling iteration.
 */
private void schedulingResultHandler(SchedulingResult schedulingResult) {
    try {
        lastSchedulingResultCallback.set(System.currentTimeMillis());

        for (Exception schedulingError : schedulingResult.getExceptions()) {
            logger.error("Scheduling result got exception: {}", schedulingError.getMessage(), schedulingError);
            schedulingResultExceptions.increment();
        }

        int launchedCount = 0;
        SchedulerCounters.getInstance().incrementResourceAllocationTrials(schedulingResult.getNumAllocations());

        int assignmentResultCount = 0;
        final Map<String, VMAssignmentResult> vmAssignments = schedulingResult.getResultMap();
        if (vmAssignments != null) {
            assignmentResultCount = vmAssignments.size();
            final long launchTimestamp = System.currentTimeMillis();
            for (VMAssignmentResult vmAssignment : vmAssignments.values()) {
                final Collection<TaskAssignmentResult> assignedTasks = vmAssignment.getTasksAssigned();
                launchTasks(assignedTasks, vmAssignment.getLeasesUsed());
                for (TaskAssignmentResult assignedTask : assignedTasks) {
                    final ScheduleRequest launched = (ScheduleRequest) assignedTask.getRequest();
                    // Accepted-to-launched latency is only recorded when the registry knows the accept time.
                    workerRegistry.getAcceptedAt(launched.getWorkerId())
                            .ifPresent(acceptedAtTime -> workerAcceptedToLaunchedDistMs.recordValue(launchTimestamp - acceptedAtTime));
                    perWorkerSchedulingTimeMs.increment(launchTimestamp - launched.getReadyAt());
                }
                launchedCount += assignedTasks.size();
            }
        }

        // for workers that didn't get scheduled, rate limit them
        final Map<TaskRequest, List<TaskAssignmentResult>> failures = schedulingResult.getFailures();
        for (Map.Entry<TaskRequest, List<TaskAssignmentResult>> failure : failures.entrySet()) {
            final ScheduleRequest unscheduled = (ScheduleRequest) failure.getKey();
            final boolean routed = jobMessageRouter.routeWorkerEvent(
                    new WorkerUnscheduleable(unscheduled.getWorkerId(), unscheduled.getStageNum()));
            if (routed) {
                continue;
            }
            logger.warn("Failed to route {} WorkerUnscheduleable event", unscheduled.getWorkerId());
            if (logger.isTraceEnabled()) {
                logger.trace("Unscheduleable worker {} assignmentresults {}", unscheduled.getWorkerId(), failure.getValue());
            }
        }

        numWorkersLaunched.increment(launchedCount);
        numResourceOffersReceived.increment(schedulingResult.getLeasesAdded());
        numResourceAllocations.increment(schedulingResult.getNumAllocations());
        numResourceOffersRejected.increment(schedulingResult.getLeasesRejected());

        // Requested = launched in this iteration + those that failed to get an assignment.
        final int requestedWorkers = launchedCount + failures.size();
        workersToLaunch.set(requestedWorkers);
        pendingWorkers.set(failures.size());
        schedulerRunMillis.set(schedulingResult.getRuntime());
        totalActiveAgents.set(schedulingResult.getTotalVMsCount());
        numAgentsUsed.increment(assignmentResultCount);

        final int idleVMsCount = schedulingResult.getIdleVMsCount();
        idleAgents.set(idleVMsCount);

        final SchedulerCounters counters = SchedulerCounters.getInstance();
        counters.endIteration(requestedWorkers, launchedCount, assignmentResultCount, schedulingResult.getLeasesRejected());
        // Log a summary on every 10th iteration, and only when there was something to schedule.
        if (requestedWorkers > 0 && counters.getCounter().getIterationNumber() % 10 == 0) {
            logger.info("Scheduling iteration result: " + counters.toJsonString());
        }

        // Only log the idle machine count when it changes.
        if (idleVMsCount != idleMachinesCount.get()) {
            logger.info("Idle machines: " + idleVMsCount);
            idleMachinesCount.set(idleVMsCount);
        }

        try {
            taskSchedulingService.requestVmCurrentStates(vmCurrentStates -> {
                // Skip the refresh unless the last check is older than the configured interval.
                if (lastVmCurrentStatesCheckDone.get() >= (System.currentTimeMillis() - vmCurrentStatesCheckInterval)) {
                    return;
                }
                schedulingState.setVMCurrentState(vmCurrentStates);
                verifyAndReportResUsageMetrics(vmCurrentStates);
                lastVmCurrentStatesCheckDone.set(System.currentTimeMillis());
            });
        } catch (final TaskQueueException e) {
            logger.warn("got exception requesting VM states from Fenzo", e);
        }

        publishJobManagerAndFenzoWorkerMetrics();
    } catch (final Exception e) {
        logger.error("unexpected exception in scheduling result callback", e);
        schedulingCallbackExceptions.increment();
    }
}
use of io.mantisrx.server.master.scheduler.ScheduleRequest in project mantis by Netflix.
the class SchedulingService method launchTasks.
/**
 * Tries to launch the assigned tasks using the given leases from Mesos.
 *
 * <p>For every assignment that launches successfully:
 * <ol>
 *   <li>a {@link WorkerLaunched} event is emitted for the corresponding actor, and</li>
 *   <li>a launch task request is handed to the underlying Mesos driver.</li>
 * </ol>
 *
 * <p>An assignment does not launch when:
 * <ol>
 *   <li>{@link WorkerPorts} rejects the assigned ports (its preconditions fail): no launch
 *       task request is made for that assignment and the worker is proactively unscheduled;</li>
 *   <li>the {@link WorkerLaunched} event cannot be routed: the worker is unscheduled for this
 *       launch task request.</li>
 * </ol>
 *
 * <p>If no launch task request remains, every lease is rejected; eventually the underlying
 * Mesos driver declines the offers because there are no launch task requests.
 *
 * @param requests collection of assignment results received by the scheduler.
 * @param leases list of resource offers from Mesos.
 */
private void launchTasks(Collection<TaskAssignmentResult> requests, List<VirtualMachineLease> leases) {
    final List<LaunchTaskRequest> launchable = new ArrayList<>();

    for (TaskAssignmentResult assignment : requests) {
        final ScheduleRequest scheduleRequest = (ScheduleRequest) assignment.getRequest();

        WorkerPorts ports;
        try {
            ports = new WorkerPorts(assignment.getAssignedPorts());
        } catch (IllegalArgumentException | IllegalStateException e) {
            logger.error("problem launching tasks for assignment result {}: {}", assignment, e);
            numMissingWorkerPorts.increment();
            ports = null;
        }

        // Host details reported for the worker are always taken from the first lease.
        final VirtualMachineLease firstLease = leases.get(0);

        // The launched event is only attempted when usable ports were obtained.
        final boolean launchedEventRouted = ports != null
                && jobMessageRouter.routeWorkerEvent(new WorkerLaunched(
                        scheduleRequest.getWorkerId(),
                        scheduleRequest.getStageNum(),
                        firstLease.hostname(),
                        firstLease.getVMID(),
                        getAttribute(firstLease, slaveClusterAttributeName),
                        ports));

        if (launchedEventRouted) {
            launchable.add(new LaunchTaskRequest(scheduleRequest, ports));
        } else {
            unscheduleWorker(scheduleRequest.getWorkerId(), Optional.ofNullable(firstLease.hostname()));
        }
    }

    if (launchable.isEmpty()) {
        for (VirtualMachineLease unusedLease : leases) {
            virtualMachineService.rejectLease(unusedLease);
        }
    }

    final Map<ScheduleRequest, LaunchTaskException> launchErrors = virtualMachineService.launchTasks(launchable, leases);

    // Report a launch failure for every request the VM service returned an error for.
    for (TaskAssignmentResult assignment : requests) {
        final ScheduleRequest failedCandidate = (ScheduleRequest) assignment.getRequest();
        if (!launchErrors.containsKey(failedCandidate)) {
            continue;
        }
        final String errorMessage = getWorkerStringPrefix(failedCandidate.getStageNum(), failedCandidate.getWorkerId())
                + " failed due to " + launchErrors.get(failedCandidate).getMessage();
        final boolean routed = jobMessageRouter.routeWorkerEvent(
                new WorkerLaunchFailed(failedCandidate.getWorkerId(), failedCandidate.getStageNum(), errorMessage));
        if (!routed) {
            logger.warn("Failed to route WorkerLaunchFailed for {} (err {})", failedCandidate.getWorkerId(), errorMessage);
        }
    }
}
use of io.mantisrx.server.master.scheduler.ScheduleRequest in project mantis by Netflix.
the class VirtualMachineMasterServiceMesosImpl method createTaskInfo.
/**
 * Builds the Mesos {@link TaskInfo} for a single launch task request.
 *
 * <p>The task carries scalar {@code cpus}, {@code mem}, {@code disk} and {@code network}
 * resources taken from the request's machine definition, one single-port {@code ports} range
 * per assigned port, the Mantis worker executor, and the JSON-serialized
 * {@link ExecuteStageRequest} as its data payload.
 *
 * @param slaveID id of the agent the task is built for.
 * @param launchTaskRequest request holding the schedule request and the assigned ports.
 * @return a collection containing exactly one {@link TaskInfo}.
 * @throws LaunchTaskException if JSON processing fails while building the task.
 */
private Collection<TaskInfo> createTaskInfo(Protos.SlaveID slaveID, final LaunchTaskRequest launchTaskRequest) throws LaunchTaskException {
    final ScheduleRequest scheduleRequest = launchTaskRequest.getScheduleRequest();
    final String name = scheduleRequest.getWorkerId().getJobCluster()
            + " (stage: " + scheduleRequest.getStageNum()
            + " of " + scheduleRequest.getJobMetadata().getTotalStages() + ")";
    final TaskID taskId = TaskID.newBuilder().setValue(scheduleRequest.getWorkerId().getId()).build();
    final MachineDefinition machineDefinition = scheduleRequest.getMachineDefinition();
    // grab ports within range
    final List<Integer> ports = launchTaskRequest.getPorts().getAllPorts();

    final TaskInfo taskInfo;
    try {
        // NOTE(review): the min-runtime argument below subtracts
        // (currentTimeMillis - minRuntimeSecs) from minRuntimeSecs, which mixes an epoch
        // millisecond value with a seconds value. Kept exactly as-is; confirm the intent.
        final ExecuteStageRequest executeStageRequest = new ExecuteStageRequest(
                scheduleRequest.getWorkerId().getJobCluster(),
                scheduleRequest.getWorkerId().getJobId(),
                scheduleRequest.getWorkerId().getWorkerIndex(),
                scheduleRequest.getWorkerId().getWorkerNum(),
                scheduleRequest.getJobMetadata().getJobJarUrl(),
                scheduleRequest.getStageNum(),
                scheduleRequest.getJobMetadata().getTotalStages(),
                ports,
                getTimeoutSecsToReportStart(),
                launchTaskRequest.getPorts().getMetricsPort(),
                scheduleRequest.getJobMetadata().getParameters(),
                scheduleRequest.getJobMetadata().getSchedulingInfo(),
                scheduleRequest.getDurationType(),
                scheduleRequest.getJobMetadata().getSubscriptionTimeoutSecs(),
                scheduleRequest.getJobMetadata().getMinRuntimeSecs() - (System.currentTimeMillis() - scheduleRequest.getJobMetadata().getMinRuntimeSecs()),
                launchTaskRequest.getPorts());

        final TaskInfo.Builder builder = TaskInfo.newBuilder();
        builder.setName(name).setTaskId(taskId).setSlaveId(slaveID);

        // Scalar resources, taken from the machine definition.
        builder.addResources(Resource.newBuilder().setName("cpus").setType(Value.Type.SCALAR)
                .setScalar(Value.Scalar.newBuilder().setValue(machineDefinition.getCpuCores())));
        builder.addResources(Resource.newBuilder().setName("mem").setType(Value.Type.SCALAR)
                .setScalar(Value.Scalar.newBuilder().setValue(machineDefinition.getMemoryMB())));
        builder.addResources(Resource.newBuilder().setName("disk").setType(Value.Type.SCALAR)
                .setScalar(Value.Scalar.newBuilder().setValue(machineDefinition.getDiskMB())));
        builder.addResources(Resource.newBuilder().setName("network").setType(Value.Type.SCALAR)
                .setScalar(Value.Scalar.newBuilder().setValue(machineDefinition.getNetworkMbps())));

        builder.setExecutor(createMantisWorkerExecutor(
                executeStageRequest, launchTaskRequest, machineDefinition.getMemoryMB(), machineDefinition.getCpuCores()));
        builder.setData(ByteString.copyFrom(mapper.writeValueAsBytes(executeStageRequest)));

        // add ports: each assigned port becomes its own single-value range
        for (Integer port : ports) {
            builder.addResources(Resource.newBuilder().setName("ports").setType(Value.Type.RANGES)
                    .setRanges(Value.Ranges.newBuilder().addRange(Value.Range.newBuilder().setBegin(port).setEnd(port))));
        }

        taskInfo = builder.build();
    } catch (JsonProcessingException e) {
        throw new LaunchTaskException("Failed to build a TaskInfo instance: " + e.getMessage(), e);
    }

    final List<TaskInfo> tasks = new ArrayList<>(1);
    tasks.add(taskInfo);
    return tasks;
}
use of io.mantisrx.server.master.scheduler.ScheduleRequest in project mantis by Netflix.
the class VirtualMachineMasterServiceMesosImpl method launchTasks.
/**
 * Launches the given task requests on Mesos using the supplied leases.
 *
 * <p>When this service is not initialized, nothing is launched and an empty map is returned.
 * Otherwise a task is built for each request; requests whose task cannot be built are
 * reported in the returned map. If at least one task was built, all of them are launched
 * against the offers of all leases; if none was built, every offer is declined.
 *
 * <p>The agent id is read from the first lease, so {@code leases} is expected to be non-empty.
 *
 * @param requests launch task requests to turn into Mesos tasks.
 * @param leases leases whose offers are used for the launch.
 * @return schedule requests that failed to launch, mapped to the failure.
 */
// NOTE: All leases are for the same agent.
@Override
public Map<ScheduleRequest, LaunchTaskException> launchTasks(List<LaunchTaskRequest> requests, List<VirtualMachineLease> leases) {
    if (!super.getIsInited()) {
        logger.error("Not in leader mode, not launching tasks");
        return new HashMap<>();
    }

    final Protos.SlaveID slaveID = leases.get(0).getOffer().getSlaveId();

    final List<Protos.OfferID> offerIDs = new ArrayList<>();
    for (VirtualMachineLease lease : leases) {
        offerIDs.add(lease.getOffer().getId());
    }

    final Map<ScheduleRequest, LaunchTaskException> errorResults = new HashMap<>();
    final List<TaskInfo> taskInfos = new ArrayList<>();
    for (LaunchTaskRequest launchRequest : requests) {
        try {
            taskInfos.addAll(createTaskInfo(slaveID, launchRequest));
        } catch (LaunchTaskException e) {
            errorResults.put(launchRequest.getScheduleRequest(), e);
        }
    }

    if (taskInfos.isEmpty()) {
        // reject offers to prevent offer leak, but shouldn't happen
        for (VirtualMachineLease lease : leases) {
            mesosDriver.get().declineOffer(lease.getOffer().getId());
        }
    } else {
        mesosDriver.get().launchTasks(offerIDs, taskInfos);
    }

    return errorResults;
}
Aggregations