use of io.mantisrx.server.core.domain.WorkerId in project mantis by Netflix.
the class WorkerRegistryV2Test method testIsWorkerValid.
/**
 * Initialises a fresh registry via {@code initRegistryWithWorkers} for the job id string
 * "testIsWorkerValid-1" with a count of 5, then asserts that {@code isWorkerValid}
 * returns true for each {@code WorkerId(jobId, i, i + 5)} with i in [0, 5).
 */
@Test
public void testIsWorkerValid() {
    final int workerCount = 5;
    final JobId jobId = new JobId("testIsWorkerValid", 1);
    final WorkerRegistryV2 registry = new WorkerRegistryV2();
    initRegistryWithWorkers(registry, "testIsWorkerValid-1", workerCount);

    int index = 0;
    while (index < workerCount) {
        // Third constructor argument is the index shifted by 5, as the helper is expected
        // to have registered it (helper is not visible here — verify against its definition).
        final WorkerId candidate = new WorkerId(jobId.getId(), index, index + 5);
        assertTrue(registry.isWorkerValid(candidate));
        index++;
    }
}
use of io.mantisrx.server.core.domain.WorkerId in project mantis by Netflix.
the class WorkerRegistryV2Test method testJobScaleUp.
/**
 * Submits a single-stage scalable job, scales stage 1 to 2 workers, and checks that:
 * <ul>
 *   <li>the scale request is answered with SUCCESS and an actual worker count of 2,</li>
 *   <li>the job details report 2 workers for stage 1, and</li>
 *   <li>the worker registry eventually reports 3 running workers.</li>
 * </ul>
 *
 * @throws Exception if job submission or any of the actor interactions fail
 */
@Test
public void testJobScaleUp() throws Exception, InvalidJobException, io.mantisrx.runtime.command.InvalidJobException {
    WorkerRegistryV2 workerRegistryV2 = new WorkerRegistryV2();
    // The registry is wired in as the worker-event subscriber, so worker counts in the
    // registry are driven by events published through this publisher.
    LifecycleEventPublisher eventPublisher = new LifecycleEventPublisherImpl(
            new AuditEventSubscriberLoggingImpl(),
            new StatusEventSubscriberLoggingImpl(),
            new DummyWorkerEventSubscriberImpl(workerRegistryV2));

    Map<StageScalingPolicy.ScalingReason, StageScalingPolicy.Strategy> smap = new HashMap<>();
    smap.put(StageScalingPolicy.ScalingReason.CPU,
            new StageScalingPolicy.Strategy(StageScalingPolicy.ScalingReason.CPU, 0.5, 0.75, null));
    smap.put(StageScalingPolicy.ScalingReason.DataDrop,
            new StageScalingPolicy.Strategy(StageScalingPolicy.ScalingReason.DataDrop, 0.0, 2.0, null));
    SchedulingInfo sInfo = new SchedulingInfo.Builder()
            .numberOfStages(1)
            .multiWorkerScalableStageWithConstraints(
                    1,
                    new MachineDefinition(1.0, 1.0, 1.0, 3),
                    Lists.newArrayList(),
                    Lists.newArrayList(),
                    new StageScalingPolicy(1, 0, 10, 1, 1, 0, smap))
            .build();

    String clusterName = "testJobScaleUp";
    MantisScheduler schedulerMock = mock(MantisScheduler.class);
    MantisJobStore jobStoreMock = mock(MantisJobStore.class);
    ActorRef jobActor = JobTestHelper.submitSingleStageScalableJob(
            system, probe, clusterName, sInfo, schedulerMock, jobStoreMock, eventPublisher);
    assertEquals(2, workerRegistryV2.getNumRunningWorkers());

    // send scale up request
    jobActor.tell(new JobClusterManagerProto.ScaleStageRequest(clusterName + "-1", 1, 2, "", ""), probe.getRef());
    JobClusterManagerProto.ScaleStageResponse scaleResp =
            probe.expectMsgClass(JobClusterManagerProto.ScaleStageResponse.class);
    System.out.println("ScaleupResp " + scaleResp.message);
    assertEquals(SUCCESS, scaleResp.responseCode);
    assertEquals(2, scaleResp.getActualNumWorkers());

    JobTestHelper.sendLaunchedInitiatedStartedEventsToWorker(
            probe, jobActor, clusterName + "-1", 0, new WorkerId(clusterName + "-1", 1, 3));
    jobActor.tell(new JobClusterManagerProto.GetJobDetailsRequest("user", new JobId(clusterName, 1)), probe.getRef());
    JobClusterManagerProto.GetJobDetailsResponse resp =
            probe.expectMsgClass(JobClusterManagerProto.GetJobDetailsResponse.class);
    Map<Integer, ? extends IMantisStageMetadata> stageMetadata = resp.getJobMetadata().get().getStageMetadata();
    assertEquals(2, stageMetadata.get(1).getAllWorkers().size());

    // Poll the registry with a short pause between attempts. The previous loop spun 50 times
    // back-to-back, giving an asynchronously delivered worker event essentially no time to
    // arrive, and it also failed when the count was reached on the very last attempt.
    // NOTE(review): assumes registry updates may lag the actor response — confirm.
    final int maxAttempts = 50;
    final long pauseMillis = 100L;
    boolean reachedExpectedCount = false;
    for (int attempt = 0; attempt < maxAttempts; attempt++) {
        if (workerRegistryV2.getNumRunningWorkers() == 3) {
            reachedExpectedCount = true;
            break;
        }
        Thread.sleep(pauseMillis);
    }
    assertTrue(reachedExpectedCount);
}
use of io.mantisrx.server.core.domain.WorkerId in project mantis by Netflix.
the class SchedulingService method verifyAndReportResUsageMetrics.
/**
 * Aggregates CPU, memory and network figures across the given VM states, publishes them to
 * the corresponding gauges, and removes running tasks that no longer map to a valid worker.
 *
 * <p>For every VM, the currently available resources (when present) are added to the totals.
 * For every running task, the task id is parsed into a {@code WorkerId}; if parsing fails or
 * the worker registry does not consider the worker valid, the task is removed through the
 * task scheduling service. Otherwise the task's resources are counted as both used and total.
 *
 * <p>The dominant resource utilization is the largest of the three utilization percentages.
 *
 * @param vmCurrentStates current per-VM states to aggregate
 */
private void verifyAndReportResUsageMetrics(List<VirtualMachineCurrentState> vmCurrentStates) {
    double totalCPU = 0.0;
    double usedCPU = 0.0;
    double totalMemory = 0.0;
    double usedMemory = 0.0;
    double totalNwMbps = 0.0;
    double usedNwMbps = 0.0;
    for (VirtualMachineCurrentState state : vmCurrentStates) {
        final VirtualMachineLease currAvailableResources = state.getCurrAvailableResources();
        if (currAvailableResources != null) {
            totalCPU += currAvailableResources.cpuCores();
            totalMemory += currAvailableResources.memoryMB();
            totalNwMbps += currAvailableResources.networkMbps();
        }
        final Collection<TaskRequest> runningTasks = state.getRunningTasks();
        if (runningTasks != null) {
            for (TaskRequest t : runningTasks) {
                Optional<WorkerId> workerId = WorkerId.fromId(t.getId());
                if (!workerId.isPresent() || !workerRegistry.isWorkerValid(workerId.get())) {
                    // Task is unknown to the registry (or its id is unparseable): drop it.
                    taskSchedulingService.removeTask(t.getId(), DEFAULT_Q_ATTRIBUTES, state.getHostname());
                } else {
                    // Resources held by a valid task are in use and also part of the total capacity.
                    usedCPU += t.getCPUs();
                    totalCPU += t.getCPUs();
                    usedMemory += t.getMemory();
                    totalMemory += t.getMemory();
                    usedNwMbps += t.getNetworkMbps();
                    totalNwMbps += t.getNetworkMbps();
                }
            }
        }
    }

    final double cpuPercent = utilizationPercent(usedCPU, totalCPU);
    final double memoryPercent = utilizationPercent(usedMemory, totalMemory);
    final double networkPercent = utilizationPercent(usedNwMbps, totalNwMbps);

    totalAvailableCPUs.set((long) totalCPU);
    totalAllocatedCPUs.set((long) usedCPU);
    cpuUtilization.set((long) cpuPercent);

    totalAvailableMemory.set((long) totalMemory);
    totalAllocatedMemory.set((long) usedMemory);
    memoryUtilization.set((long) memoryPercent);

    totalAvailableNwMbps.set((long) totalNwMbps);
    totalAllocatedNwMbps.set((long) usedNwMbps);
    networkUtilization.set((long) networkPercent);

    // With the zero-guarded percentages no operand can be NaN, so a resource with zero total
    // capacity no longer wipes out the dominant utilization of the other resources.
    final double dominantPercent = Math.max(cpuPercent, Math.max(memoryPercent, networkPercent));
    dominantResUtilization.set((long) dominantPercent);
}

/**
 * Returns {@code used} as a percentage of {@code total}, or 0 when {@code total} is not
 * positive. Guarding here avoids {@code 0.0 / 0.0 == NaN}, which {@code Math.max} would
 * propagate and which narrows to 0 when cast to {@code long}.
 */
private static double utilizationPercent(double used, double total) {
    if (total <= 0.0) {
        return 0.0;
    }
    return used * 100.0 / total;
}
use of io.mantisrx.server.core.domain.WorkerId in project mantis by Netflix.
the class MesosSchedulerCallbackHandler method statusUpdate.
/**
 * Mesos callback for a task status change. Parses the task id into a {@code WorkerId},
 * maps the Mesos task state onto a {@code VMResourceState}, and routes the resulting
 * {@code WorkerResourceStatus} through the job message router.
 *
 * <p>Task ids that cannot be parsed are logged at error level and ignored. Task states
 * without an explicit mapping are logged at warn level and ignored.
 *
 * @param arg0 the scheduler driver (unused here)
 * @param arg1 the task status reported by Mesos
 */
@Override
public void statusUpdate(final SchedulerDriver arg0, TaskStatus arg1) {
    final String taskId = arg1.getTaskId().getValue();
    final Optional<WorkerId> parsedWorkerId = WorkerId.fromId(taskId);
    logger.debug("Task status update: ({}) state: {}({}) - {}", taskId, arg1.getState(), arg1.getState().getNumber(), arg1.getMessage());

    if (!parsedWorkerId.isPresent()) {
        logger.error("Failed to parse workerId from Mesos task update {}", taskId);
        return;
    }

    final String mesg = "Mesos task " + arg1.getState() + "-" + arg1.getMessage();
    final VMResourceState resourceState;
    switch (arg1.getState()) {
        case TASK_FAILED:
        case TASK_LOST:
            resourceState = VMResourceState.FAILED;
            break;
        case TASK_FINISHED:
            resourceState = VMResourceState.COMPLETED;
            break;
        case TASK_RUNNING:
            resourceState = VMResourceState.STARTED;
            break;
        case TASK_STAGING:
        case TASK_STARTING:
            resourceState = VMResourceState.START_INITIATED;
            break;
        default:
            // No mapping for this state: report it and do not route anything.
            logger.warn("Unexpected Mesos task state " + arg1.getState());
            return;
    }

    final WorkerId workerId = parsedWorkerId.get();
    jobMessageRouter.routeWorkerEvent(new WorkerResourceStatus(workerId, mesg, resourceState));
}
use of io.mantisrx.server.core.domain.WorkerId in project mantis by Netflix.
the class MesosSchedulerCallbackHandler method reconcileTasksKnownToUs.
/**
 * Asks the Mesos driver to reconcile every worker the registry currently reports as running.
 *
 * <p>Builds one {@code TASK_RUNNING} status per (worker id, slave id) mapping and submits the
 * batch via {@code driver.reconcileTasks}. Nothing is sent when there are no mappings. If the
 * driver answers with {@code DRIVER_ABORTED} or {@code DRIVER_STOPPED}, the process exits
 * with status 2.
 *
 * @param driver the Mesos scheduler driver used to issue the reconcile request
 */
private void reconcileTasksKnownToUs(SchedulerDriver driver) {
    final List<TaskStatus> reconcileBatch = new ArrayList<>();
    for (Map.Entry<WorkerId, String> mapping : workerRegistry.getAllRunningWorkerSlaveIdMappings().entrySet()) {
        final String workerIdString = mapping.getKey().getId();
        final String slaveIdString = mapping.getValue();
        if (logger.isDebugEnabled()) {
            logger.debug("reconcile running worker mapping {} -> {}", workerIdString, slaveIdString);
        }
        final Protos.TaskID taskId = Protos.TaskID.newBuilder().setValue(workerIdString).build();
        final SlaveID slaveId = SlaveID.newBuilder().setValue(slaveIdString).build();
        final TaskStatus runningStatus = TaskStatus.newBuilder()
                .setTaskId(taskId)
                .setState(Protos.TaskState.TASK_RUNNING)
                .setSlaveId(slaveId)
                .build();
        reconcileBatch.add(runningStatus);
    }

    if (reconcileBatch.isEmpty()) {
        return;
    }

    final Protos.Status status = driver.reconcileTasks(reconcileBatch);
    numReconcileTasks.increment();
    logger.info("Sent request to reconcile " + reconcileBatch.size() + " tasks, status=" + status);

    final long secsSinceLastOffer = (System.currentTimeMillis() - lastOfferReceivedAt.get()) / 1000;
    logger.info("Last offer received " + secsSinceLastOffer + " secs ago");
    final long secsSinceLastValidOffer = (System.currentTimeMillis() - lastValidOfferReceivedAt.get()) / 1000;
    logger.info("Last valid offer received " + secsSinceLastValidOffer + " secs ago");

    // A stopped or aborted driver cannot recover from here; terminate the process.
    if (status == Protos.Status.DRIVER_ABORTED || status == Protos.Status.DRIVER_STOPPED) {
        logger.error("Unexpected to see Mesos driver status of " + status + " from reconcile request. Committing suicide!");
        System.exit(2);
    }
}
Aggregations