Search in sources :

Example 1 with WorkerId

use of io.mantisrx.server.core.domain.WorkerId in project mantis by Netflix.

the class WorkerRegistryV2Test method testIsWorkerValid.

/**
 * Verifies that the registry reports workers of job "testIsWorkerValid-1" as valid.
 *
 * <p>The registry is populated through {@code initRegistryWithWorkers} (defined elsewhere in the
 * test class) and then queried with five {@link WorkerId}s whose second argument runs 0..4 and
 * whose third argument is that value plus 5.
 */
@Test
public void testIsWorkerValid() {
    final int workerCount = 5;
    final JobId jobId = new JobId("testIsWorkerValid", 1);
    final WorkerRegistryV2 registry = new WorkerRegistryV2();
    initRegistryWithWorkers(registry, "testIsWorkerValid-1", workerCount);

    int index = 0;
    while (index < workerCount) {
        // Presumably mirrors the index/number scheme used by initRegistryWithWorkers — confirm there.
        final WorkerId candidate = new WorkerId(jobId.getId(), index, index + 5);
        assertTrue(registry.isWorkerValid(candidate));
        index++;
    }
}
Also used : WorkerId(io.mantisrx.server.core.domain.WorkerId) JobId(io.mantisrx.server.master.domain.JobId) Test(org.junit.Test)

Example 2 with WorkerId

use of io.mantisrx.server.core.domain.WorkerId in project mantis by Netflix.

the class WorkerRegistryV2Test method testJobScaleUp.

/**
 * Submits a single-stage scalable job, sends a scale request for stage 1 to the job actor and
 * checks the scale response, the stage metadata returned by a job-details query, and the
 * running-worker count reported by the {@code WorkerRegistryV2}.
 */
@Test
public void testJobScaleUp() throws Exception, InvalidJobException, io.mantisrx.runtime.command.InvalidJobException {
    WorkerRegistryV2 workerRegistryV2 = new WorkerRegistryV2();
    // The registry is passed to the worker-event subscriber; presumably that is how it learns about
    // worker state changes published by the job actor — confirm in DummyWorkerEventSubscriberImpl.
    LifecycleEventPublisher eventPublisher = new LifecycleEventPublisherImpl(new AuditEventSubscriberLoggingImpl(), new StatusEventSubscriberLoggingImpl(), new DummyWorkerEventSubscriberImpl(workerRegistryV2));
    // Scaling strategies keyed by reason (CPU and DataDrop) for the stage's scaling policy.
    Map<StageScalingPolicy.ScalingReason, StageScalingPolicy.Strategy> smap = new HashMap<>();
    smap.put(StageScalingPolicy.ScalingReason.CPU, new StageScalingPolicy.Strategy(StageScalingPolicy.ScalingReason.CPU, 0.5, 0.75, null));
    smap.put(StageScalingPolicy.ScalingReason.DataDrop, new StageScalingPolicy.Strategy(StageScalingPolicy.ScalingReason.DataDrop, 0.0, 2.0, null));
    // One scalable stage carrying the scaling policy built from the strategies above.
    SchedulingInfo sInfo = new SchedulingInfo.Builder().numberOfStages(1).multiWorkerScalableStageWithConstraints(1, new MachineDefinition(1.0, 1.0, 1.0, 3), Lists.newArrayList(), Lists.newArrayList(), new StageScalingPolicy(1, 0, 10, 1, 1, 0, smap)).build();
    String clusterName = "testJobScaleUp";
    MantisScheduler schedulerMock = mock(MantisScheduler.class);
    MantisJobStore jobStoreMock = mock(MantisJobStore.class);
    ActorRef jobActor = JobTestHelper.submitSingleStageScalableJob(system, probe, clusterName, sInfo, schedulerMock, jobStoreMock, eventPublisher);
    // Baseline: two running workers are expected right after submission.
    assertEquals(2, workerRegistryV2.getNumRunningWorkers());
    // send scale up request
    // The job id used throughout is clusterName + "-1"; presumably the arguments 1 and 2 are the
    // stage number and the requested worker count — confirm against ScaleStageRequest.
    jobActor.tell(new JobClusterManagerProto.ScaleStageRequest(clusterName + "-1", 1, 2, "", ""), probe.getRef());
    JobClusterManagerProto.ScaleStageResponse scaleResp = probe.expectMsgClass(JobClusterManagerProto.ScaleStageResponse.class);
    System.out.println("ScaleupResp " + scaleResp.message);
    assertEquals(SUCCESS, scaleResp.responseCode);
    assertEquals(2, scaleResp.getActualNumWorkers());
    // Drive the worker identified by WorkerId(jobId, 1, 3) through its launched/initiated/started events.
    JobTestHelper.sendLaunchedInitiatedStartedEventsToWorker(probe, jobActor, clusterName + "-1", 0, new WorkerId(clusterName + "-1", 1, 3));
    // Query the job details and check that stage 1 now lists two workers.
    jobActor.tell(new JobClusterManagerProto.GetJobDetailsRequest("user", new JobId(clusterName, 1)), probe.getRef());
    JobClusterManagerProto.GetJobDetailsResponse resp = probe.expectMsgClass(JobClusterManagerProto.GetJobDetailsResponse.class);
    Map<Integer, ? extends IMantisStageMetadata> stageMetadata = resp.getJobMetadata().get().getStageMetadata();
    assertEquals(2, stageMetadata.get(1).getAllWorkers().size());
    // Checks up to 50 times whether the registry reports 3 running workers; the assertion below
    // passes only if that count is observed before the 50th check.
    // NOTE(review): there is no sleep or wait between iterations, so this loop does not actually
    // give asynchronous registry updates time to arrive — confirm whether a delay was intended.
    int cnt = 0;
    for (int i = 0; i < 50; i++) {
        cnt++;
        if (workerRegistryV2.getNumRunningWorkers() == 3) {
            break;
        }
    }
    assertTrue(cnt < 50);
}
Also used : ActorRef(akka.actor.ActorRef) MantisScheduler(io.mantisrx.server.master.scheduler.MantisScheduler) JobId(io.mantisrx.server.master.domain.JobId) JobClusterManagerProto(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto) SchedulingInfo(io.mantisrx.runtime.descriptor.SchedulingInfo) MachineDefinition(io.mantisrx.runtime.MachineDefinition) WorkerId(io.mantisrx.server.core.domain.WorkerId) StageScalingPolicy(io.mantisrx.runtime.descriptor.StageScalingPolicy) MantisJobStore(io.mantisrx.server.master.persistence.MantisJobStore) Test(org.junit.Test)

Example 3 with WorkerId

use of io.mantisrx.server.core.domain.WorkerId in project mantis by Netflix.

the class SchedulingService method verifyAndReportResUsageMetrics.

/**
 * Aggregates CPU, memory and network usage across the given VM states, publishes the totals and
 * utilization percentages to the corresponding gauges, and removes tasks that do not belong to a
 * valid worker.
 *
 * <p>For every VM, the currently available resources (when present) count toward the totals. For
 * every running task whose id parses to a {@link WorkerId} that {@code workerRegistry} considers
 * valid, the task's resources count toward both "used" and "total". Any other running task is
 * removed from {@code taskSchedulingService} and is not counted.
 *
 * <p>Utilization percentages are computed with a zero-safe division: a resource whose total is
 * zero reports 0% instead of {@code NaN}. Previously a single {@code NaN} (for example when no
 * network bandwidth was reported at all) propagated through {@link Math#max} and forced the
 * dominant resource utilization gauge to 0 regardless of CPU or memory usage.
 *
 * @param vmCurrentStates current state of every VM known to the scheduler
 */
private void verifyAndReportResUsageMetrics(List<VirtualMachineCurrentState> vmCurrentStates) {
    double totalCPU = 0.0;
    double usedCPU = 0.0;
    double totalMemory = 0.0;
    double usedMemory = 0.0;
    double totalNwMbps = 0.0;
    double usedNwMbps = 0.0;
    for (VirtualMachineCurrentState state : vmCurrentStates) {
        final VirtualMachineLease currAvailableResources = state.getCurrAvailableResources();
        if (currAvailableResources != null) {
            totalCPU += currAvailableResources.cpuCores();
            totalMemory += currAvailableResources.memoryMB();
            totalNwMbps += currAvailableResources.networkMbps();
        }
        final Collection<TaskRequest> runningTasks = state.getRunningTasks();
        if (runningTasks != null) {
            for (TaskRequest t : runningTasks) {
                Optional<WorkerId> workerId = WorkerId.fromId(t.getId());
                if (!workerId.isPresent() || !workerRegistry.isWorkerValid(workerId.get())) {
                    // Task id is unparseable or unknown to the registry: drop it from the scheduler.
                    taskSchedulingService.removeTask(t.getId(), DEFAULT_Q_ATTRIBUTES, state.getHostname());
                } else {
                    // Resources held by a running task are both in use and part of the overall capacity.
                    usedCPU += t.getCPUs();
                    totalCPU += t.getCPUs();
                    usedMemory += t.getMemory();
                    totalMemory += t.getMemory();
                    usedNwMbps += t.getNetworkMbps();
                    totalNwMbps += t.getNetworkMbps();
                }
            }
        }
    }
    final double cpuPercent = utilizationPercentOrZero(usedCPU, totalCPU);
    final double memoryPercent = utilizationPercentOrZero(usedMemory, totalMemory);
    final double nwPercent = utilizationPercentOrZero(usedNwMbps, totalNwMbps);

    totalAvailableCPUs.set((long) totalCPU);
    totalAllocatedCPUs.set((long) usedCPU);
    cpuUtilization.set((long) cpuPercent);
    totalAvailableMemory.set((long) totalMemory);
    totalAllocatedMemory.set((long) usedMemory);
    memoryUtilization.set((long) memoryPercent);
    totalAvailableNwMbps.set((long) totalNwMbps);
    totalAllocatedNwMbps.set((long) usedNwMbps);
    networkUtilization.set((long) nwPercent);
    // Dominant resource utilization is the highest of the three per-resource percentages.
    dominantResUtilization.set((long) Math.max(cpuPercent, Math.max(memoryPercent, nwPercent)));
}

/**
 * Returns {@code used} as a percentage of {@code total}, or 0 when {@code total} is not positive,
 * so that an absent resource never yields {@code NaN} or an infinite value.
 */
private double utilizationPercentOrZero(double used, double total) {
    return total > 0.0 ? used * 100.0 / total : 0.0;
}
Also used : VirtualMachineCurrentState(com.netflix.fenzo.VirtualMachineCurrentState) TaskRequest(com.netflix.fenzo.TaskRequest) LaunchTaskRequest(io.mantisrx.server.master.scheduler.LaunchTaskRequest) VirtualMachineLease(com.netflix.fenzo.VirtualMachineLease) WorkerId(io.mantisrx.server.core.domain.WorkerId)

Example 4 with WorkerId

use of io.mantisrx.server.core.domain.WorkerId in project mantis by Netflix.

the class MesosSchedulerCallbackHandler method statusUpdate.

/**
 * Handles a Mesos task status update by translating it into a {@link WorkerResourceStatus} and
 * handing it to {@code jobMessageRouter}.
 *
 * <p>The Mesos task id is parsed into a {@link WorkerId}; if parsing fails an error is logged and
 * nothing is routed. Task states map as follows: FAILED/LOST to {@code FAILED}, FINISHED to
 * {@code COMPLETED}, RUNNING to {@code STARTED}, STAGING/STARTING to {@code START_INITIATED}.
 * Any other state is logged as a warning and ignored.
 *
 * @param arg0 the scheduler driver delivering the update (not used here)
 * @param arg1 the task status reported by Mesos
 */
@Override
public void statusUpdate(final SchedulerDriver arg0, TaskStatus arg1) {
    final String taskIdValue = arg1.getTaskId().getValue();
    final Optional<WorkerId> parsedWorkerId = WorkerId.fromId(taskIdValue);
    logger.debug("Task status update: ({}) state: {}({}) - {}", taskIdValue, arg1.getState(), arg1.getState().getNumber(), arg1.getMessage());

    // Guard: without a parseable worker id there is nobody to route the event to.
    if (!parsedWorkerId.isPresent()) {
        logger.error("Failed to parse workerId from Mesos task update {}", taskIdValue);
        return;
    }

    final VMResourceState resourceState;
    switch (arg1.getState()) {
        case TASK_STAGING:
        case TASK_STARTING:
            resourceState = VMResourceState.START_INITIATED;
            break;
        case TASK_RUNNING:
            resourceState = VMResourceState.STARTED;
            break;
        case TASK_FINISHED:
            resourceState = VMResourceState.COMPLETED;
            break;
        case TASK_FAILED:
        case TASK_LOST:
            resourceState = VMResourceState.FAILED;
            break;
        default:
            logger.warn("Unexpected Mesos task state " + arg1.getState());
            return;
    }

    final String statusMessage = "Mesos task " + arg1.getState() + "-" + arg1.getMessage();
    final WorkerResourceStatus workerEvent = new WorkerResourceStatus(parsedWorkerId.get(), statusMessage, resourceState);
    jobMessageRouter.routeWorkerEvent(workerEvent);
}
Also used : VMResourceState(io.mantisrx.server.master.scheduler.WorkerResourceStatus.VMResourceState) WorkerId(io.mantisrx.server.core.domain.WorkerId) WorkerResourceStatus(io.mantisrx.server.master.scheduler.WorkerResourceStatus)

Example 5 with WorkerId

use of io.mantisrx.server.core.domain.WorkerId in project mantis by Netflix.

the class MesosSchedulerCallbackHandler method reconcileTasksKnownToUs.

/**
 * Asks Mesos to reconcile every task that the worker registry reports as running.
 *
 * <p>A {@code TASK_RUNNING} status is built for each worker-to-slave mapping returned by
 * {@code workerRegistry.getAllRunningWorkerSlaveIdMappings()} and the whole batch is submitted via
 * {@code driver.reconcileTasks}. Nothing is sent when there are no mappings. If the driver answers
 * with {@code DRIVER_ABORTED} or {@code DRIVER_STOPPED}, the process exits with status 2.
 *
 * @param driver the Mesos scheduler driver used to send the reconcile request
 */
private void reconcileTasksKnownToUs(SchedulerDriver driver) {
    final List<TaskStatus> runningStatuses = new ArrayList<>();
    for (Map.Entry<WorkerId, String> mapping : workerRegistry.getAllRunningWorkerSlaveIdMappings().entrySet()) {
        final WorkerId worker = mapping.getKey();
        final String slaveIdValue = mapping.getValue();
        if (logger.isDebugEnabled()) {
            logger.debug("reconcile running worker mapping {} -> {}", worker.getId(), slaveIdValue);
        }
        final Protos.TaskID taskId = Protos.TaskID.newBuilder().setValue(worker.getId()).build();
        final SlaveID slave = SlaveID.newBuilder().setValue(slaveIdValue).build();
        final TaskStatus runningStatus = TaskStatus.newBuilder()
                .setTaskId(taskId)
                .setState(Protos.TaskState.TASK_RUNNING)
                .setSlaveId(slave)
                .build();
        runningStatuses.add(runningStatus);
    }

    // Nothing known to be running, so there is nothing to reconcile.
    if (runningStatuses.isEmpty()) {
        return;
    }

    final Protos.Status driverStatus = driver.reconcileTasks(runningStatuses);
    numReconcileTasks.increment();
    logger.info("Sent request to reconcile " + runningStatuses.size() + " tasks, status=" + driverStatus);
    logger.info("Last offer received " + (System.currentTimeMillis() - lastOfferReceivedAt.get()) / 1000 + " secs ago");
    logger.info("Last valid offer received " + (System.currentTimeMillis() - lastValidOfferReceivedAt.get()) / 1000 + " secs ago");

    // An aborted or stopped driver is treated as fatal: terminate the JVM.
    switch (driverStatus) {
        case DRIVER_ABORTED:
        case DRIVER_STOPPED:
            logger.error("Unexpected to see Mesos driver status of " + driverStatus + " from reconcile request. Committing suicide!");
            System.exit(2);
    }
}
Also used : Protos(org.apache.mesos.Protos) ArrayList(java.util.ArrayList) TaskStatus(org.apache.mesos.Protos.TaskStatus) WorkerId(io.mantisrx.server.core.domain.WorkerId) Map(java.util.Map)

Aggregations

WorkerId (io.mantisrx.server.core.domain.WorkerId): 48, Test (org.junit.Test): 36, ActorRef (akka.actor.ActorRef): 32, TestKit (akka.testkit.javadsl.TestKit): 32, MantisScheduler (io.mantisrx.server.master.scheduler.MantisScheduler): 32, MantisJobStore (io.mantisrx.server.master.persistence.MantisJobStore): 30, InvalidJobException (io.mantisrx.runtime.command.InvalidJobException): 25, JobClusterProto (io.mantisrx.master.jobcluster.proto.JobClusterProto): 20, Matchers.anyString (org.mockito.Matchers.anyString): 18, JobClusterManagerProto (io.mantisrx.master.jobcluster.proto.JobClusterManagerProto): 16, GetJobDetailsResponse (io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.GetJobDetailsResponse): 15, JobId (io.mantisrx.server.master.domain.JobId): 15, SchedulingInfo (io.mantisrx.runtime.descriptor.SchedulingInfo): 11, JobProto (io.mantisrx.master.jobcluster.proto.JobProto): 10, IJobClusterDefinition (io.mantisrx.server.master.domain.IJobClusterDefinition): 10, JobDefinition (io.mantisrx.server.master.domain.JobDefinition): 10, IOException (java.io.IOException): 10, GetJobDetailsRequest (io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.GetJobDetailsRequest): 8, MachineDefinition (io.mantisrx.runtime.MachineDefinition): 8, StageSchedulingInfo (io.mantisrx.runtime.descriptor.StageSchedulingInfo): 6