Search in sources :

Example 1 with SubmitJobRequest

use of io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.SubmitJobRequest in project mantis by Netflix.

the class JobClusterActor method onEnforceSLARequest.

/**
 * Handles an SLA enforcement request for this job cluster.
 *
 * <p>Increments the enforcement-execution counter and looks up jobs stuck in the
 * init, accepted and terminating phases. If an SLA is configured it then either
 * submits the number of jobs reported by {@code enforceSLAMin}, or requests
 * termination of the jobs reported by {@code enforceSLAMax}. All resulting submit
 * and kill requests are sent to this actor itself.
 *
 * @param request the enforcement request; supplies the time of enforcement and an
 *                optional job definition that is passed along with (and provides the
 *                user for) any job that has to be submitted
 */
@Override
public void onEnforceSLARequest(JobClusterProto.EnforceSLARequest request) {
    if (logger.isTraceEnabled()) {
        // Each argument needs its own placeholder, otherwise the request is silently dropped from the log line.
        logger.trace("Enter onEnforceSLA for JobCluster {} with request {}", this.name, request);
    }
    numSLAEnforcementExecutions.increment();
    long now = request.timeOfEnforcement.toEpochMilli();
    // NOTE(review): the three lists below are never read in this method as shown. The lookups are
    // kept untouched in case the job manager calls have side effects - confirm before removing.
    List<JobInfo> pendingInitializationJobsPriorToCutoff = jobManager.getJobActorsStuckInInit(now, getExpirePendingInitializeDelayMs());
    List<JobInfo> jobsStuckInAcceptedList = jobManager.getJobsStuckInAccepted(now, getExpireAcceptedDelayMs());
    // NOTE(review): the terminating lookup uses the "accepted" expiry delay - confirm this is intended.
    List<JobInfo> jobsStuckInTerminatingList = jobManager.getJobsStuckInTerminating(now, getExpireAcceptedDelayMs());
    if (!slaEnforcer.hasSLA()) {
        // Nothing to enforce without an SLA.
        return;
    }
    int activeJobsCount = jobManager.activeJobsCount();
    int acceptedJobsCount = jobManager.acceptedJobsCount();
    // enforcing min
    int noOfJobsToLaunch = slaEnforcer.enforceSLAMin(activeJobsCount, acceptedJobsCount);
    if (noOfJobsToLaunch > 0) {
        logger.info("Submitting {} jobs for job name {} as active count is {} and accepted count is {}", noOfJobsToLaunch, name, activeJobsCount, acceptedJobsCount);
        // Submit as the user of the supplied job definition when there is one, else as the master user.
        String user = MANTIS_MASTER_USER;
        if (request.jobDefinitionOp.isPresent()) {
            user = request.jobDefinitionOp.get().getUser();
        }
        for (int i = 0; i < noOfJobsToLaunch; i++) {
            getSelf().tell(new SubmitJobRequest(name, user, true, request.jobDefinitionOp), getSelf());
        }
    } else {
        // enforce max.
        List<JobInfo> listOfJobs = new ArrayList<>(activeJobsCount + acceptedJobsCount);
        listOfJobs.addAll(jobManager.getActiveJobsList());
        listOfJobs.addAll(jobManager.getAcceptedJobsList());
        List<JobId> jobsToKill = slaEnforcer.enforceSLAMax(Collections.unmodifiableList(listOfJobs));
        for (JobId jobId : jobsToKill) {
            logger.info("Request termination for job {}", jobId);
            getSelf().tell(new KillJobRequest(jobId, "SLA enforcement", JobCompletedReason.Killed, MANTIS_MASTER_USER, ActorRef.noSender()), getSelf());
        }
    }
    if (logger.isTraceEnabled()) {
        logger.trace("Exit onEnforceSLA for JobCluster {}", name);
    }
}
Also used : KillJobRequest(io.mantisrx.master.jobcluster.proto.JobClusterProto.KillJobRequest) SubmitJobRequest(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.SubmitJobRequest) ArrayList(java.util.ArrayList) JobId(io.mantisrx.server.master.domain.JobId)

Example 2 with SubmitJobRequest

use of io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.SubmitJobRequest in project mantis by Netflix.

the class JobClusterTest method testJobSubmitFails.

/**
 * Verifies that a job submission is answered with SERVER_ERROR when the job store
 * throws on {@code storeNewJob}, and that no workers are stored in that case.
 */
@Test
public void testJobSubmitFails() {
    TestKit probe = new TestKit(system);
    try {
        String clusterName = "testJobSubmitFails";
        MantisScheduler schedulerMock = mock(MantisScheduler.class);
        MantisJobStore jobStoreMock = mock(MantisJobStore.class);
        final JobClusterDefinitionImpl fakeJobCluster = createFakeJobClusterDefn(clusterName);
        // Make persisting any new job fail so the submit path has to report an error.
        Mockito.doThrow(Exception.class).when(jobStoreMock).storeNewJob(any());
        ActorRef jobClusterActor = system.actorOf(props(clusterName, jobStoreMock, schedulerMock, eventPublisher));
        jobClusterActor.tell(new JobClusterProto.InitializeJobClusterRequest(fakeJobCluster, user, probe.getRef()), probe.getRef());
        JobClusterProto.InitializeJobClusterResponse createResp = probe.expectMsgClass(JobClusterProto.InitializeJobClusterResponse.class);
        assertEquals(SUCCESS, createResp.responseCode);
        final JobDefinition jobDefn = createJob(clusterName, 1, MantisJobDurationType.Transient);
        jobClusterActor.tell(new SubmitJobRequest(clusterName, "user", Optional.ofNullable(jobDefn)), probe.getRef());
        SubmitJobResponse submitResponse = probe.expectMsgClass(SubmitJobResponse.class);
        assertEquals(SERVER_ERROR, submitResponse.responseCode);
        verify(jobStoreMock, times(1)).createJobCluster(any());
        verify(jobStoreMock, times(1)).updateJobCluster(any());
        verify(jobStoreMock, times(0)).storeNewWorker(any());
        verify(jobStoreMock, times(0)).storeNewWorkers(any(), any());
    } catch (Exception e) {
        // Fail the test but keep the cause attached so the report shows what went wrong.
        throw new AssertionError("testJobSubmitFails hit an unexpected exception", e);
    }
}
Also used : JobClusterProto(io.mantisrx.master.jobcluster.proto.JobClusterProto) ActorRef(akka.actor.ActorRef) SubmitJobRequest(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.SubmitJobRequest) MantisScheduler(io.mantisrx.server.master.scheduler.MantisScheduler) TestKit(akka.testkit.javadsl.TestKit) Matchers.anyString(org.mockito.Matchers.anyString) InvalidJobException(io.mantisrx.runtime.command.InvalidJobException) MantisJobStore(io.mantisrx.server.master.persistence.MantisJobStore) SubmitJobResponse(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.SubmitJobResponse) Test(org.junit.Test)

Example 3 with SubmitJobRequest

use of io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.SubmitJobRequest in project mantis by Netflix.

the class JobClusterActor method onJobClusterUpdateArtifact.

/**
 * Handles a request to change the artifact name and version of this job cluster.
 *
 * <p>A request whose version is not unique among the cluster's existing configs is
 * answered with CLIENT_ERROR. Otherwise a new config is derived from the current one,
 * the cluster metadata is rebuilt and saved, the sender receives SUCCESS, an audit
 * event is published and, unless the request asks to skip it, a job submission is
 * sent to this actor. Any exception raised along the way is logged and answered with
 * SERVER_ERROR.
 *
 * @param artifactReq the update request carrying artifact name, version, user,
 *                    request id and the skip-submit flag
 */
@Override
public void onJobClusterUpdateArtifact(UpdateJobClusterArtifactRequest artifactReq) {
    if (logger.isTraceEnabled()) {
        logger.trace("Entering JobClusterActor:onJobClusterUpdateArtifact");
    }
    // Capture the sender up front; every reply below goes to it.
    final ActorRef replyTo = getSender();
    try {
        final boolean versionIsUnique = isVersionUnique(
                artifactReq.getVersion(),
                jobClusterMetadata.getJobClusterDefinition().getJobClusterConfigs());
        if (!versionIsUnique) {
            final String rejection = String.format("job cluster %s not updated as the version %s is not unique", name, artifactReq.getVersion());
            logger.error(rejection);
            replyTo.tell(new UpdateJobClusterArtifactResponse(artifactReq.requestId, CLIENT_ERROR, rejection), getSelf());
            return;
        }

        // Derive the new config from the current one, overriding only the artifact details.
        final JobClusterConfig refreshedConfig = new JobClusterConfig.Builder()
                .from(jobClusterMetadata.getJobClusterDefinition().getJobClusterConfig())
                .withArtifactName(artifactReq.getArtifactName())
                .withVersion(artifactReq.getVersion())
                .withUploadedAt(System.currentTimeMillis())
                .build();
        final JobClusterDefinitionImpl refreshedDefinition = new JobClusterDefinitionImpl.Builder()
                .from(jobClusterMetadata.getJobClusterDefinition())
                .withJobClusterConfig(refreshedConfig)
                .build();
        final IJobClusterMetadata refreshedMetadata = new JobClusterMetadataImpl.Builder()
                .withIsDisabled(jobClusterMetadata.isDisabled())
                .withLastJobCount(jobClusterMetadata.getLastJobCount())
                .withJobClusterDefinition(refreshedDefinition)
                .build();
        updateAndSaveJobCluster(refreshedMetadata);

        replyTo.tell(new UpdateJobClusterArtifactResponse(artifactReq.requestId, SUCCESS, name + " artifact updated"), getSelf());

        // The cluster name is read from jobClusterMetadata only after the save above.
        final LifecycleEventsProto.AuditEvent auditEvent = new LifecycleEventsProto.AuditEvent(
                LifecycleEventsProto.AuditEvent.AuditEventType.JOB_CLUSTER_UPDATE,
                jobClusterMetadata.getJobClusterDefinition().getName(),
                name + " artifact update");
        eventPublisher.publishAuditEvent(auditEvent);

        final boolean submitRequested = !artifactReq.isSkipSubmit();
        if (submitRequested) {
            getSelf().tell(new SubmitJobRequest(name, artifactReq.getUser(), empty()), getSelf());
        }
    } catch (Exception failure) {
        logger.error("job cluster not updated ", failure);
        replyTo.tell(new UpdateJobClusterArtifactResponse(artifactReq.requestId, SERVER_ERROR, name + " Job cluster artifact updation failed " + failure.getMessage()), getSelf());
    }
    if (logger.isTraceEnabled()) {
        logger.trace("Exit JobClusterActor:onJobClusterUpdateArtifact");
    }
}
Also used : ActorRef(akka.actor.ActorRef) JobClusterConfig(io.mantisrx.server.master.domain.JobClusterConfig) JobClusterDefinitionImpl(io.mantisrx.server.master.domain.JobClusterDefinitionImpl) SubmitJobRequest(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.SubmitJobRequest) UpdateJobClusterArtifactResponse(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.UpdateJobClusterArtifactResponse) TriggerNotFoundException(com.netflix.fenzo.triggers.exceptions.TriggerNotFoundException) SchedulerException(com.netflix.fenzo.triggers.exceptions.SchedulerException) JobClusterAlreadyExistsException(io.mantisrx.server.master.persistence.exceptions.JobClusterAlreadyExistsException) LifecycleEventsProto(io.mantisrx.master.events.LifecycleEventsProto)

Example 4 with SubmitJobRequest

use of io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.SubmitJobRequest in project mantis by Netflix.

the class JobClusterTest method testJobSubmitWithUnique.

/**
 * Verifies that submitting the same job definition a second time, while the first
 * job is in the Launched state, returns the id of the existing job and that only
 * one job is stored.
 */
@Test
public void testJobSubmitWithUnique() {
    TestKit probe = new TestKit(system);
    String clusterName = "testJobSubmitWithUnique";
    MantisScheduler schedulerMock = mock(MantisScheduler.class);
    MantisJobStore jobStoreMock = mock(MantisJobStore.class);
    final JobClusterDefinitionImpl fakeJobCluster = createFakeJobClusterDefn(clusterName);
    ActorRef jobClusterActor = system.actorOf(props(clusterName, jobStoreMock, schedulerMock, eventPublisher));
    jobClusterActor.tell(new JobClusterProto.InitializeJobClusterRequest(fakeJobCluster, user, probe.getRef()), probe.getRef());
    JobClusterProto.InitializeJobClusterResponse createResp = probe.expectMsgClass(JobClusterProto.InitializeJobClusterResponse.class);
    assertEquals(SUCCESS, createResp.responseCode);
    try {
        final JobDefinition jobDefn = createJob(clusterName, 1, MantisJobDurationType.Transient, "mytype");
        String jobId = clusterName + "-1";
        JobTestHelper.submitJobAndVerifySuccess(probe, clusterName, jobClusterActor, jobDefn, jobId);
        JobTestHelper.getJobDetailsAndVerify(probe, jobClusterActor, jobId, SUCCESS, JobState.Accepted);
        JobTestHelper.sendLaunchedInitiatedStartedEventsToWorker(probe, jobClusterActor, jobId, 1, new WorkerId(jobId, 0, 1));
        JobTestHelper.getJobDetailsAndVerify(probe, jobClusterActor, jobId, SUCCESS, JobState.Launched);
        // Submit the same definition again.
        jobClusterActor.tell(new SubmitJobRequest(clusterName, "user", Optional.ofNullable(jobDefn)), probe.getRef());
        SubmitJobResponse submitResponse = probe.expectMsgClass(SubmitJobResponse.class);
        // Get the same job id back
        assertTrue(submitResponse.getJobId().isPresent());
        assertEquals(jobId, submitResponse.getJobId().get().getId());
        JobTestHelper.killJobAndVerify(probe, clusterName, new JobId(clusterName, 1), jobClusterActor);
        verify(jobStoreMock, times(1)).createJobCluster(any());
        verify(jobStoreMock, times(1)).updateJobCluster(any());
        verify(jobStoreMock, times(1)).storeNewJob(any());
    } catch (Exception e) {
        // Fail the test with the cause attached instead of printing it to stderr.
        throw new AssertionError("testJobSubmitWithUnique hit an unexpected exception", e);
    }
}
Also used : JobClusterProto(io.mantisrx.master.jobcluster.proto.JobClusterProto) ActorRef(akka.actor.ActorRef) SubmitJobRequest(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.SubmitJobRequest) MantisScheduler(io.mantisrx.server.master.scheduler.MantisScheduler) TestKit(akka.testkit.javadsl.TestKit) Matchers.anyString(org.mockito.Matchers.anyString) WorkerId(io.mantisrx.server.core.domain.WorkerId) InvalidJobException(io.mantisrx.runtime.command.InvalidJobException) MantisJobStore(io.mantisrx.server.master.persistence.MantisJobStore) SubmitJobResponse(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.SubmitJobResponse) Test(org.junit.Test)

Example 5 with SubmitJobRequest

use of io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.SubmitJobRequest in project mantis by Netflix.

the class JobClusterTest method testUpdateJobClusterArtifactWithAutoSubmit.

/**
 * Verifies that updating the cluster artifact with skip-submit set to false launches
 * a second job that carries the new artifact name and the same per-stage instance
 * count as the previously submitted job.
 */
@Test
public void testUpdateJobClusterArtifactWithAutoSubmit() {
    TestKit probe = new TestKit(system);
    try {
        String clusterName = "testUpdateJobClusterArtifactWithAutoSubmit";
        MantisScheduler schedulerMock = mock(MantisScheduler.class);
        MantisJobStore jobStoreMock = mock(MantisJobStore.class);
        // when running concurrently with testGetJobDetailsForArchivedJob the following mock return is needed to avoid null pointer exception.
        when(jobStoreMock.getArchivedJob(anyString())).thenReturn(empty());
        SLA sla = new SLA(1, 1, null, null);
        final JobClusterDefinitionImpl fakeJobCluster = createFakeJobClusterDefn(clusterName, Lists.newArrayList(), sla);
        ActorRef jobClusterActor = system.actorOf(props(clusterName, jobStoreMock, schedulerMock, eventPublisher));
        jobClusterActor.tell(new JobClusterProto.InitializeJobClusterRequest(fakeJobCluster, user, probe.getRef()), probe.getRef());
        JobClusterProto.InitializeJobClusterResponse createResp = probe.expectMsgClass(JobClusterProto.InitializeJobClusterResponse.class);
        assertEquals(SUCCESS, createResp.responseCode);
        // submit job with different scheduling info instance count compared to cluster default one.
        final int job1InstanceCnt = 3;
        final JobDefinition jobDefn = createJob(clusterName, MantisJobDurationType.Transient, new SchedulingInfo.Builder().numberOfStages(1).addStage(fakeJobCluster.getJobClusterConfig().getSchedulingInfo().forStage(1).toBuilder().numberOfInstances(job1InstanceCnt).build()).build());
        String jobId = clusterName + "-1";
        jobClusterActor.tell(new SubmitJobRequest(clusterName, "user", Optional.ofNullable(jobDefn)), probe.getRef());
        // Wait for the submit response; its content is not inspected here.
        probe.expectMsgClass(SubmitJobResponse.class);
        JobTestHelper.sendLaunchedInitiatedStartedEventsToWorker(probe, jobClusterActor, jobId, 1, new WorkerId(clusterName, jobId, 0, 1));
        JobTestHelper.getJobDetailsAndVerify(probe, jobClusterActor, jobId, BaseResponse.ResponseCode.SUCCESS, JobState.Accepted);
        // Update artifact with skip submit = false
        String artifact = "newartifact.zip";
        String version = "0.0.2";
        jobClusterActor.tell(new UpdateJobClusterArtifactRequest(clusterName, artifact, version, false, user), probe.getRef());
        // Wait for the update response; its content is not inspected here.
        probe.expectMsgClass(UpdateJobClusterArtifactResponse.class);
        // ensure new job was launched
        String jobId2 = clusterName + "-2";
        assertTrue(JobTestHelper.verifyJobStatusWithPolling(probe, jobClusterActor, jobId2, JobState.Accepted));
        // send it worker events to move it to started state
        JobTestHelper.sendLaunchedInitiatedStartedEventsToWorker(probe, jobClusterActor, jobId2, 1, new WorkerId(clusterName, jobId2, 0, 1));
        jobClusterActor.tell(new GetJobDetailsRequest("nj", JobId.fromId(jobId2).get()), probe.getRef());
        GetJobDetailsResponse detailsResp = probe.expectMsgClass(Duration.ofSeconds(5), GetJobDetailsResponse.class);
        assertEquals(JobState.Accepted, detailsResp.getJobMetadata().get().getState());
        assertEquals(artifact, detailsResp.getJobMetadata().get().getArtifactName());
        // verify newly launched job inherited instance count from previous job instance.
        // The flag guards against the loop body never running (no stages at all).
        AtomicBoolean hasStage = new AtomicBoolean(false);
        detailsResp.getJobMetadata().get().getSchedulingInfo().getStages().forEach((stageId, stageInfo) -> {
            hasStage.set(true);
            assertEquals(job1InstanceCnt, detailsResp.getJobMetadata().get().getSchedulingInfo().forStage(stageId).getNumberOfInstances());
        });
        assertTrue(hasStage.get());
        assertTrue(JobTestHelper.verifyJobStatusWithPolling(probe, jobClusterActor, jobId2, JobState.Accepted));
    } catch (InvalidJobException e) {
        // An invalid job aborts the scenario before its assertions run, so it must fail
        // the test rather than let it pass silently.
        throw new AssertionError("testUpdateJobClusterArtifactWithAutoSubmit hit an unexpected InvalidJobException", e);
    }
}
Also used : JobClusterProto(io.mantisrx.master.jobcluster.proto.JobClusterProto) ActorRef(akka.actor.ActorRef) SubmitJobRequest(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.SubmitJobRequest) GetJobDetailsRequest(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.GetJobDetailsRequest) MantisScheduler(io.mantisrx.server.master.scheduler.MantisScheduler) UpdateJobClusterArtifactResponse(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.UpdateJobClusterArtifactResponse) TestKit(akka.testkit.javadsl.TestKit) Matchers.anyString(org.mockito.Matchers.anyString) WorkerId(io.mantisrx.server.core.domain.WorkerId) GetJobDetailsResponse(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.GetJobDetailsResponse) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) MantisJobStore(io.mantisrx.server.master.persistence.MantisJobStore) UpdateJobClusterArtifactRequest(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.UpdateJobClusterArtifactRequest) SubmitJobResponse(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.SubmitJobResponse) InvalidJobException(io.mantisrx.runtime.command.InvalidJobException) Test(org.junit.Test)

Aggregations

SubmitJobRequest (io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.SubmitJobRequest)6 ActorRef (akka.actor.ActorRef)5 TestKit (akka.testkit.javadsl.TestKit)4 SubmitJobResponse (io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.SubmitJobResponse)4 JobClusterProto (io.mantisrx.master.jobcluster.proto.JobClusterProto)4 InvalidJobException (io.mantisrx.runtime.command.InvalidJobException)4 MantisJobStore (io.mantisrx.server.master.persistence.MantisJobStore)4 MantisScheduler (io.mantisrx.server.master.scheduler.MantisScheduler)4 Test (org.junit.Test)4 Matchers.anyString (org.mockito.Matchers.anyString)4 WorkerId (io.mantisrx.server.core.domain.WorkerId)3 UpdateJobClusterArtifactResponse (io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.UpdateJobClusterArtifactResponse)2 SchedulerException (com.netflix.fenzo.triggers.exceptions.SchedulerException)1 TriggerNotFoundException (com.netflix.fenzo.triggers.exceptions.TriggerNotFoundException)1 LifecycleEventsProto (io.mantisrx.master.events.LifecycleEventsProto)1 GetJobDetailsRequest (io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.GetJobDetailsRequest)1 GetJobDetailsResponse (io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.GetJobDetailsResponse)1 ScaleStageRequest (io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.ScaleStageRequest)1 ScaleStageResponse (io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.ScaleStageResponse)1 UpdateJobClusterArtifactRequest (io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.UpdateJobClusterArtifactRequest)1