Search in sources :

Example 1 with WorkerMigrationConfig

use of io.mantisrx.runtime.WorkerMigrationConfig in project mantis by Netflix.

the class JobClusterManagerTest method testBootStrapJobClustersAndJobs.

@Test
public void testBootStrapJobClustersAndJobs() {
    TestKit probe = new TestKit(system);
    JobTestHelper.deleteAllFiles();
    MantisJobStore jobStore = new MantisJobStore(new MantisStorageProviderAdapter(new io.mantisrx.server.master.store.SimpleCachedFileStorageProvider(), eventPublisher));
    MantisJobStore jobStoreSpied = Mockito.spy(jobStore);
    MantisScheduler schedulerMock = mock(MantisScheduler.class);
    ActorRef jobClusterManagerActor = system.actorOf(JobClustersManagerActor.props(jobStoreSpied, eventPublisher));
    jobClusterManagerActor.tell(new JobClusterManagerProto.JobClustersManagerInitialize(schedulerMock, false), probe.getRef());
    JobClustersManagerInitializeResponse iResponse = probe.expectMsgClass(Duration.of(10, ChronoUnit.MINUTES), JobClustersManagerInitializeResponse.class);
    List<String> clusterNames = Lists.newArrayList("testBootStrapJobClustersAndJobs1", "testBootStrapJobClustersAndJobs2", "testBootStrapJobClustersAndJobs3");
    String clusterWithNoJob = "testBootStrapJobClusterWithNoJob";
    createJobClusterAndAssert(jobClusterManagerActor, clusterWithNoJob);
    WorkerMigrationConfig migrationConfig = new WorkerMigrationConfig(MigrationStrategyEnum.PERCENTAGE, "{\"percentToMove\":60, \"intervalMs\":30000}");
    // Create 3 clusters and submit 1 job each
    for (String cluster : clusterNames) {
        createJobClusterAndAssert(jobClusterManagerActor, cluster, migrationConfig);
        submitJobAndAssert(jobClusterManagerActor, cluster);
        if (cluster.equals("testBootStrapJobClustersAndJobs1")) {
            // send worker events for job 1 so it goes to started state
            String jobId = "testBootStrapJobClustersAndJobs1-1";
            WorkerId workerId = new WorkerId(jobId, 0, 1);
            WorkerEvent launchedEvent = new WorkerLaunched(workerId, 0, "host1", "vm1", empty(), new WorkerPorts(Lists.newArrayList(8000, 9000, 9010, 9020, 9030)));
            jobClusterManagerActor.tell(launchedEvent, probe.getRef());
            WorkerEvent startInitEvent = new WorkerStatus(new Status(workerId.getJobId(), 1, workerId.getWorkerIndex(), workerId.getWorkerNum(), TYPE.INFO, "test START_INIT", MantisJobState.StartInitiated));
            jobClusterManagerActor.tell(startInitEvent, probe.getRef());
            WorkerEvent heartBeat = new WorkerHeartbeat(new Status(jobId, 1, workerId.getWorkerIndex(), workerId.getWorkerNum(), TYPE.HEARTBEAT, "", MantisJobState.Started));
            jobClusterManagerActor.tell(heartBeat, probe.getRef());
            // get Job status
            jobClusterManagerActor.tell(new GetJobDetailsRequest("user", JobId.fromId(jobId).get()), probe.getRef());
            GetJobDetailsResponse resp2 = probe.expectMsgClass(GetJobDetailsResponse.class);
            // Ensure its launched
            assertEquals(SUCCESS, resp2.responseCode);
            assertEquals(JobState.Launched, resp2.getJobMetadata().get().getState());
        }
    }
    // kill 1 of the jobs to test archive path
    JobClusterManagerProto.KillJobRequest killRequest = new JobClusterManagerProto.KillJobRequest("testBootStrapJobClustersAndJobs2-1", JobCompletedReason.Killed.toString(), "njoshi");
    jobClusterManagerActor.tell(killRequest, probe.getRef());
    JobClusterManagerProto.KillJobResponse killJobResponse = probe.expectMsgClass(JobClusterManagerProto.KillJobResponse.class);
    assertEquals(SUCCESS, killJobResponse.responseCode);
    JobTestHelper.sendWorkerTerminatedEvent(probe, jobClusterManagerActor, "testBootStrapJobClustersAndJobs2-1", new WorkerId("testBootStrapJobClustersAndJobs2-1", 0, 1));
    try {
        Thread.sleep(500);
    } catch (InterruptedException e) {
        e.printStackTrace();
    }
    // Stop job cluster Manager Actor
    system.stop(jobClusterManagerActor);
    // create new instance
    jobClusterManagerActor = system.actorOf(JobClustersManagerActor.props(jobStoreSpied, eventPublisher));
    // initialize it
    jobClusterManagerActor.tell(new JobClusterManagerProto.JobClustersManagerInitialize(schedulerMock, true), probe.getRef());
    JobClustersManagerInitializeResponse initializeResponse = probe.expectMsgClass(JobClustersManagerInitializeResponse.class);
    // probe.expectMsgClass(Duration.of(10, ChronoUnit.MINUTES),JobClusterManagerProto.JobClustersManagerInitializeResponse.class);
    // probe.expectMsgClass(JobClusterManagerProto.JobClustersManagerInitializeResponse.class);
    assertEquals(SUCCESS, initializeResponse.responseCode);
    // Get Cluster Config
    jobClusterManagerActor.tell(new GetJobClusterRequest("testBootStrapJobClustersAndJobs1"), probe.getRef());
    GetJobClusterResponse clusterResponse = probe.expectMsgClass(GetJobClusterResponse.class);
    assertEquals(SUCCESS, clusterResponse.responseCode);
    assertTrue(clusterResponse.getJobCluster().isPresent());
    WorkerMigrationConfig mConfig = clusterResponse.getJobCluster().get().getMigrationConfig();
    assertEquals(migrationConfig.getStrategy(), mConfig.getStrategy());
    assertEquals(migrationConfig.getConfigString(), migrationConfig.getConfigString());
    // get Job status
    jobClusterManagerActor.tell(new GetJobDetailsRequest("user", JobId.fromId("testBootStrapJobClustersAndJobs1-1").get()), probe.getRef());
    GetJobDetailsResponse resp2 = probe.expectMsgClass(GetJobDetailsResponse.class);
    // Ensure its launched
    System.out.println("Resp2 -> " + resp2.message);
    assertEquals(SUCCESS, resp2.responseCode);
    assertEquals(JobState.Launched, resp2.getJobMetadata().get().getState());
    // 1 jobs should be in completed state
    jobClusterManagerActor.tell(new GetJobDetailsRequest("user", JobId.fromId("testBootStrapJobClustersAndJobs2-1").get()), probe.getRef());
    resp2 = probe.expectMsgClass(Duration.of(10, ChronoUnit.MINUTES), GetJobDetailsResponse.class);
    // Ensure its completed
    assertEquals(SUCCESS, resp2.responseCode);
    assertEquals(JobState.Completed, resp2.getJobMetadata().get().getState());
    jobClusterManagerActor.tell(new GetJobDetailsRequest("user", JobId.fromId("testBootStrapJobClustersAndJobs3-1").get()), probe.getRef());
    resp2 = probe.expectMsgClass(Duration.of(10, ChronoUnit.MINUTES), GetJobDetailsResponse.class);
    // Ensure its Accepted
    assertEquals(SUCCESS, resp2.responseCode);
    assertEquals(JobState.Accepted, resp2.getJobMetadata().get().getState());
    try {
        Optional<JobWorker> workerByIndex = resp2.getJobMetadata().get().getWorkerByIndex(1, 0);
        assertTrue(workerByIndex.isPresent());
        Optional<IMantisStageMetadata> stageMetadata = resp2.getJobMetadata().get().getStageMetadata(1);
        assertTrue(stageMetadata.isPresent());
        JobWorker workerByIndex1 = stageMetadata.get().getWorkerByIndex(0);
        System.out.println("Got worker by index : " + workerByIndex1);
        Optional<JobWorker> worker = resp2.getJobMetadata().get().getWorkerByNumber(1);
        assertTrue(worker.isPresent());
    } catch (io.mantisrx.server.master.persistence.exceptions.InvalidJobException e) {
        e.printStackTrace();
    }
    jobClusterManagerActor.tell(new GetLastSubmittedJobIdStreamRequest("testBootStrapJobClustersAndJobs1"), probe.getRef());
    GetLastSubmittedJobIdStreamResponse lastSubmittedJobIdStreamResponse = probe.expectMsgClass(Duration.of(10, ChronoUnit.MINUTES), GetLastSubmittedJobIdStreamResponse.class);
    lastSubmittedJobIdStreamResponse.getjobIdBehaviorSubject().get().take(1).toBlocking().subscribe((jId) -> {
        assertEquals(new JobId("testBootStrapJobClustersAndJobs1", 1), jId);
    });
    jobClusterManagerActor.tell(new GetJobClusterRequest(clusterWithNoJob), probe.getRef());
    GetJobClusterResponse jobClusterResponse = probe.expectMsgClass(Duration.of(10, ChronoUnit.MINUTES), GetJobClusterResponse.class);
    assertEquals(SUCCESS, jobClusterResponse.responseCode);
    assertTrue(jobClusterResponse.getJobCluster().isPresent());
    assertEquals(clusterWithNoJob, jobClusterResponse.getJobCluster().get().getName());
    // 1 running worker
    verify(schedulerMock, timeout(100_1000).times(1)).initializeRunningWorker(any(), any());
    // 2 worker schedule requests
    verify(schedulerMock, timeout(100_000).times(4)).scheduleWorker(any());
    try {
        Mockito.verify(jobStoreSpied).loadAllArchivedJobsAsync();
        Mockito.verify(jobStoreSpied).loadAllActiveJobs();
        Mockito.verify(jobStoreSpied).loadAllCompletedJobs();
        Mockito.verify(jobStoreSpied).archiveWorker(any());
        Mockito.verify(jobStoreSpied).archiveJob(any());
    } catch (IOException e) {
        e.printStackTrace();
        fail();
    }
}
Also used : GetJobClusterResponse(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.GetJobClusterResponse) GetLastSubmittedJobIdStreamResponse(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.GetLastSubmittedJobIdStreamResponse) ActorRef(akka.actor.ActorRef) GetJobDetailsRequest(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.GetJobDetailsRequest) MantisScheduler(io.mantisrx.server.master.scheduler.MantisScheduler) WorkerMigrationConfig(io.mantisrx.runtime.WorkerMigrationConfig) GetJobDetailsResponse(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.GetJobDetailsResponse) WorkerHeartbeat(io.mantisrx.master.jobcluster.job.worker.WorkerHeartbeat) WorkerEvent(io.mantisrx.server.master.scheduler.WorkerEvent) WorkerStatus(io.mantisrx.master.jobcluster.job.worker.WorkerStatus) GetJobClusterRequest(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.GetJobClusterRequest) WorkerLaunched(io.mantisrx.server.master.scheduler.WorkerLaunched) JobId(io.mantisrx.server.master.domain.JobId) JobClusterManagerProto(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto) JobClustersManagerInitializeResponse(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.JobClustersManagerInitializeResponse) Status(io.mantisrx.server.core.Status) WorkerStatus(io.mantisrx.master.jobcluster.job.worker.WorkerStatus) TestKit(akka.testkit.javadsl.TestKit) IOException(java.io.IOException) WorkerId(io.mantisrx.server.core.domain.WorkerId) JobWorker(io.mantisrx.master.jobcluster.job.worker.JobWorker) MantisStorageProviderAdapter(io.mantisrx.server.master.persistence.MantisStorageProviderAdapter) GetLastSubmittedJobIdStreamRequest(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.GetLastSubmittedJobIdStreamRequest) MantisJobStore(io.mantisrx.server.master.persistence.MantisJobStore) WorkerPorts(io.mantisrx.common.WorkerPorts) Test(org.junit.Test)

Example 2 with WorkerMigrationConfig

use of io.mantisrx.runtime.WorkerMigrationConfig in project mantis by Netflix.

the class JobClusterManagerTest method testJobClusterWorkerMigrationUpdate.

@Test
public void testJobClusterWorkerMigrationUpdate() throws MalformedURLException {
    TestKit probe = new TestKit(system);
    String clusterName = "testJobClusterWorkerMigrationUpdate";
    List<Label> labels = Lists.newLinkedList();
    final JobClusterDefinitionImpl fakeJobCluster = createFakeJobClusterDefn(clusterName, labels);
    jobClusterManagerActor.tell(new JobClusterManagerProto.CreateJobClusterRequest(fakeJobCluster, "user"), probe.getRef());
    JobClusterManagerProto.CreateJobClusterResponse createResp = probe.expectMsgClass(JobClusterManagerProto.CreateJobClusterResponse.class);
    assertEquals(SUCCESS_CREATED, createResp.responseCode);
    UpdateJobClusterWorkerMigrationStrategyRequest req = new JobClusterManagerProto.UpdateJobClusterWorkerMigrationStrategyRequest(clusterName, new WorkerMigrationConfig(MigrationStrategyEnum.ONE_WORKER, "{}"), clusterName);
    jobClusterManagerActor.tell(req, probe.getRef());
    JobClusterManagerProto.UpdateJobClusterWorkerMigrationStrategyResponse updateResp = probe.expectMsgClass(JobClusterManagerProto.UpdateJobClusterWorkerMigrationStrategyResponse.class);
    assertEquals(SUCCESS, updateResp.responseCode);
    jobClusterManagerActor.tell(new GetJobClusterRequest(clusterName), probe.getRef());
    GetJobClusterResponse getResp = probe.expectMsgClass(GetJobClusterResponse.class);
    assertEquals(SUCCESS, getResp.responseCode);
    assertEquals(MigrationStrategyEnum.ONE_WORKER, getResp.getJobCluster().get().getMigrationConfig().getStrategy());
}
Also used : GetJobClusterResponse(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.GetJobClusterResponse) Label(io.mantisrx.common.Label) JobClusterDefinitionImpl(io.mantisrx.server.master.domain.JobClusterDefinitionImpl) WorkerMigrationConfig(io.mantisrx.runtime.WorkerMigrationConfig) TestKit(akka.testkit.javadsl.TestKit) UpdateJobClusterWorkerMigrationStrategyRequest(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.UpdateJobClusterWorkerMigrationStrategyRequest) GetJobClusterRequest(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.GetJobClusterRequest) JobClusterManagerProto(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto) Test(org.junit.Test)

Example 3 with WorkerMigrationConfig

use of io.mantisrx.runtime.WorkerMigrationConfig in project mantis by Netflix.

the class JobClusterTest method testJobClusterMigrationConfigUpdate.

@Test
public void testJobClusterMigrationConfigUpdate() throws Exception {
    TestKit probe = new TestKit(system);
    String clusterName = "testJobClusterMigrationConfigUpdate";
    MantisScheduler schedulerMock = mock(MantisScheduler.class);
    MantisJobStore jobStoreMock = mock(MantisJobStore.class);
    final JobClusterDefinitionImpl fakeJobCluster = createFakeJobClusterDefn(clusterName);
    ActorRef jobClusterActor = system.actorOf(props(clusterName, jobStoreMock, schedulerMock, eventPublisher));
    jobClusterActor.tell(new JobClusterProto.InitializeJobClusterRequest(fakeJobCluster, user, probe.getRef()), probe.getRef());
    JobClusterProto.InitializeJobClusterResponse createResp = probe.expectMsgClass(JobClusterProto.InitializeJobClusterResponse.class);
    assertEquals(SUCCESS, createResp.responseCode);
    WorkerMigrationConfig newConfig = new WorkerMigrationConfig(MigrationStrategyEnum.ONE_WORKER, "{'name':'value'}");
    UpdateJobClusterWorkerMigrationStrategyRequest updateMigrationConfigReq = new UpdateJobClusterWorkerMigrationStrategyRequest(clusterName, newConfig, "user");
    jobClusterActor.tell(updateMigrationConfigReq, probe.getRef());
    UpdateJobClusterWorkerMigrationStrategyResponse resp = probe.expectMsgClass(UpdateJobClusterWorkerMigrationStrategyResponse.class);
    assertEquals(SUCCESS, resp.responseCode);
    assertEquals(jobClusterActor, probe.getLastSender());
    jobClusterActor.tell(new GetJobClusterRequest(clusterName), probe.getRef());
    GetJobClusterResponse resp3 = probe.expectMsgClass(GetJobClusterResponse.class);
    assertEquals(SUCCESS, resp3.responseCode);
    assertTrue(resp3.getJobCluster() != null);
    System.out.println("Job cluster " + resp3.getJobCluster());
    assertEquals(clusterName, resp3.getJobCluster().get().getName());
    System.out.println("Updated job cluster " + resp3.getJobCluster());
    assertEquals(MigrationStrategyEnum.ONE_WORKER, resp3.getJobCluster().get().getMigrationConfig().getStrategy());
    verify(jobStoreMock, times(1)).updateJobCluster(any());
    verify(jobStoreMock, times(1)).createJobCluster(any());
}
Also used : JobClusterProto(io.mantisrx.master.jobcluster.proto.JobClusterProto) GetJobClusterResponse(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.GetJobClusterResponse) UpdateJobClusterWorkerMigrationStrategyResponse(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.UpdateJobClusterWorkerMigrationStrategyResponse) ActorRef(akka.actor.ActorRef) MantisScheduler(io.mantisrx.server.master.scheduler.MantisScheduler) WorkerMigrationConfig(io.mantisrx.runtime.WorkerMigrationConfig) TestKit(akka.testkit.javadsl.TestKit) Matchers.anyString(org.mockito.Matchers.anyString) MantisJobStore(io.mantisrx.server.master.persistence.MantisJobStore) UpdateJobClusterWorkerMigrationStrategyRequest(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.UpdateJobClusterWorkerMigrationStrategyRequest) GetJobClusterRequest(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.GetJobClusterRequest) Test(org.junit.Test)

Example 4 with WorkerMigrationConfig

use of io.mantisrx.runtime.WorkerMigrationConfig in project mantis by Netflix.

the class JobClusterManagerTest method testBootstrapJobClusterAndJobsWithCorruptedWorkerPorts.

/**
 * Case for a master leader re-election when a new master re-hydrates corrupted job worker metadata.
 */
@Test
public void testBootstrapJobClusterAndJobsWithCorruptedWorkerPorts() throws IOException, io.mantisrx.server.master.persistence.exceptions.InvalidJobException {
    TestKit probe = new TestKit(system);
    JobTestHelper.deleteAllFiles();
    MantisJobStore jobStore = new MantisJobStore(new MantisStorageProviderAdapter(new io.mantisrx.server.master.store.SimpleCachedFileStorageProvider(), eventPublisher));
    MantisJobStore jobStoreSpied = Mockito.spy(jobStore);
    MantisScheduler schedulerMock = mock(MantisScheduler.class);
    ActorRef jobClusterManagerActor = system.actorOf(JobClustersManagerActor.props(jobStoreSpied, eventPublisher));
    jobClusterManagerActor.tell(new JobClusterManagerProto.JobClustersManagerInitialize(schedulerMock, false), probe.getRef());
    probe.expectMsgClass(Duration.of(10, ChronoUnit.MINUTES), JobClustersManagerInitializeResponse.class);
    String jobClusterName = "testBootStrapJobClustersAndJobs1";
    WorkerMigrationConfig migrationConfig = new WorkerMigrationConfig(MigrationStrategyEnum.PERCENTAGE, "{\"percentToMove\":60, \"intervalMs\":30000}");
    createJobClusterAndAssert(jobClusterManagerActor, jobClusterName, migrationConfig);
    submitJobAndAssert(jobClusterManagerActor, jobClusterName);
    String jobId = "testBootStrapJobClustersAndJobs1-1";
    WorkerId workerId = new WorkerId(jobId, 0, 1);
    WorkerEvent launchedEvent = new WorkerLaunched(workerId, 0, "host1", "vm1", empty(), new WorkerPorts(Lists.newArrayList(8000, 9000, 9010, 9020, 9030)));
    jobClusterManagerActor.tell(launchedEvent, probe.getRef());
    WorkerEvent startInitEvent = new WorkerStatus(new Status(workerId.getJobId(), 1, workerId.getWorkerIndex(), workerId.getWorkerNum(), TYPE.INFO, "test START_INIT", MantisJobState.StartInitiated));
    jobClusterManagerActor.tell(startInitEvent, probe.getRef());
    WorkerEvent heartBeat = new WorkerHeartbeat(new Status(jobId, 1, workerId.getWorkerIndex(), workerId.getWorkerNum(), TYPE.HEARTBEAT, "", MantisJobState.Started));
    jobClusterManagerActor.tell(heartBeat, probe.getRef());
    // get Job status
    jobClusterManagerActor.tell(new GetJobDetailsRequest("user", JobId.fromId(jobId).get()), probe.getRef());
    GetJobDetailsResponse resp2 = probe.expectMsgClass(GetJobDetailsResponse.class);
    // Ensure its launched
    assertEquals(SUCCESS, resp2.responseCode);
    JobWorker worker = new JobWorker.Builder().withWorkerIndex(0).withWorkerNumber(1).withJobId(jobId).withStageNum(1).withNumberOfPorts(5).withWorkerPorts(null).withState(WorkerState.Started).withLifecycleEventsPublisher(eventPublisher).build();
    jobStoreSpied.updateWorker(worker.getMetadata());
    // Stop job cluster Manager Actor
    system.stop(jobClusterManagerActor);
    // create new instance
    jobClusterManagerActor = system.actorOf(JobClustersManagerActor.props(jobStoreSpied, eventPublisher));
    // initialize it
    jobClusterManagerActor.tell(new JobClusterManagerProto.JobClustersManagerInitialize(schedulerMock, true), probe.getRef());
    JobClustersManagerInitializeResponse initializeResponse = probe.expectMsgClass(JobClustersManagerInitializeResponse.class);
    assertEquals(SUCCESS, initializeResponse.responseCode);
    WorkerId newWorkerId = new WorkerId(jobId, 0, 11);
    launchedEvent = new WorkerLaunched(newWorkerId, 0, "host1", "vm1", empty(), new WorkerPorts(Lists.newArrayList(8000, 9000, 9010, 9020, 9030)));
    jobClusterManagerActor.tell(launchedEvent, probe.getRef());
    // Get Cluster Config
    jobClusterManagerActor.tell(new GetJobClusterRequest("testBootStrapJobClustersAndJobs1"), probe.getRef());
    GetJobClusterResponse clusterResponse = probe.expectMsgClass(GetJobClusterResponse.class);
    assertEquals(SUCCESS, clusterResponse.responseCode);
    assertTrue(clusterResponse.getJobCluster().isPresent());
    WorkerMigrationConfig mConfig = clusterResponse.getJobCluster().get().getMigrationConfig();
    assertEquals(migrationConfig.getStrategy(), mConfig.getStrategy());
    assertEquals(migrationConfig.getConfigString(), migrationConfig.getConfigString());
    // get Job status
    jobClusterManagerActor.tell(new GetJobDetailsRequest("user", JobId.fromId("testBootStrapJobClustersAndJobs1-1").get()), probe.getRef());
    resp2 = probe.expectMsgClass(GetJobDetailsResponse.class);
    // Ensure its launched
    assertEquals(SUCCESS, resp2.responseCode);
    assertEquals(JobState.Launched, resp2.getJobMetadata().get().getState());
    IMantisWorkerMetadata mantisWorkerMetadata = resp2.getJobMetadata().get().getWorkerByIndex(1, 0).get().getMetadata();
    assertNotNull(mantisWorkerMetadata.getWorkerPorts());
    assertEquals(11, mantisWorkerMetadata.getWorkerNumber());
    assertEquals(1, mantisWorkerMetadata.getTotalResubmitCount());
    jobClusterManagerActor.tell(new GetLastSubmittedJobIdStreamRequest("testBootStrapJobClustersAndJobs1"), probe.getRef());
    GetLastSubmittedJobIdStreamResponse lastSubmittedJobIdStreamResponse = probe.expectMsgClass(Duration.of(10, ChronoUnit.MINUTES), GetLastSubmittedJobIdStreamResponse.class);
    lastSubmittedJobIdStreamResponse.getjobIdBehaviorSubject().get().take(1).toBlocking().subscribe((jId) -> {
        assertEquals(new JobId("testBootStrapJobClustersAndJobs1", 1), jId);
    });
    // Two schedules: one for the initial success, one for a resubmit from corrupted worker ports.
    verify(schedulerMock, times(2)).scheduleWorker(any());
    // One unschedule from corrupted worker ID 1 (before the resubmit).
    verify(schedulerMock, times(1)).unscheduleAndTerminateWorker(eq(workerId), any());
    try {
        Mockito.verify(jobStoreSpied).loadAllArchivedJobsAsync();
        Mockito.verify(jobStoreSpied).loadAllActiveJobs();
        Mockito.verify(jobStoreSpied).loadAllCompletedJobs();
        Mockito.verify(jobStoreSpied).archiveWorker(any());
    } catch (IOException e) {
        e.printStackTrace();
        fail();
    }
}
Also used : GetJobClusterResponse(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.GetJobClusterResponse) GetLastSubmittedJobIdStreamResponse(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.GetLastSubmittedJobIdStreamResponse) ActorRef(akka.actor.ActorRef) GetJobDetailsRequest(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.GetJobDetailsRequest) MantisScheduler(io.mantisrx.server.master.scheduler.MantisScheduler) WorkerMigrationConfig(io.mantisrx.runtime.WorkerMigrationConfig) GetJobDetailsResponse(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.GetJobDetailsResponse) WorkerHeartbeat(io.mantisrx.master.jobcluster.job.worker.WorkerHeartbeat) WorkerEvent(io.mantisrx.server.master.scheduler.WorkerEvent) WorkerStatus(io.mantisrx.master.jobcluster.job.worker.WorkerStatus) IMantisWorkerMetadata(io.mantisrx.master.jobcluster.job.worker.IMantisWorkerMetadata) GetJobClusterRequest(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.GetJobClusterRequest) WorkerLaunched(io.mantisrx.server.master.scheduler.WorkerLaunched) JobId(io.mantisrx.server.master.domain.JobId) JobClusterManagerProto(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto) JobClustersManagerInitializeResponse(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.JobClustersManagerInitializeResponse) Status(io.mantisrx.server.core.Status) WorkerStatus(io.mantisrx.master.jobcluster.job.worker.WorkerStatus) TestKit(akka.testkit.javadsl.TestKit) IOException(java.io.IOException) WorkerId(io.mantisrx.server.core.domain.WorkerId) JobWorker(io.mantisrx.master.jobcluster.job.worker.JobWorker) MantisStorageProviderAdapter(io.mantisrx.server.master.persistence.MantisStorageProviderAdapter) GetLastSubmittedJobIdStreamRequest(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.GetLastSubmittedJobIdStreamRequest) MantisJobStore(io.mantisrx.server.master.persistence.MantisJobStore) WorkerPorts(io.mantisrx.common.WorkerPorts) Test(org.junit.Test)

Example 5 with WorkerMigrationConfig

use of io.mantisrx.runtime.WorkerMigrationConfig in project mantis by Netflix.

the class JobTestMigrationTests method testWorkerMigration.

@Test
public void testWorkerMigration() {
    String clusterName = "testWorkerMigration";
    TestKit probe = new TestKit(system);
    SchedulingInfo sInfo = new SchedulingInfo.Builder().numberOfStages(1).singleWorkerStageWithConstraints(new MachineDefinition(1.0, 1.0, 1.0, 3), Lists.newArrayList(), Lists.newArrayList()).build();
    IJobClusterDefinition jobClusterDefn = JobTestHelper.generateJobClusterDefinition(clusterName, sInfo, new WorkerMigrationConfig(MigrationStrategyEnum.ONE_WORKER, "{}"));
    CountDownLatch scheduleCDL = new CountDownLatch(2);
    CountDownLatch unscheduleCDL = new CountDownLatch(1);
    JobDefinition jobDefn;
    try {
        jobDefn = JobTestHelper.generateJobDefinition(clusterName, sInfo);
        // mock(MantisScheduler.class); //
        MantisScheduler schedulerMock = new DummyScheduler(scheduleCDL, unscheduleCDL);
        MantisJobStore jobStoreMock = mock(MantisJobStore.class);
        MantisJobMetadataImpl mantisJobMetaData = new MantisJobMetadataImpl.Builder().withJobId(new JobId(clusterName, 2)).withSubmittedAt(Instant.now()).withJobState(JobState.Accepted).withNextWorkerNumToUse(1).withJobDefinition(jobDefn).build();
        final ActorRef jobActor = system.actorOf(JobActor.props(jobClusterDefn, mantisJobMetaData, jobStoreMock, schedulerMock, eventPublisher));
        jobActor.tell(new JobProto.InitJob(probe.getRef()), probe.getRef());
        JobProto.JobInitialized initMsg = probe.expectMsgClass(JobProto.JobInitialized.class);
        assertEquals(SUCCESS, initMsg.responseCode);
        String jobId = clusterName + "-2";
        int stageNo = 1;
        WorkerId workerId = new WorkerId(jobId, 0, 1);
        // send Launched, Initiated and heartbeat
        JobTestHelper.sendLaunchedInitiatedStartedEventsToWorker(probe, jobActor, jobId, stageNo, workerId);
        // check job status again
        jobActor.tell(new JobClusterManagerProto.GetJobDetailsRequest("nj", jobId), probe.getRef());
        JobClusterManagerProto.GetJobDetailsResponse resp3 = probe.expectMsgClass(JobClusterManagerProto.GetJobDetailsResponse.class);
        assertEquals(SUCCESS, resp3.responseCode);
        // worker has started so job should be started.
        assertEquals(JobState.Launched, resp3.getJobMetadata().get().getState());
        // Send migrate worker message
        jobActor.tell(new WorkerOnDisabledVM(workerId), probe.getRef());
        // Trigger check hb status and that should start the migration. And migrate first worker
        Instant now = Instant.now();
        jobActor.tell(new JobProto.CheckHeartBeat(), probe.getRef());
        // send HB for the migrated worker
        WorkerId migratedWorkerId1 = new WorkerId(jobId, 0, 2);
        JobTestHelper.sendLaunchedInitiatedStartedEventsToWorker(probe, jobActor, jobId, stageNo, migratedWorkerId1);
        // Trigger another check should be noop
        // jobActor.tell(new JobProto.CheckHeartBeat(now.plusSeconds(120)), probe.getRef());
        scheduleCDL.await(1, TimeUnit.SECONDS);
        unscheduleCDL.await(1, TimeUnit.SECONDS);
    // // 1 original submissions and 1 resubmit because of migration
    // when(schedulerMock.scheduleWorker(any())).
    // verify(schedulerMock, times(2)).scheduleWorker(any());
    // //            // 1 kill due to resubmits
    // verify(schedulerMock, times(1)).unscheduleWorker(any(), any());
    // 
    // assertEquals(jobActor, probe.getLastSender());
    } catch (InvalidJobException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
        fail();
    } catch (Exception e) {
        e.printStackTrace();
        fail();
    }
}
Also used : ActorRef(akka.actor.ActorRef) JobProto(io.mantisrx.master.jobcluster.proto.JobProto) WorkerMigrationConfig(io.mantisrx.runtime.WorkerMigrationConfig) MantisScheduler(io.mantisrx.server.master.scheduler.MantisScheduler) WorkerOnDisabledVM(io.mantisrx.server.master.scheduler.WorkerOnDisabledVM) JobDefinition(io.mantisrx.server.master.domain.JobDefinition) JobId(io.mantisrx.server.master.domain.JobId) JobClusterManagerProto(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto) SchedulingInfo(io.mantisrx.runtime.descriptor.SchedulingInfo) MachineDefinition(io.mantisrx.runtime.MachineDefinition) Instant(java.time.Instant) TestKit(akka.testkit.javadsl.TestKit) CountDownLatch(java.util.concurrent.CountDownLatch) WorkerId(io.mantisrx.server.core.domain.WorkerId) InvalidJobException(io.mantisrx.runtime.command.InvalidJobException) MantisJobStore(io.mantisrx.server.master.persistence.MantisJobStore) IJobClusterDefinition(io.mantisrx.server.master.domain.IJobClusterDefinition) InvalidJobException(io.mantisrx.runtime.command.InvalidJobException) Test(org.junit.Test)

Aggregations

TestKit (akka.testkit.javadsl.TestKit)5 WorkerMigrationConfig (io.mantisrx.runtime.WorkerMigrationConfig)5 Test (org.junit.Test)5 ActorRef (akka.actor.ActorRef)4 JobClusterManagerProto (io.mantisrx.master.jobcluster.proto.JobClusterManagerProto)4 GetJobClusterRequest (io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.GetJobClusterRequest)4 GetJobClusterResponse (io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.GetJobClusterResponse)4 MantisJobStore (io.mantisrx.server.master.persistence.MantisJobStore)4 MantisScheduler (io.mantisrx.server.master.scheduler.MantisScheduler)4 WorkerId (io.mantisrx.server.core.domain.WorkerId)3 JobId (io.mantisrx.server.master.domain.JobId)3 WorkerPorts (io.mantisrx.common.WorkerPorts)2 JobWorker (io.mantisrx.master.jobcluster.job.worker.JobWorker)2 WorkerHeartbeat (io.mantisrx.master.jobcluster.job.worker.WorkerHeartbeat)2 WorkerStatus (io.mantisrx.master.jobcluster.job.worker.WorkerStatus)2 GetJobDetailsRequest (io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.GetJobDetailsRequest)2 GetJobDetailsResponse (io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.GetJobDetailsResponse)2 GetLastSubmittedJobIdStreamRequest (io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.GetLastSubmittedJobIdStreamRequest)2 GetLastSubmittedJobIdStreamResponse (io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.GetLastSubmittedJobIdStreamResponse)2 JobClustersManagerInitializeResponse (io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.JobClustersManagerInitializeResponse)2