use of io.mantisrx.runtime.WorkerMigrationConfig in project mantis by Netflix.
the class JobClusterManagerTest method testBootStrapJobClustersAndJobs.
/**
 * Verifies that a new {@code JobClustersManagerActor} created over the same job store sees the
 * clusters and jobs that were created through a previous, now stopped, actor instance.
 *
 * <p>Scenario driven by this test:
 * <ol>
 *   <li>Create one cluster without jobs and three clusters with one submitted job each
 *       (all three with a PERCENTAGE migration config).</li>
 *   <li>Send launched / start-initiated / heartbeat worker events for job 1 and assert it is
 *       reported as {@code Launched}.</li>
 *   <li>Kill job 2 and send its worker-terminated event.</li>
 *   <li>Stop the actor, create a new one over the same spied store, and initialize it.</li>
 *   <li>Assert cluster config, job states (Launched / Completed / Accepted), worker lookups,
 *       last submitted job id, scheduler interactions and store interactions.</li>
 * </ol>
 */
@Test
public void testBootStrapJobClustersAndJobs() {
    TestKit probe = new TestKit(system);
    JobTestHelper.deleteAllFiles();
    MantisJobStore jobStore = new MantisJobStore(new MantisStorageProviderAdapter(new io.mantisrx.server.master.store.SimpleCachedFileStorageProvider(), eventPublisher));
    MantisJobStore jobStoreSpied = Mockito.spy(jobStore);
    MantisScheduler schedulerMock = mock(MantisScheduler.class);
    ActorRef jobClusterManagerActor = system.actorOf(JobClustersManagerActor.props(jobStoreSpied, eventPublisher));
    // NOTE(review): the second argument is false here and true after the restart below;
    // presumably it controls loading jobs from the store - confirm against the proto class.
    jobClusterManagerActor.tell(new JobClusterManagerProto.JobClustersManagerInitialize(schedulerMock, false), probe.getRef());
    // Only wait for the initialize response; its content is not inspected for the first instance.
    probe.expectMsgClass(Duration.of(10, ChronoUnit.MINUTES), JobClustersManagerInitializeResponse.class);

    List<String> clusterNames = Lists.newArrayList("testBootStrapJobClustersAndJobs1", "testBootStrapJobClustersAndJobs2", "testBootStrapJobClustersAndJobs3");
    String clusterWithNoJob = "testBootStrapJobClusterWithNoJob";
    createJobClusterAndAssert(jobClusterManagerActor, clusterWithNoJob);
    WorkerMigrationConfig migrationConfig = new WorkerMigrationConfig(MigrationStrategyEnum.PERCENTAGE, "{\"percentToMove\":60, \"intervalMs\":30000}");

    // Create 3 clusters and submit 1 job each
    for (String cluster : clusterNames) {
        createJobClusterAndAssert(jobClusterManagerActor, cluster, migrationConfig);
        submitJobAndAssert(jobClusterManagerActor, cluster);
        if (cluster.equals("testBootStrapJobClustersAndJobs1")) {
            // Send worker events for job 1 so that it leaves the Accepted state.
            String jobId = "testBootStrapJobClustersAndJobs1-1";
            WorkerId workerId = new WorkerId(jobId, 0, 1);
            WorkerEvent launchedEvent = new WorkerLaunched(workerId, 0, "host1", "vm1", empty(), new WorkerPorts(Lists.newArrayList(8000, 9000, 9010, 9020, 9030)));
            jobClusterManagerActor.tell(launchedEvent, probe.getRef());
            WorkerEvent startInitEvent = new WorkerStatus(new Status(workerId.getJobId(), 1, workerId.getWorkerIndex(), workerId.getWorkerNum(), TYPE.INFO, "test START_INIT", MantisJobState.StartInitiated));
            jobClusterManagerActor.tell(startInitEvent, probe.getRef());
            WorkerEvent heartBeat = new WorkerHeartbeat(new Status(jobId, 1, workerId.getWorkerIndex(), workerId.getWorkerNum(), TYPE.HEARTBEAT, "", MantisJobState.Started));
            jobClusterManagerActor.tell(heartBeat, probe.getRef());

            // Job 1 must now be reported as Launched.
            jobClusterManagerActor.tell(new GetJobDetailsRequest("user", JobId.fromId(jobId).get()), probe.getRef());
            GetJobDetailsResponse resp2 = probe.expectMsgClass(GetJobDetailsResponse.class);
            assertEquals(SUCCESS, resp2.responseCode);
            assertEquals(JobState.Launched, resp2.getJobMetadata().get().getState());
        }
    }

    // Kill 1 of the jobs to test the archive path
    JobClusterManagerProto.KillJobRequest killRequest = new JobClusterManagerProto.KillJobRequest("testBootStrapJobClustersAndJobs2-1", JobCompletedReason.Killed.toString(), "njoshi");
    jobClusterManagerActor.tell(killRequest, probe.getRef());
    JobClusterManagerProto.KillJobResponse killJobResponse = probe.expectMsgClass(JobClusterManagerProto.KillJobResponse.class);
    assertEquals(SUCCESS, killJobResponse.responseCode);
    JobTestHelper.sendWorkerTerminatedEvent(probe, jobClusterManagerActor, "testBootStrapJobClustersAndJobs2-1", new WorkerId("testBootStrapJobClustersAndJobs2-1", 0, 1));

    // Pause before stopping the actor (presumably to let it finish handling the termination).
    try {
        Thread.sleep(500);
    } catch (InterruptedException e) {
        // Restore the interrupt flag and abort instead of silently continuing.
        Thread.currentThread().interrupt();
        throw new AssertionError("interrupted while pausing before restarting the manager actor", e);
    }

    // Stop the job cluster manager actor and bring up a fresh instance over the same store.
    system.stop(jobClusterManagerActor);
    jobClusterManagerActor = system.actorOf(JobClustersManagerActor.props(jobStoreSpied, eventPublisher));
    jobClusterManagerActor.tell(new JobClusterManagerProto.JobClustersManagerInitialize(schedulerMock, true), probe.getRef());
    JobClustersManagerInitializeResponse initializeResponse = probe.expectMsgClass(JobClustersManagerInitializeResponse.class);
    assertEquals(SUCCESS, initializeResponse.responseCode);

    // Cluster config must match what was originally created.
    jobClusterManagerActor.tell(new GetJobClusterRequest("testBootStrapJobClustersAndJobs1"), probe.getRef());
    GetJobClusterResponse clusterResponse = probe.expectMsgClass(GetJobClusterResponse.class);
    assertEquals(SUCCESS, clusterResponse.responseCode);
    assertTrue(clusterResponse.getJobCluster().isPresent());
    WorkerMigrationConfig mConfig = clusterResponse.getJobCluster().get().getMigrationConfig();
    assertEquals(migrationConfig.getStrategy(), mConfig.getStrategy());
    // Compare against the reloaded config (previously the expected value was compared to itself).
    assertEquals(migrationConfig.getConfigString(), mConfig.getConfigString());

    // Job 1 should still be Launched.
    jobClusterManagerActor.tell(new GetJobDetailsRequest("user", JobId.fromId("testBootStrapJobClustersAndJobs1-1").get()), probe.getRef());
    GetJobDetailsResponse resp2 = probe.expectMsgClass(GetJobDetailsResponse.class);
    System.out.println("Resp2 -> " + resp2.message);
    assertEquals(SUCCESS, resp2.responseCode);
    assertEquals(JobState.Launched, resp2.getJobMetadata().get().getState());

    // Job 2 was killed, so it should be Completed.
    jobClusterManagerActor.tell(new GetJobDetailsRequest("user", JobId.fromId("testBootStrapJobClustersAndJobs2-1").get()), probe.getRef());
    resp2 = probe.expectMsgClass(Duration.of(10, ChronoUnit.MINUTES), GetJobDetailsResponse.class);
    assertEquals(SUCCESS, resp2.responseCode);
    assertEquals(JobState.Completed, resp2.getJobMetadata().get().getState());

    // Job 3 never received worker events, so it should still be Accepted.
    jobClusterManagerActor.tell(new GetJobDetailsRequest("user", JobId.fromId("testBootStrapJobClustersAndJobs3-1").get()), probe.getRef());
    resp2 = probe.expectMsgClass(Duration.of(10, ChronoUnit.MINUTES), GetJobDetailsResponse.class);
    assertEquals(SUCCESS, resp2.responseCode);
    assertEquals(JobState.Accepted, resp2.getJobMetadata().get().getState());

    // Worker metadata of job 3 must be reachable by stage/index and by worker number.
    try {
        Optional<JobWorker> workerByIndex = resp2.getJobMetadata().get().getWorkerByIndex(1, 0);
        assertTrue(workerByIndex.isPresent());
        Optional<IMantisStageMetadata> stageMetadata = resp2.getJobMetadata().get().getStageMetadata(1);
        assertTrue(stageMetadata.isPresent());
        JobWorker workerByIndex1 = stageMetadata.get().getWorkerByIndex(0);
        System.out.println("Got worker by index : " + workerByIndex1);
        Optional<JobWorker> worker = resp2.getJobMetadata().get().getWorkerByNumber(1);
        assertTrue(worker.isPresent());
    } catch (io.mantisrx.server.master.persistence.exceptions.InvalidJobException e) {
        // A lookup failure must fail the test; swallowing it would skip the assertions above.
        throw new AssertionError("worker lookup on re-hydrated job metadata failed", e);
    }

    jobClusterManagerActor.tell(new GetLastSubmittedJobIdStreamRequest("testBootStrapJobClustersAndJobs1"), probe.getRef());
    GetLastSubmittedJobIdStreamResponse lastSubmittedJobIdStreamResponse = probe.expectMsgClass(Duration.of(10, ChronoUnit.MINUTES), GetLastSubmittedJobIdStreamResponse.class);
    lastSubmittedJobIdStreamResponse.getjobIdBehaviorSubject().get().take(1).toBlocking().subscribe((jId) -> {
        assertEquals(new JobId("testBootStrapJobClustersAndJobs1", 1), jId);
    });

    // The cluster that never had a job must also be present after the restart.
    jobClusterManagerActor.tell(new GetJobClusterRequest(clusterWithNoJob), probe.getRef());
    GetJobClusterResponse jobClusterResponse = probe.expectMsgClass(Duration.of(10, ChronoUnit.MINUTES), GetJobClusterResponse.class);
    assertEquals(SUCCESS, jobClusterResponse.responseCode);
    assertTrue(jobClusterResponse.getJobCluster().isPresent());
    assertEquals(clusterWithNoJob, jobClusterResponse.getJobCluster().get().getName());

    // 1 running worker (timeout was previously the mistyped literal 100_1000).
    verify(schedulerMock, timeout(100_000).times(1)).initializeRunningWorker(any(), any());
    // 4 schedule requests in total on the shared scheduler mock.
    // NOTE(review): presumably 3 initial submissions plus 1 re-schedule of the still-Accepted
    // job after the restart - confirm.
    verify(schedulerMock, timeout(100_000).times(4)).scheduleWorker(any());

    try {
        Mockito.verify(jobStoreSpied).loadAllArchivedJobsAsync();
        Mockito.verify(jobStoreSpied).loadAllActiveJobs();
        Mockito.verify(jobStoreSpied).loadAllCompletedJobs();
        Mockito.verify(jobStoreSpied).archiveWorker(any());
        Mockito.verify(jobStoreSpied).archiveJob(any());
    } catch (IOException e) {
        // Keep the cause so a failure is diagnosable from the test report.
        throw new AssertionError("unexpected IOException while verifying job store interactions", e);
    }
}
use of io.mantisrx.runtime.WorkerMigrationConfig in project mantis by Netflix.
the class JobClusterManagerTest method testJobClusterWorkerMigrationUpdate.
/**
 * Verifies that updating a job cluster's worker migration strategy through the job clusters
 * manager actor is reflected by a subsequent {@code GetJobClusterRequest}.
 *
 * @throws MalformedURLException declared for the fake cluster definition helper
 */
@Test
public void testJobClusterWorkerMigrationUpdate() throws MalformedURLException {
    TestKit probe = new TestKit(system);
    String clusterName = "testJobClusterWorkerMigrationUpdate";
    List<Label> labels = Lists.newLinkedList();
    final JobClusterDefinitionImpl fakeJobCluster = createFakeJobClusterDefn(clusterName, labels);

    // Create the cluster that will be updated.
    jobClusterManagerActor.tell(new JobClusterManagerProto.CreateJobClusterRequest(fakeJobCluster, "user"), probe.getRef());
    JobClusterManagerProto.CreateJobClusterResponse createResp = probe.expectMsgClass(JobClusterManagerProto.CreateJobClusterResponse.class);
    assertEquals(SUCCESS_CREATED, createResp.responseCode);

    // Switch the migration strategy to ONE_WORKER. The last argument is the requesting user;
    // it was previously passed the cluster name, unlike the create request above.
    UpdateJobClusterWorkerMigrationStrategyRequest req = new JobClusterManagerProto.UpdateJobClusterWorkerMigrationStrategyRequest(
            clusterName, new WorkerMigrationConfig(MigrationStrategyEnum.ONE_WORKER, "{}"), "user");
    jobClusterManagerActor.tell(req, probe.getRef());
    JobClusterManagerProto.UpdateJobClusterWorkerMigrationStrategyResponse updateResp = probe.expectMsgClass(JobClusterManagerProto.UpdateJobClusterWorkerMigrationStrategyResponse.class);
    assertEquals(SUCCESS, updateResp.responseCode);

    // Read the cluster back and check the new strategy.
    jobClusterManagerActor.tell(new GetJobClusterRequest(clusterName), probe.getRef());
    GetJobClusterResponse getResp = probe.expectMsgClass(GetJobClusterResponse.class);
    assertEquals(SUCCESS, getResp.responseCode);
    // Fail with a clear assertion instead of NoSuchElementException if the cluster is missing.
    assertTrue(getResp.getJobCluster().isPresent());
    assertEquals(MigrationStrategyEnum.ONE_WORKER, getResp.getJobCluster().get().getMigrationConfig().getStrategy());
}
use of io.mantisrx.runtime.WorkerMigrationConfig in project mantis by Netflix.
the class JobClusterTest method testJobClusterMigrationConfigUpdate.
/**
 * Verifies that a job cluster actor applies a worker migration config update: the update request
 * succeeds, the reply comes from the cluster actor, the cluster read back reports the new
 * strategy, and the (mocked) job store sees exactly one create and one update call.
 *
 * @throws Exception if a helper or a verified store method signals a checked exception
 */
@Test
public void testJobClusterMigrationConfigUpdate() throws Exception {
    TestKit probe = new TestKit(system);
    String clusterName = "testJobClusterMigrationConfigUpdate";
    MantisScheduler schedulerMock = mock(MantisScheduler.class);
    MantisJobStore jobStoreMock = mock(MantisJobStore.class);
    final JobClusterDefinitionImpl fakeJobCluster = createFakeJobClusterDefn(clusterName);

    // Initialize the cluster actor with the fake definition.
    ActorRef jobClusterActor = system.actorOf(props(clusterName, jobStoreMock, schedulerMock, eventPublisher));
    jobClusterActor.tell(new JobClusterProto.InitializeJobClusterRequest(fakeJobCluster, user, probe.getRef()), probe.getRef());
    JobClusterProto.InitializeJobClusterResponse createResp = probe.expectMsgClass(JobClusterProto.InitializeJobClusterResponse.class);
    assertEquals(SUCCESS, createResp.responseCode);

    // Update the migration config to ONE_WORKER.
    WorkerMigrationConfig newConfig = new WorkerMigrationConfig(MigrationStrategyEnum.ONE_WORKER, "{'name':'value'}");
    UpdateJobClusterWorkerMigrationStrategyRequest updateMigrationConfigReq = new UpdateJobClusterWorkerMigrationStrategyRequest(clusterName, newConfig, "user");
    jobClusterActor.tell(updateMigrationConfigReq, probe.getRef());
    UpdateJobClusterWorkerMigrationStrategyResponse resp = probe.expectMsgClass(UpdateJobClusterWorkerMigrationStrategyResponse.class);
    assertEquals(SUCCESS, resp.responseCode);
    assertEquals(jobClusterActor, probe.getLastSender());

    // Read the cluster back and check the updated strategy.
    jobClusterActor.tell(new GetJobClusterRequest(clusterName), probe.getRef());
    GetJobClusterResponse resp3 = probe.expectMsgClass(GetJobClusterResponse.class);
    assertEquals(SUCCESS, resp3.responseCode);
    // getJobCluster() is dereferenced with get() below, so assert presence rather than non-null.
    assertTrue(resp3.getJobCluster().isPresent());
    assertEquals(clusterName, resp3.getJobCluster().get().getName());
    assertEquals(MigrationStrategyEnum.ONE_WORKER, resp3.getJobCluster().get().getMigrationConfig().getStrategy());

    // One store write for the initial create and one for the migration config update.
    verify(jobStoreMock, times(1)).updateJobCluster(any());
    verify(jobStoreMock, times(1)).createJobCluster(any());
}
use of io.mantisrx.runtime.WorkerMigrationConfig in project mantis by Netflix.
the class JobClusterManagerTest method testBootstrapJobClusterAndJobsWithCorruptedWorkerPorts.
/**
 * Case for a master leader re-election when a new master re-hydrates corrupted job worker metadata.
 *
 * <p>The test starts a job and drives its worker through launched / start-initiated / heartbeat
 * events. It then overwrites the stored worker metadata with a record whose worker ports are
 * {@code null}, and restarts the job clusters manager actor over the same store. After the
 * restart it expects the worker to have been replaced (worker number 11, resubmit count 1, ports
 * present), the original worker to have been unscheduled, and the job to remain {@code Launched}.
 *
 * @throws IOException if a job store call signals an I/O failure
 * @throws io.mantisrx.server.master.persistence.exceptions.InvalidJobException if job metadata
 *         lookups or the store update reject the job
 */
@Test
public void testBootstrapJobClusterAndJobsWithCorruptedWorkerPorts() throws IOException, io.mantisrx.server.master.persistence.exceptions.InvalidJobException {
    TestKit probe = new TestKit(system);
    JobTestHelper.deleteAllFiles();
    MantisJobStore jobStore = new MantisJobStore(new MantisStorageProviderAdapter(new io.mantisrx.server.master.store.SimpleCachedFileStorageProvider(), eventPublisher));
    MantisJobStore jobStoreSpied = Mockito.spy(jobStore);
    MantisScheduler schedulerMock = mock(MantisScheduler.class);
    ActorRef jobClusterManagerActor = system.actorOf(JobClustersManagerActor.props(jobStoreSpied, eventPublisher));
    jobClusterManagerActor.tell(new JobClusterManagerProto.JobClustersManagerInitialize(schedulerMock, false), probe.getRef());
    probe.expectMsgClass(Duration.of(10, ChronoUnit.MINUTES), JobClustersManagerInitializeResponse.class);

    // Create the cluster and submit its first job.
    String jobClusterName = "testBootStrapJobClustersAndJobs1";
    WorkerMigrationConfig migrationConfig = new WorkerMigrationConfig(MigrationStrategyEnum.PERCENTAGE, "{\"percentToMove\":60, \"intervalMs\":30000}");
    createJobClusterAndAssert(jobClusterManagerActor, jobClusterName, migrationConfig);
    submitJobAndAssert(jobClusterManagerActor, jobClusterName);

    // Drive worker 1 through launched, start-initiated and heartbeat events.
    String jobId = "testBootStrapJobClustersAndJobs1-1";
    WorkerId workerId = new WorkerId(jobId, 0, 1);
    WorkerEvent launchedEvent = new WorkerLaunched(workerId, 0, "host1", "vm1", empty(), new WorkerPorts(Lists.newArrayList(8000, 9000, 9010, 9020, 9030)));
    jobClusterManagerActor.tell(launchedEvent, probe.getRef());
    WorkerEvent startInitEvent = new WorkerStatus(new Status(workerId.getJobId(), 1, workerId.getWorkerIndex(), workerId.getWorkerNum(), TYPE.INFO, "test START_INIT", MantisJobState.StartInitiated));
    jobClusterManagerActor.tell(startInitEvent, probe.getRef());
    WorkerEvent heartBeat = new WorkerHeartbeat(new Status(jobId, 1, workerId.getWorkerIndex(), workerId.getWorkerNum(), TYPE.HEARTBEAT, "", MantisJobState.Started));
    jobClusterManagerActor.tell(heartBeat, probe.getRef());

    // The job details request must succeed before the stored metadata is corrupted.
    jobClusterManagerActor.tell(new GetJobDetailsRequest("user", JobId.fromId(jobId).get()), probe.getRef());
    GetJobDetailsResponse resp2 = probe.expectMsgClass(GetJobDetailsResponse.class);
    assertEquals(SUCCESS, resp2.responseCode);

    // Overwrite the stored worker with a Started worker that has no worker ports (the corruption).
    JobWorker worker = new JobWorker.Builder()
            .withWorkerIndex(0)
            .withWorkerNumber(1)
            .withJobId(jobId)
            .withStageNum(1)
            .withNumberOfPorts(5)
            .withWorkerPorts(null)
            .withState(WorkerState.Started)
            .withLifecycleEventsPublisher(eventPublisher)
            .build();
    jobStoreSpied.updateWorker(worker.getMetadata());

    // Stop the manager actor and bring up a fresh instance over the same store.
    system.stop(jobClusterManagerActor);
    jobClusterManagerActor = system.actorOf(JobClustersManagerActor.props(jobStoreSpied, eventPublisher));
    jobClusterManagerActor.tell(new JobClusterManagerProto.JobClustersManagerInitialize(schedulerMock, true), probe.getRef());
    JobClustersManagerInitializeResponse initializeResponse = probe.expectMsgClass(JobClustersManagerInitializeResponse.class);
    assertEquals(SUCCESS, initializeResponse.responseCode);

    // Report the replacement worker (number 11) as launched.
    WorkerId newWorkerId = new WorkerId(jobId, 0, 11);
    launchedEvent = new WorkerLaunched(newWorkerId, 0, "host1", "vm1", empty(), new WorkerPorts(Lists.newArrayList(8000, 9000, 9010, 9020, 9030)));
    jobClusterManagerActor.tell(launchedEvent, probe.getRef());

    // Cluster config must match what was originally created.
    jobClusterManagerActor.tell(new GetJobClusterRequest("testBootStrapJobClustersAndJobs1"), probe.getRef());
    GetJobClusterResponse clusterResponse = probe.expectMsgClass(GetJobClusterResponse.class);
    assertEquals(SUCCESS, clusterResponse.responseCode);
    assertTrue(clusterResponse.getJobCluster().isPresent());
    WorkerMigrationConfig mConfig = clusterResponse.getJobCluster().get().getMigrationConfig();
    assertEquals(migrationConfig.getStrategy(), mConfig.getStrategy());
    // Compare against the reloaded config (previously the expected value was compared to itself).
    assertEquals(migrationConfig.getConfigString(), mConfig.getConfigString());

    // The job must be Launched and served by the replacement worker with valid ports.
    jobClusterManagerActor.tell(new GetJobDetailsRequest("user", JobId.fromId("testBootStrapJobClustersAndJobs1-1").get()), probe.getRef());
    resp2 = probe.expectMsgClass(GetJobDetailsResponse.class);
    assertEquals(SUCCESS, resp2.responseCode);
    assertEquals(JobState.Launched, resp2.getJobMetadata().get().getState());
    IMantisWorkerMetadata mantisWorkerMetadata = resp2.getJobMetadata().get().getWorkerByIndex(1, 0).get().getMetadata();
    assertNotNull(mantisWorkerMetadata.getWorkerPorts());
    assertEquals(11, mantisWorkerMetadata.getWorkerNumber());
    assertEquals(1, mantisWorkerMetadata.getTotalResubmitCount());

    jobClusterManagerActor.tell(new GetLastSubmittedJobIdStreamRequest("testBootStrapJobClustersAndJobs1"), probe.getRef());
    GetLastSubmittedJobIdStreamResponse lastSubmittedJobIdStreamResponse = probe.expectMsgClass(Duration.of(10, ChronoUnit.MINUTES), GetLastSubmittedJobIdStreamResponse.class);
    lastSubmittedJobIdStreamResponse.getjobIdBehaviorSubject().get().take(1).toBlocking().subscribe((jId) -> {
        assertEquals(new JobId("testBootStrapJobClustersAndJobs1", 1), jId);
    });

    // Two schedules: one for the initial success, one for a resubmit from corrupted worker ports.
    verify(schedulerMock, times(2)).scheduleWorker(any());
    // One unschedule from corrupted worker ID 1 (before the resubmit).
    verify(schedulerMock, times(1)).unscheduleAndTerminateWorker(eq(workerId), any());

    // The method already declares IOException, so no try/catch is needed around these
    // verifications; an unexpected exception propagates with its cause intact.
    Mockito.verify(jobStoreSpied).loadAllArchivedJobsAsync();
    Mockito.verify(jobStoreSpied).loadAllActiveJobs();
    Mockito.verify(jobStoreSpied).loadAllCompletedJobs();
    Mockito.verify(jobStoreSpied).archiveWorker(any());
}
use of io.mantisrx.runtime.WorkerMigrationConfig in project mantis by Netflix.
the class JobTestMigrationTests method testWorkerMigration.
/**
 * Verifies worker migration for a single-worker job using the ONE_WORKER migration strategy.
 *
 * <p>The job actor is initialized with a {@code DummyScheduler} that receives two latches. After
 * the first worker is started, the test reports that worker as being on a disabled VM and sends a
 * heartbeat check. It then sends start events for the replacement worker and waits on the latches.
 *
 * <p>NOTE(review): the latches are presumably counted down by {@code DummyScheduler} on schedule
 * and unschedule calls (2 schedules = original submit + migration resubmit, 1 unschedule) -
 * confirm against {@code DummyScheduler}.
 */
@Test
public void testWorkerMigration() {
    String clusterName = "testWorkerMigration";
    TestKit probe = new TestKit(system);
    SchedulingInfo sInfo = new SchedulingInfo.Builder()
            .numberOfStages(1)
            .singleWorkerStageWithConstraints(new MachineDefinition(1.0, 1.0, 1.0, 3), Lists.newArrayList(), Lists.newArrayList())
            .build();
    IJobClusterDefinition jobClusterDefn = JobTestHelper.generateJobClusterDefinition(clusterName, sInfo, new WorkerMigrationConfig(MigrationStrategyEnum.ONE_WORKER, "{}"));
    CountDownLatch scheduleCDL = new CountDownLatch(2);
    CountDownLatch unscheduleCDL = new CountDownLatch(1);
    try {
        JobDefinition jobDefn = JobTestHelper.generateJobDefinition(clusterName, sInfo);
        MantisScheduler schedulerMock = new DummyScheduler(scheduleCDL, unscheduleCDL);
        MantisJobStore jobStoreMock = mock(MantisJobStore.class);
        MantisJobMetadataImpl mantisJobMetaData = new MantisJobMetadataImpl.Builder()
                .withJobId(new JobId(clusterName, 2))
                .withSubmittedAt(Instant.now())
                .withJobState(JobState.Accepted)
                .withNextWorkerNumToUse(1)
                .withJobDefinition(jobDefn)
                .build();
        final ActorRef jobActor = system.actorOf(JobActor.props(jobClusterDefn, mantisJobMetaData, jobStoreMock, schedulerMock, eventPublisher));
        jobActor.tell(new JobProto.InitJob(probe.getRef()), probe.getRef());
        JobProto.JobInitialized initMsg = probe.expectMsgClass(JobProto.JobInitialized.class);
        assertEquals(SUCCESS, initMsg.responseCode);

        String jobId = clusterName + "-2";
        int stageNo = 1;
        WorkerId workerId = new WorkerId(jobId, 0, 1);
        // Send Launched, Initiated and heartbeat for the original worker.
        JobTestHelper.sendLaunchedInitiatedStartedEventsToWorker(probe, jobActor, jobId, stageNo, workerId);

        // The worker has started, so the job should be reported as Launched.
        jobActor.tell(new JobClusterManagerProto.GetJobDetailsRequest("nj", jobId), probe.getRef());
        JobClusterManagerProto.GetJobDetailsResponse resp3 = probe.expectMsgClass(JobClusterManagerProto.GetJobDetailsResponse.class);
        assertEquals(SUCCESS, resp3.responseCode);
        assertEquals(JobState.Launched, resp3.getJobMetadata().get().getState());

        // Report the worker as sitting on a disabled VM, then trigger a heartbeat check, which
        // should start the migration of that worker.
        jobActor.tell(new WorkerOnDisabledVM(workerId), probe.getRef());
        jobActor.tell(new JobProto.CheckHeartBeat(), probe.getRef());

        // Send start events for the migrated (replacement) worker.
        WorkerId migratedWorkerId1 = new WorkerId(jobId, 0, 2);
        JobTestHelper.sendLaunchedInitiatedStartedEventsToWorker(probe, jobActor, jobId, stageNo, migratedWorkerId1);

        // await() returns false on timeout; previously the result was ignored, so the test
        // passed even when the scheduler was never called.
        if (!scheduleCDL.await(1, TimeUnit.SECONDS)) {
            fail("scheduleCDL did not reach zero within 1s; remaining count: " + scheduleCDL.getCount());
        }
        if (!unscheduleCDL.await(1, TimeUnit.SECONDS)) {
            fail("unscheduleCDL did not reach zero within 1s; remaining count: " + unscheduleCDL.getCount());
        }
    } catch (InterruptedException e) {
        // Restore the interrupt flag for the test runner and fail with the cause attached.
        Thread.currentThread().interrupt();
        throw new AssertionError("interrupted while waiting for scheduler callbacks", e);
    } catch (Exception e) {
        // Covers InvalidJobException and any other setup failure; keep the cause for diagnosis.
        throw new AssertionError("unexpected exception in testWorkerMigration", e);
    }
}
Aggregations