use of io.mantisrx.server.master.scheduler.WorkerEvent in project mantis by Netflix.
the class JobRoute method getJobRoutes.
private Route getJobRoutes() {
return route(path(STATUS_ENDPOINT, () -> post(() -> decodeRequest(() -> entity(Unmarshaller.entityToString(), req -> {
if (logger.isDebugEnabled()) {
logger.debug("/api/postjobstatus called {}", req);
}
try {
workerHeartbeatStatusPOST.increment();
PostJobStatusRequest postJobStatusRequest = Jackson.fromJSON(req, PostJobStatusRequest.class);
WorkerEvent workerStatusRequest = createWorkerStatusRequest(postJobStatusRequest);
if (workerStatusRequest instanceof WorkerHeartbeat) {
if (!ConfigurationProvider.getConfig().isHeartbeatProcessingEnabled()) {
// skip heartbeat processing
if (logger.isTraceEnabled()) {
logger.trace("skipped heartbeat event {}", workerStatusRequest);
}
workerHeartbeatSkipped.increment();
return complete(StatusCodes.OK);
}
}
return completeWithFuture(jobRouteHandler.workerStatus(workerStatusRequest).thenApply(this::toHttpResponse));
} catch (IOException e) {
logger.warn("Error handling job status {}", req, e);
return complete(StatusCodes.BAD_REQUEST, "{\"error\": \"invalid JSON payload to post job status\"}");
}
})))), pathPrefix(API_JOBS, () -> route(post(() -> route(path(KILL_ENDPOINT, () -> decodeRequest(() -> entity(Unmarshaller.entityToString(), req -> {
logger.debug("/api/jobs/kill called {}", req);
try {
final KillJobRequest killJobRequest = Jackson.fromJSON(req, KillJobRequest.class);
return completeWithFuture(jobRouteHandler.kill(killJobRequest).thenApply(resp -> {
if (resp.responseCode == BaseResponse.ResponseCode.SUCCESS) {
return new JobClusterManagerProto.KillJobResponse(resp.requestId, resp.responseCode, resp.getState(), "[\"" + resp.getJobId().getId() + " Killed\"]", resp.getJobId(), resp.getUser());
} else if (resp.responseCode == BaseResponse.ResponseCode.CLIENT_ERROR) {
// for backwards compatibility with old master
return new JobClusterManagerProto.KillJobResponse(resp.requestId, BaseResponse.ResponseCode.SUCCESS, resp.getState(), "[\"" + resp.message + " \"]", resp.getJobId(), resp.getUser());
}
return resp;
}).thenApply(this::toHttpResponse));
} catch (IOException e) {
logger.warn("Error on job kill {}", req, e);
return complete(StatusCodes.BAD_REQUEST, "{\"error\": \"invalid json payload to kill job\"}");
}
}))), path(RESUBMIT_WORKER_ENDPOINT, () -> decodeRequest(() -> entity(Unmarshaller.entityToString(), req -> {
logger.debug("/api/jobs/resubmitWorker called {}", req);
try {
final ResubmitWorkerRequest resubmitWorkerRequest = Jackson.fromJSON(req, ResubmitWorkerRequest.class);
return completeWithFuture(jobRouteHandler.resubmitWorker(resubmitWorkerRequest).thenApply(this::toHttpResponse));
} catch (IOException e) {
logger.warn("Error on worker resubmit {}", req, e);
return complete(StatusCodes.BAD_REQUEST, "{\"error\": \"invalid json payload to resubmit worker\"}");
}
}))), path(SCALE_STAGE_ENDPOINT, () -> decodeRequest(() -> entity(Unmarshaller.entityToString(), req -> {
logger.debug("/api/jobs/scaleStage called {}", req);
try {
ScaleStageRequest scaleStageRequest = Jackson.fromJSON(req, ScaleStageRequest.class);
int numWorkers = scaleStageRequest.getNumWorkers();
int maxWorkersPerStage = ConfigurationProvider.getConfig().getMaxWorkersPerStage();
if (numWorkers > maxWorkersPerStage) {
logger.warn("rejecting ScaleStageRequest {} with invalid num workers", scaleStageRequest);
return complete(StatusCodes.BAD_REQUEST, "{\"error\": \"num workers must be less than " + maxWorkersPerStage + "\"}");
}
return completeWithFuture(jobRouteHandler.scaleStage(scaleStageRequest).thenApply(this::toHttpResponse));
} catch (IOException e) {
logger.warn("Error scaling stage {}", req, e);
return complete(StatusCodes.BAD_REQUEST, "{\"error\": \"invalid json payload to scale stage " + e.getMessage() + "\"}");
}
}))))), get(() -> route(// - optional labels.op query param - default value is 'or' if not specified (other possible value is 'and'
path(segment("list"), () -> {
jobListGET.increment();
return jobListRoute(Optional.empty());
}), path(segment("list").slash("matchinglabels"), () -> {
jobListLabelMatchGET.increment();
return jobListRoute(Optional.empty());
}), path(segment("list").slash(PathMatchers.segment()), (jobId) -> {
logger.debug("/api/jobs/list/{} called", jobId);
jobListJobIdGET.increment();
return completeAsync(jobRouteHandler.getJobDetails(new JobClusterManagerProto.GetJobDetailsRequest("masterAPI", jobId)), resp -> {
Optional<MantisJobMetadataView> mantisJobMetadataView = resp.getJobMetadata().map(metaData -> new MantisJobMetadataView(metaData, Collections.emptyList(), Collections.emptyList(), Collections.emptyList(), Collections.emptyList(), false));
return completeOK(mantisJobMetadataView, Jackson.marshaller());
});
}), path(segment("list").slash("matching").slash(PathMatchers.segment()), (regex) -> {
jobListRegexGET.increment();
return jobListRoute(Optional.ofNullable(regex).filter(r -> !r.isEmpty()));
}), path(segment("archived").slash(PathMatchers.segment()), (jobId) -> parameterOptional(StringUnmarshallers.INTEGER, "limit", (limit) -> {
jobArchivedWorkersGET.increment();
Optional<JobId> jobIdO = JobId.fromId(jobId);
if (jobIdO.isPresent()) {
ListArchivedWorkersRequest req = new ListArchivedWorkersRequest(jobIdO.get(), limit.orElse(DEFAULT_LIST_ARCHIVED_WORKERS_LIMIT));
return alwaysCache(cache, requestUriKeyer, () -> extractUri(uri -> completeAsync(jobRouteHandler.listArchivedWorkers(req), resp -> {
List<MantisWorkerMetadataWritable> workers = resp.getWorkerMetadata().stream().map(wm -> DataFormatAdapter.convertMantisWorkerMetadataToMantisWorkerMetadataWritable(wm)).collect(Collectors.toList());
return completeOK(workers, Jackson.marshaller());
})));
} else {
return complete(StatusCodes.BAD_REQUEST, "error: 'archived/<jobId>' request must include a valid jobId");
}
})), path(segment("archived"), () -> {
jobArchivedWorkersGETInvalid.increment();
return complete(StatusCodes.BAD_REQUEST, "error: 'archived' Request must include jobId");
}))))));
}
use of io.mantisrx.server.master.scheduler.WorkerEvent in project mantis by Netflix.
the class JobClusterTest method testJobKillTriggersSLAToLaunchNew.
@Test
public void testJobKillTriggersSLAToLaunchNew() {
TestKit probe = new TestKit(system);
String clusterName = "testJobKillTriggersSLAToLaunchNew";
MantisScheduler schedulerMock = mock(MantisScheduler.class);
MantisJobStore jobStoreMock = mock(MantisJobStore.class);
SLA sla = new SLA(1, 1, null, null);
final JobClusterDefinitionImpl fakeJobCluster = createFakeJobClusterDefn(clusterName, Lists.newArrayList(), sla);
ActorRef jobClusterActor = system.actorOf(props(clusterName, jobStoreMock, schedulerMock, eventPublisher));
jobClusterActor.tell(new JobClusterProto.InitializeJobClusterRequest(fakeJobCluster, user, probe.getRef()), probe.getRef());
JobClusterProto.InitializeJobClusterResponse createResp = probe.expectMsgClass(JobClusterProto.InitializeJobClusterResponse.class);
assertEquals(SUCCESS, createResp.responseCode);
String jobId = clusterName + "-1";
WorkerId workerId1 = new WorkerId(clusterName, jobId, 0, 1);
doAnswer(invocation -> {
WorkerEvent terminate = new WorkerTerminate(workerId1, WorkerState.Completed, JobCompletedReason.Killed, System.currentTimeMillis());
jobClusterActor.tell(terminate, probe.getRef());
return null;
}).when(schedulerMock).unscheduleWorker(any(), any());
try {
final JobDefinition jobDefn = createJob(clusterName, 1, MantisJobDurationType.Transient);
JobId jId = new JobId(clusterName, 1);
JobTestHelper.submitJobAndVerifySuccess(probe, clusterName, jobClusterActor, jobDefn, jobId);
JobTestHelper.getJobDetailsAndVerify(probe, jobClusterActor, jobId, SUCCESS, JobState.Accepted);
JobTestHelper.sendLaunchedInitiatedStartedEventsToWorker(probe, jobClusterActor, jobId, 1, new WorkerId(clusterName, jobId, 0, 1));
JobTestHelper.getJobDetailsAndVerify(probe, jobClusterActor, jobId, SUCCESS, JobState.Launched);
JobTestHelper.killJobAndVerify(probe, clusterName, jId, jobClusterActor);
Thread.sleep(500);
// a new job should have been submitted
JobTestHelper.getJobDetailsAndVerify(probe, jobClusterActor, clusterName + "-2", SUCCESS, JobState.Accepted);
// JobTestHelper.killJobAndVerify(probe, clusterName, new JobId(clusterName, 2), jobClusterActor);
// verify(jobStoreMock, times(1)).createJobCluster(any());
// verify(jobStoreMock, times(1)).updateJobCluster(any());
} catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
fail();
}
// Mockito.doThrow(IOException.class).when(jobStoreMock).storeNewJob(any());
}
use of io.mantisrx.server.master.scheduler.WorkerEvent in project mantis by Netflix.
the class JobClusterTest method testZombieWorkerKilledOnMessage.
@Test
public void testZombieWorkerKilledOnMessage() {
String clusterName = "testZombieWorkerKilledOnMessage";
TestKit probe = new TestKit(system);
MantisScheduler schedulerMock = mock(MantisScheduler.class);
MantisJobStore jobStoreMock = mock(MantisJobStore.class);
final JobClusterDefinitionImpl fakeJobCluster = createFakeJobClusterDefn(clusterName);
ActorRef jobClusterActor = system.actorOf(props(clusterName, jobStoreMock, schedulerMock, eventPublisher));
jobClusterActor.tell(new JobClusterProto.InitializeJobClusterRequest(fakeJobCluster, user, probe.getRef()), probe.getRef());
JobClusterProto.InitializeJobClusterResponse createResp = probe.expectMsgClass(JobClusterProto.InitializeJobClusterResponse.class);
assertEquals(SUCCESS, createResp.responseCode);
try {
String jobId = clusterName + "-1";
WorkerId workerId = new WorkerId(clusterName, jobId, 0, 1);
WorkerEvent heartBeat2 = new WorkerHeartbeat(new Status(jobId, 1, workerId.getWorkerIndex(), workerId.getWorkerNum(), TYPE.HEARTBEAT, "", MantisJobState.Started, System.currentTimeMillis()));
jobClusterActor.tell(heartBeat2, probe.getRef());
jobClusterActor.tell(new GetJobClusterRequest(clusterName), probe.getRef());
GetJobClusterResponse resp = probe.expectMsgClass(GetJobClusterResponse.class);
assertEquals(clusterName, resp.getJobCluster().get().getName());
verify(schedulerMock, times(1)).unscheduleAndTerminateWorker(workerId, empty());
} catch (Exception e) {
e.printStackTrace();
fail();
}
}
use of io.mantisrx.server.master.scheduler.WorkerEvent in project mantis by Netflix.
the class JobClusterManagerTest method testBootStrapJobClustersAndJobs.
@Test
public void testBootStrapJobClustersAndJobs() {
TestKit probe = new TestKit(system);
JobTestHelper.deleteAllFiles();
MantisJobStore jobStore = new MantisJobStore(new MantisStorageProviderAdapter(new io.mantisrx.server.master.store.SimpleCachedFileStorageProvider(), eventPublisher));
MantisJobStore jobStoreSpied = Mockito.spy(jobStore);
MantisScheduler schedulerMock = mock(MantisScheduler.class);
ActorRef jobClusterManagerActor = system.actorOf(JobClustersManagerActor.props(jobStoreSpied, eventPublisher));
jobClusterManagerActor.tell(new JobClusterManagerProto.JobClustersManagerInitialize(schedulerMock, false), probe.getRef());
JobClustersManagerInitializeResponse iResponse = probe.expectMsgClass(Duration.of(10, ChronoUnit.MINUTES), JobClustersManagerInitializeResponse.class);
List<String> clusterNames = Lists.newArrayList("testBootStrapJobClustersAndJobs1", "testBootStrapJobClustersAndJobs2", "testBootStrapJobClustersAndJobs3");
String clusterWithNoJob = "testBootStrapJobClusterWithNoJob";
createJobClusterAndAssert(jobClusterManagerActor, clusterWithNoJob);
WorkerMigrationConfig migrationConfig = new WorkerMigrationConfig(MigrationStrategyEnum.PERCENTAGE, "{\"percentToMove\":60, \"intervalMs\":30000}");
// Create 3 clusters and submit 1 job each
for (String cluster : clusterNames) {
createJobClusterAndAssert(jobClusterManagerActor, cluster, migrationConfig);
submitJobAndAssert(jobClusterManagerActor, cluster);
if (cluster.equals("testBootStrapJobClustersAndJobs1")) {
// send worker events for job 1 so it goes to started state
String jobId = "testBootStrapJobClustersAndJobs1-1";
WorkerId workerId = new WorkerId(jobId, 0, 1);
WorkerEvent launchedEvent = new WorkerLaunched(workerId, 0, "host1", "vm1", empty(), new WorkerPorts(Lists.newArrayList(8000, 9000, 9010, 9020, 9030)));
jobClusterManagerActor.tell(launchedEvent, probe.getRef());
WorkerEvent startInitEvent = new WorkerStatus(new Status(workerId.getJobId(), 1, workerId.getWorkerIndex(), workerId.getWorkerNum(), TYPE.INFO, "test START_INIT", MantisJobState.StartInitiated));
jobClusterManagerActor.tell(startInitEvent, probe.getRef());
WorkerEvent heartBeat = new WorkerHeartbeat(new Status(jobId, 1, workerId.getWorkerIndex(), workerId.getWorkerNum(), TYPE.HEARTBEAT, "", MantisJobState.Started));
jobClusterManagerActor.tell(heartBeat, probe.getRef());
// get Job status
jobClusterManagerActor.tell(new GetJobDetailsRequest("user", JobId.fromId(jobId).get()), probe.getRef());
GetJobDetailsResponse resp2 = probe.expectMsgClass(GetJobDetailsResponse.class);
// Ensure its launched
assertEquals(SUCCESS, resp2.responseCode);
assertEquals(JobState.Launched, resp2.getJobMetadata().get().getState());
}
}
// kill 1 of the jobs to test archive path
JobClusterManagerProto.KillJobRequest killRequest = new JobClusterManagerProto.KillJobRequest("testBootStrapJobClustersAndJobs2-1", JobCompletedReason.Killed.toString(), "njoshi");
jobClusterManagerActor.tell(killRequest, probe.getRef());
JobClusterManagerProto.KillJobResponse killJobResponse = probe.expectMsgClass(JobClusterManagerProto.KillJobResponse.class);
assertEquals(SUCCESS, killJobResponse.responseCode);
JobTestHelper.sendWorkerTerminatedEvent(probe, jobClusterManagerActor, "testBootStrapJobClustersAndJobs2-1", new WorkerId("testBootStrapJobClustersAndJobs2-1", 0, 1));
try {
Thread.sleep(500);
} catch (InterruptedException e) {
e.printStackTrace();
}
// Stop job cluster Manager Actor
system.stop(jobClusterManagerActor);
// create new instance
jobClusterManagerActor = system.actorOf(JobClustersManagerActor.props(jobStoreSpied, eventPublisher));
// initialize it
jobClusterManagerActor.tell(new JobClusterManagerProto.JobClustersManagerInitialize(schedulerMock, true), probe.getRef());
JobClustersManagerInitializeResponse initializeResponse = probe.expectMsgClass(JobClustersManagerInitializeResponse.class);
// probe.expectMsgClass(Duration.of(10, ChronoUnit.MINUTES),JobClusterManagerProto.JobClustersManagerInitializeResponse.class);
// probe.expectMsgClass(JobClusterManagerProto.JobClustersManagerInitializeResponse.class);
assertEquals(SUCCESS, initializeResponse.responseCode);
// Get Cluster Config
jobClusterManagerActor.tell(new GetJobClusterRequest("testBootStrapJobClustersAndJobs1"), probe.getRef());
GetJobClusterResponse clusterResponse = probe.expectMsgClass(GetJobClusterResponse.class);
assertEquals(SUCCESS, clusterResponse.responseCode);
assertTrue(clusterResponse.getJobCluster().isPresent());
WorkerMigrationConfig mConfig = clusterResponse.getJobCluster().get().getMigrationConfig();
assertEquals(migrationConfig.getStrategy(), mConfig.getStrategy());
assertEquals(migrationConfig.getConfigString(), migrationConfig.getConfigString());
// get Job status
jobClusterManagerActor.tell(new GetJobDetailsRequest("user", JobId.fromId("testBootStrapJobClustersAndJobs1-1").get()), probe.getRef());
GetJobDetailsResponse resp2 = probe.expectMsgClass(GetJobDetailsResponse.class);
// Ensure its launched
System.out.println("Resp2 -> " + resp2.message);
assertEquals(SUCCESS, resp2.responseCode);
assertEquals(JobState.Launched, resp2.getJobMetadata().get().getState());
// 1 jobs should be in completed state
jobClusterManagerActor.tell(new GetJobDetailsRequest("user", JobId.fromId("testBootStrapJobClustersAndJobs2-1").get()), probe.getRef());
resp2 = probe.expectMsgClass(Duration.of(10, ChronoUnit.MINUTES), GetJobDetailsResponse.class);
// Ensure its completed
assertEquals(SUCCESS, resp2.responseCode);
assertEquals(JobState.Completed, resp2.getJobMetadata().get().getState());
jobClusterManagerActor.tell(new GetJobDetailsRequest("user", JobId.fromId("testBootStrapJobClustersAndJobs3-1").get()), probe.getRef());
resp2 = probe.expectMsgClass(Duration.of(10, ChronoUnit.MINUTES), GetJobDetailsResponse.class);
// Ensure its Accepted
assertEquals(SUCCESS, resp2.responseCode);
assertEquals(JobState.Accepted, resp2.getJobMetadata().get().getState());
try {
Optional<JobWorker> workerByIndex = resp2.getJobMetadata().get().getWorkerByIndex(1, 0);
assertTrue(workerByIndex.isPresent());
Optional<IMantisStageMetadata> stageMetadata = resp2.getJobMetadata().get().getStageMetadata(1);
assertTrue(stageMetadata.isPresent());
JobWorker workerByIndex1 = stageMetadata.get().getWorkerByIndex(0);
System.out.println("Got worker by index : " + workerByIndex1);
Optional<JobWorker> worker = resp2.getJobMetadata().get().getWorkerByNumber(1);
assertTrue(worker.isPresent());
} catch (io.mantisrx.server.master.persistence.exceptions.InvalidJobException e) {
e.printStackTrace();
}
jobClusterManagerActor.tell(new GetLastSubmittedJobIdStreamRequest("testBootStrapJobClustersAndJobs1"), probe.getRef());
GetLastSubmittedJobIdStreamResponse lastSubmittedJobIdStreamResponse = probe.expectMsgClass(Duration.of(10, ChronoUnit.MINUTES), GetLastSubmittedJobIdStreamResponse.class);
lastSubmittedJobIdStreamResponse.getjobIdBehaviorSubject().get().take(1).toBlocking().subscribe((jId) -> {
assertEquals(new JobId("testBootStrapJobClustersAndJobs1", 1), jId);
});
jobClusterManagerActor.tell(new GetJobClusterRequest(clusterWithNoJob), probe.getRef());
GetJobClusterResponse jobClusterResponse = probe.expectMsgClass(Duration.of(10, ChronoUnit.MINUTES), GetJobClusterResponse.class);
assertEquals(SUCCESS, jobClusterResponse.responseCode);
assertTrue(jobClusterResponse.getJobCluster().isPresent());
assertEquals(clusterWithNoJob, jobClusterResponse.getJobCluster().get().getName());
// 1 running worker
verify(schedulerMock, timeout(100_1000).times(1)).initializeRunningWorker(any(), any());
// 2 worker schedule requests
verify(schedulerMock, timeout(100_000).times(4)).scheduleWorker(any());
try {
Mockito.verify(jobStoreSpied).loadAllArchivedJobsAsync();
Mockito.verify(jobStoreSpied).loadAllActiveJobs();
Mockito.verify(jobStoreSpied).loadAllCompletedJobs();
Mockito.verify(jobStoreSpied).archiveWorker(any());
Mockito.verify(jobStoreSpied).archiveJob(any());
} catch (IOException e) {
e.printStackTrace();
fail();
}
}
use of io.mantisrx.server.master.scheduler.WorkerEvent in project mantis by Netflix.
the class JobTestHelper method sendWorkerTerminatedEvent.
public static void sendWorkerTerminatedEvent(final TestKit probe, final ActorRef jobActor, String jobId, WorkerId workerId2) {
WorkerEvent workerTerminated = new WorkerTerminate(workerId2, WorkerState.Failed, JobCompletedReason.Lost);
jobActor.tell(workerTerminated, probe.getRef());
}
Aggregations