use of io.mantisrx.server.master.scheduler.MantisScheduler in project mantis by Netflix.
the class WorkerRegistryV2Test method testJobScaleUp.
/**
 * Submits a single-stage scalable job, asks the job actor to scale stage 1 to two workers and
 * verifies both the scale response and the resulting worker bookkeeping.
 *
 * <p>The {@link WorkerRegistryV2} under test is plugged in as the worker-event subscriber of the
 * publisher handed to the job. The test therefore ends by polling the registry (with a short
 * pause between checks) until it reports three running workers, and fails if that count is not
 * reached within the polling budget.
 *
 * @throws Exception if job submission fails or the polling sleep is interrupted
 */
@Test
public void testJobScaleUp() throws Exception, InvalidJobException, io.mantisrx.runtime.command.InvalidJobException {
    WorkerRegistryV2 workerRegistryV2 = new WorkerRegistryV2();
    LifecycleEventPublisher eventPublisher = new LifecycleEventPublisherImpl(new AuditEventSubscriberLoggingImpl(), new StatusEventSubscriberLoggingImpl(), new DummyWorkerEventSubscriberImpl(workerRegistryV2));

    // Scaling strategies attached to the stage's scaling policy.
    Map<StageScalingPolicy.ScalingReason, StageScalingPolicy.Strategy> smap = new HashMap<>();
    smap.put(StageScalingPolicy.ScalingReason.CPU, new StageScalingPolicy.Strategy(StageScalingPolicy.ScalingReason.CPU, 0.5, 0.75, null));
    smap.put(StageScalingPolicy.ScalingReason.DataDrop, new StageScalingPolicy.Strategy(StageScalingPolicy.ScalingReason.DataDrop, 0.0, 2.0, null));
    SchedulingInfo sInfo = new SchedulingInfo.Builder().numberOfStages(1).multiWorkerScalableStageWithConstraints(1, new MachineDefinition(1.0, 1.0, 1.0, 3), Lists.newArrayList(), Lists.newArrayList(), new StageScalingPolicy(1, 0, 10, 1, 1, 0, smap)).build();

    String clusterName = "testJobScaleUp";
    MantisScheduler schedulerMock = mock(MantisScheduler.class);
    MantisJobStore jobStoreMock = mock(MantisJobStore.class);
    ActorRef jobActor = JobTestHelper.submitSingleStageScalableJob(system, probe, clusterName, sInfo, schedulerMock, jobStoreMock, eventPublisher);
    assertEquals(2, workerRegistryV2.getNumRunningWorkers());

    // send scale up request
    jobActor.tell(new JobClusterManagerProto.ScaleStageRequest(clusterName + "-1", 1, 2, "", ""), probe.getRef());
    JobClusterManagerProto.ScaleStageResponse scaleResp = probe.expectMsgClass(JobClusterManagerProto.ScaleStageResponse.class);
    System.out.println("ScaleupResp " + scaleResp.message);
    assertEquals(SUCCESS, scaleResp.responseCode);
    assertEquals(2, scaleResp.getActualNumWorkers());

    // Drive the newly added worker through its launched/initiated/started events.
    JobTestHelper.sendLaunchedInitiatedStartedEventsToWorker(probe, jobActor, clusterName + "-1", 0, new WorkerId(clusterName + "-1", 1, 3));

    jobActor.tell(new JobClusterManagerProto.GetJobDetailsRequest("user", new JobId(clusterName, 1)), probe.getRef());
    JobClusterManagerProto.GetJobDetailsResponse resp = probe.expectMsgClass(JobClusterManagerProto.GetJobDetailsResponse.class);
    Map<Integer, ? extends IMantisStageMetadata> stageMetadata = resp.getJobMetadata().get().getStageMetadata();
    assertEquals(2, stageMetadata.get(1).getAllWorkers().size());

    // Poll with a pause between checks: a tight loop without any delay completes almost
    // instantly and gives the registry no real chance to catch up.
    final int expectedRunningWorkers = 3;
    final int maxAttempts = 50;
    final long pauseMillis = 100L;
    for (int attempt = 0; attempt < maxAttempts && workerRegistryV2.getNumRunningWorkers() != expectedRunningWorkers; attempt++) {
        Thread.sleep(pauseMillis);
    }
    // Assert on the observed state itself so a failure reports the actual worker count.
    assertEquals(expectedRunningWorkers, workerRegistryV2.getNumRunningWorkers());
}
use of io.mantisrx.server.master.scheduler.MantisScheduler in project mantis by Netflix.
the class JobClustersManagerActor method initialize.
/**
 * Handles the initialize message for this actor.
 *
 * <p>Creates and watches the job-list helper actor, records the scheduler carried by the message
 * and builds the {@code JobClusterInfoManager}. When the message does not ask for jobs to be
 * loaded from the store the actor switches to its initialized behavior right away. Otherwise job
 * clusters, active jobs and completed jobs are read from the job store, one cluster actor is
 * created and initialized per stored cluster (blocking until all have answered), the periodic
 * reconcile timer is started and the archived-job load is kicked off.
 *
 * <p>The original sender always receives a {@code JobClustersManagerInitializeResponse}.
 *
 * @param initMsg the initialize request; supplies the scheduler, the request id and the
 *                load-from-store flag
 */
private void initialize(JobClustersManagerInitialize initMsg) {
    // Capture the sender once, up front; every reply below goes to this reference.
    final ActorRef replyTo = getSender();
    try {
        logger.info("In JobClustersManagerActor:initialize");
        this.jobListHelperActor = getContext().actorOf(JobListHelperActor.props(), "JobListHelperActor");
        getContext().watch(jobListHelperActor);
        mantisScheduler = initMsg.getScheduler();
        this.jobClusterInfoManager = new JobClusterInfoManager(jobStore, mantisScheduler, eventPublisher);

        if (!initMsg.isLoadJobsFromStore()) {
            // Nothing to restore: become initialized immediately.
            getContext().become(initializedBehavior);
            replyTo.tell(new JobClustersManagerInitializeResponse(initMsg.requestId, SUCCESS, "JobClustersManager successfully inited"), getSelf());
        } else {
            List<IJobClusterMetadata> storedClusters = jobStore.loadAllJobClusters();
            logger.info("Read {} job clusters from storage", storedClusters.size());
            List<IMantisJobMetadata> storedActiveJobs = jobStore.loadAllActiveJobs();
            logger.info("Read {} jobs from storage", storedActiveJobs.size());
            List<CompletedJob> storedCompletedJobs = jobStore.loadAllCompletedJobs();
            logger.info("Read {} completed jobs from storage", storedCompletedJobs.size());

            // Index the cluster metadata by cluster name.
            Map<String, IJobClusterMetadata> clustersByName = new HashMap<>();
            for (IJobClusterMetadata storedCluster : storedClusters) {
                clustersByName.put(storedCluster.getJobClusterDefinition().getName(), storedCluster);
            }

            // Bucket active and completed jobs under the cluster they belong to.
            Map<String, List<IMantisJobMetadata>> activeJobsByCluster = new HashMap<>();
            for (IMantisJobMetadata activeJob : storedActiveJobs) {
                activeJobsByCluster.computeIfAbsent(activeJob.getClusterName(), k -> new ArrayList<>()).add(activeJob);
            }
            Map<String, List<CompletedJob>> completedJobsByCluster = new HashMap<>();
            for (CompletedJob completedJob : storedCompletedJobs) {
                completedJobsByCluster.computeIfAbsent(completedJob.getName(), k -> new ArrayList<>()).add(completedJob);
            }

            // Per-cluster init timeout: the configured master init timeout minus 60 seconds,
            // unless that would not leave a positive value.
            final long masterInitTimeoutSecs = ConfigurationProvider.getConfig().getMasterInitTimeoutSecs();
            final long reducedTimeoutSecs = masterInitTimeoutSecs - 60;
            final long timeout = reducedTimeoutSecs > 0 ? reducedTimeoutSecs : masterInitTimeoutSecs;

            Observable.from(clustersByName.values())
                    .filter(Objects::nonNull)
                    .filter(storedCluster -> storedCluster.getJobClusterDefinition() != null)
                    .flatMap(storedCluster -> {
                        String storedClusterName = storedCluster.getJobClusterDefinition().getName();
                        Optional<JobClusterInfo> registration = jobClusterInfoManager.createClusterActorAndRegister(storedCluster.getJobClusterDefinition());
                        if (!registration.isPresent()) {
                            logger.info("skipping job cluster {} on bootstrap as actor creating failed", storedClusterName);
                            return Observable.empty();
                        }

                        // Copy this cluster's jobs into fresh lists; a cluster without jobs gets empty lists.
                        List<IMantisJobMetadata> activeForCluster = activeJobsByCluster.get(storedClusterName);
                        List<IMantisJobMetadata> jobList = activeForCluster == null ? new ArrayList<>() : new ArrayList<>(activeForCluster);
                        List<CompletedJob> completedForCluster = completedJobsByCluster.get(storedClusterName);
                        List<CompletedJob> completedJobsList = completedForCluster == null ? new ArrayList<>() : new ArrayList<>(completedForCluster);

                        JobClusterProto.InitializeJobClusterRequest initRequest = new JobClusterProto.InitializeJobClusterRequest(
                                (JobClusterDefinitionImpl) storedCluster.getJobClusterDefinition(),
                                storedCluster.isDisabled(),
                                storedCluster.getLastJobCount(),
                                jobList,
                                completedJobsList,
                                "system",
                                getSelf(),
                                false);
                        return jobClusterInfoManager.initializeCluster(registration.get(), initRequest, Duration.ofSeconds(timeout));
                    })
                    .filter(Objects::nonNull)
                    .toBlocking()
                    .subscribe(
                            initResponse -> {
                                logger.info("JobCluster {} inited with code {}", initResponse.jobClusterName, initResponse.responseCode);
                                numJobClusterInitSuccesses.increment();
                            },
                            failure -> {
                                logger.warn("Exception initializing clusters {}", failure.getMessage(), failure);
                                logger.error("JobClusterManagerActor had errors during initialization NOT transitioning to initialized behavior");
                                // The behavior is intentionally left unchanged on failure.
                                replyTo.tell(new JobClustersManagerInitializeResponse(initMsg.requestId, SERVER_ERROR, "JobClustersManager inited with errors"), getSelf());
                            },
                            () -> {
                                logger.info("JobClusterManagerActor transitioning to initialized behavior");
                                getContext().become(initializedBehavior);
                                replyTo.tell(new JobClustersManagerInitializeResponse(initMsg.requestId, SUCCESS, "JobClustersManager successfully inited"), getSelf());
                            });

            getTimers().startPeriodicTimer(CHECK_CLUSTERS_TIMER_KEY, new ReconcileJobCluster(), Duration.ofSeconds(checkAgainInSecs));

            // Archived jobs are loaded in the background via the store's async entry point.
            logger.info("Kicking off archived job load asynchronously");
            jobStore.loadAllArchivedJobsAsync();
        }
    } catch (Exception e) {
        logger.error("caught exception", e);
        replyTo.tell(new JobClustersManagerInitializeResponse(initMsg.requestId, SERVER_ERROR, e.getMessage()), getSelf());
    }
    logger.info("JobClustersManagerActor:initialize ends");
}
use of io.mantisrx.server.master.scheduler.MantisScheduler in project mantis by Netflix.
the class JobClusterTest method testJobKillTriggersSLAToLaunchNew.
/**
 * Creates a job cluster with an {@code SLA(1, 1, null, null)}, submits and starts one job,
 * kills it, and then expects a second job ({@code <cluster>-2}) to exist in the Accepted state.
 *
 * <p>The scheduler mock is stubbed so that every {@code unscheduleWorker} call sends a
 * {@code WorkerTerminate} event for the first job's worker back to the cluster actor.
 */
@Test
public void testJobKillTriggersSLAToLaunchNew() {
    final String clusterName = "testJobKillTriggersSLAToLaunchNew";
    final String jobId = clusterName + "-1";
    final TestKit probe = new TestKit(system);
    final MantisScheduler schedulerMock = mock(MantisScheduler.class);
    final MantisJobStore jobStoreMock = mock(MantisJobStore.class);

    final JobClusterDefinitionImpl fakeJobCluster =
            createFakeJobClusterDefn(clusterName, Lists.newArrayList(), new SLA(1, 1, null, null));
    final ActorRef jobClusterActor = system.actorOf(props(clusterName, jobStoreMock, schedulerMock, eventPublisher));
    jobClusterActor.tell(new JobClusterProto.InitializeJobClusterRequest(fakeJobCluster, user, probe.getRef()), probe.getRef());
    JobClusterProto.InitializeJobClusterResponse initResponse =
            probe.expectMsgClass(JobClusterProto.InitializeJobClusterResponse.class);
    assertEquals(SUCCESS, initResponse.responseCode);

    // Worker index 0, worker number 1 of the first job.
    final WorkerId firstWorker = new WorkerId(clusterName, jobId, 0, 1);

    // Whenever the scheduler is asked to unschedule a worker, report that worker as killed.
    doAnswer(invocation -> {
        jobClusterActor.tell(
                new WorkerTerminate(firstWorker, WorkerState.Completed, JobCompletedReason.Killed, System.currentTimeMillis()),
                probe.getRef());
        return null;
    }).when(schedulerMock).unscheduleWorker(any(), any());

    try {
        final JobDefinition jobDefn = createJob(clusterName, 1, MantisJobDurationType.Transient);

        JobTestHelper.submitJobAndVerifySuccess(probe, clusterName, jobClusterActor, jobDefn, jobId);
        JobTestHelper.getJobDetailsAndVerify(probe, jobClusterActor, jobId, SUCCESS, JobState.Accepted);

        JobTestHelper.sendLaunchedInitiatedStartedEventsToWorker(probe, jobClusterActor, jobId, 1, firstWorker);
        JobTestHelper.getJobDetailsAndVerify(probe, jobClusterActor, jobId, SUCCESS, JobState.Launched);

        JobTestHelper.killJobAndVerify(probe, clusterName, new JobId(clusterName, 1), jobClusterActor);

        // Give the cluster actor time to react to the kill before querying for the replacement.
        Thread.sleep(500);

        // a new job should have been submitted
        JobTestHelper.getJobDetailsAndVerify(probe, jobClusterActor, clusterName + "-2", SUCCESS, JobState.Accepted);
    } catch (Exception e) {
        e.printStackTrace();
        fail();
    }
}
use of io.mantisrx.server.master.scheduler.MantisScheduler in project mantis by Netflix.
the class JobClusterTest method testZombieWorkerKilledOnMessage.
/**
 * Sends a heartbeat for a worker of a job that was never submitted to the cluster and verifies
 * that the scheduler is asked exactly once to unschedule and terminate that worker.
 *
 * <p>A {@code GetJobClusterRequest} round trip is performed after the heartbeat and before the
 * verification.
 */
@Test
public void testZombieWorkerKilledOnMessage() {
    final String clusterName = "testZombieWorkerKilledOnMessage";
    final TestKit probe = new TestKit(system);
    final MantisScheduler schedulerMock = mock(MantisScheduler.class);
    final MantisJobStore jobStoreMock = mock(MantisJobStore.class);

    final JobClusterDefinitionImpl fakeJobCluster = createFakeJobClusterDefn(clusterName);
    final ActorRef jobClusterActor = system.actorOf(props(clusterName, jobStoreMock, schedulerMock, eventPublisher));
    jobClusterActor.tell(new JobClusterProto.InitializeJobClusterRequest(fakeJobCluster, user, probe.getRef()), probe.getRef());
    JobClusterProto.InitializeJobClusterResponse initResponse =
            probe.expectMsgClass(JobClusterProto.InitializeJobClusterResponse.class);
    assertEquals(SUCCESS, initResponse.responseCode);

    try {
        final String jobId = clusterName + "-1";
        final WorkerId zombieWorker = new WorkerId(clusterName, jobId, 0, 1);

        // Heartbeat status for the worker, reported as Started.
        Status heartbeatStatus = new Status(
                jobId,
                1,
                zombieWorker.getWorkerIndex(),
                zombieWorker.getWorkerNum(),
                TYPE.HEARTBEAT,
                "",
                MantisJobState.Started,
                System.currentTimeMillis());
        WorkerEvent zombieHeartbeat = new WorkerHeartbeat(heartbeatStatus);
        jobClusterActor.tell(zombieHeartbeat, probe.getRef());

        // Query the cluster afterwards and wait for the reply.
        jobClusterActor.tell(new GetJobClusterRequest(clusterName), probe.getRef());
        GetJobClusterResponse clusterResponse = probe.expectMsgClass(GetJobClusterResponse.class);
        assertEquals(clusterName, clusterResponse.getJobCluster().get().getName());

        verify(schedulerMock, times(1)).unscheduleAndTerminateWorker(zombieWorker, empty());
    } catch (Exception e) {
        e.printStackTrace();
        fail();
    }
}
use of io.mantisrx.server.master.scheduler.MantisScheduler in project mantis by Netflix.
the class JobClusterTest method testJobSubmitWithNoSchedInfoUsesJobClusterValues.
/**
 * Submits two jobs to a cluster that carries one label and checks the labels each job ends up
 * with.
 *
 * <p>The first job is submitted without labels: it must carry the cluster label plus the
 * mandatory labels. The second job is submitted with its own label: the test checks the total
 * label count and that the job's own label is present exactly once.
 */
@Test
public void testJobSubmitWithNoSchedInfoUsesJobClusterValues() {
    final String clusterName = "testJobSubmitWithNoSchedInfoUsesJobClusterValues";
    final TestKit probe = new TestKit(system);
    final MantisScheduler schedulerMock = mock(MantisScheduler.class);
    final MantisJobStore jobStoreMock = mock(MantisJobStore.class);

    final List<Label> clusterLabels = new ArrayList<>();
    clusterLabels.add(new Label("clabelName", "cLabelValue"));

    final JobClusterDefinitionImpl fakeJobCluster = createFakeJobClusterDefn(clusterName, clusterLabels);
    final ActorRef jobClusterActor = system.actorOf(props(clusterName, jobStoreMock, schedulerMock, eventPublisher));
    jobClusterActor.tell(new JobClusterProto.InitializeJobClusterRequest(fakeJobCluster, user, probe.getRef()), probe.getRef());
    JobClusterProto.InitializeJobClusterResponse initResponse =
            probe.expectMsgClass(JobClusterProto.InitializeJobClusterResponse.class);
    assertEquals(SUCCESS, initResponse.responseCode);

    try {
        // --- First job: no labels of its own ---
        final String firstJobId = clusterName + "-1";
        final JobDefinition unlabeledJob = new JobDefinition.Builder()
                .withName(clusterName)
                .withVersion("0.0.1")
                .withSubscriptionTimeoutSecs(0)
                .withUser("njoshi")
                .build();
        JobTestHelper.submitJobAndVerifySuccess(probe, clusterName, jobClusterActor, unlabeledJob, firstJobId);

        jobClusterActor.tell(new GetJobDetailsRequest("nj", JobId.fromId(firstJobId).get()), probe.getRef());
        GetJobDetailsResponse firstDetails = probe.expectMsgClass(GetJobDetailsResponse.class);
        assertEquals(SUCCESS, firstDetails.responseCode);
        assertEquals(JobState.Accepted, firstDetails.getJobMetadata().get().getState());

        List<Label> firstJobLabels = firstDetails.getJobMetadata().get().getLabels();
        assertEquals(clusterLabels.size() + LabelManager.numberOfMandatoryLabels(), firstJobLabels.size());
        // confirm that the cluster's label got inherited
        long inheritedCount = firstJobLabels.stream().filter(l -> l.getName().equals("clabelName")).count();
        assertEquals(1, inheritedCount);

        // --- Second job: brings its own label, so it should not inherit cluster labels ---
        final Label jobLabel = new Label("jobLabel", "jobValue");
        final List<Label> jobLabelList = new ArrayList<>();
        jobLabelList.add(jobLabel);
        final String secondJobId = clusterName + "-2";
        final JobDefinition labeledJob = new JobDefinition.Builder()
                .withName(clusterName)
                .withVersion("0.0.1")
                .withLabels(jobLabelList)
                .withSubscriptionTimeoutSecs(0)
                .withUser("njoshi")
                .build();
        JobTestHelper.submitJobAndVerifySuccess(probe, clusterName, jobClusterActor, labeledJob, secondJobId);

        jobClusterActor.tell(new GetJobDetailsRequest("nj", JobId.fromId(secondJobId).get()), probe.getRef());
        GetJobDetailsResponse secondDetails = probe.expectMsgClass(GetJobDetailsResponse.class);
        assertEquals(SUCCESS, secondDetails.responseCode);
        assertEquals(JobState.Accepted, secondDetails.getJobMetadata().get().getState());

        List<Label> secondJobLabels = secondDetails.getJobMetadata().get().getLabels();
        // NOTE(review): the expected size is written in terms of clusterLabels.size() + 2;
        // presumably this equals the job's own label plus the mandatory labels - confirm.
        assertEquals(clusterLabels.size() + 2, secondJobLabels.size());
        // confirm that the job's own label is present exactly once
        long ownLabelCount = secondJobLabels.stream().filter(l -> l.getName().equals(jobLabel.getName())).count();
        assertEquals(1, ownLabelCount);
    } catch (Exception e) {
        e.printStackTrace();
        fail();
    }
}
Aggregations