Use of io.mantisrx.runtime.descriptor.StageScalingPolicy in project mantis by Netflix.
The class WorkerRegistryV2Test, method testJobShutdown:
@Test
public void testJobShutdown() {
    WorkerRegistryV2 workerRegistryV2 = new WorkerRegistryV2();
    LifecycleEventPublisher eventPublisher = new LifecycleEventPublisherImpl(
            new AuditEventSubscriberLoggingImpl(),
            new StatusEventSubscriberLoggingImpl(),
            new DummyWorkerEventSubscriberImpl(workerRegistryV2));
    Map<StageScalingPolicy.ScalingReason, StageScalingPolicy.Strategy> smap = new HashMap<>();
    smap.put(StageScalingPolicy.ScalingReason.CPU,
            new StageScalingPolicy.Strategy(StageScalingPolicy.ScalingReason.CPU, 0.5, 0.75, null));
    smap.put(StageScalingPolicy.ScalingReason.DataDrop,
            new StageScalingPolicy.Strategy(StageScalingPolicy.ScalingReason.DataDrop, 0.0, 2.0, null));
    SchedulingInfo sInfo = new SchedulingInfo.Builder()
            .numberOfStages(1)
            .multiWorkerScalableStageWithConstraints(1,
                    new MachineDefinition(1.0, 1.0, 1.0, 3),
                    Lists.newArrayList(), Lists.newArrayList(),
                    new StageScalingPolicy(1, 0, 10, 1, 1, 0, smap))
            .build();
    String clusterName = "testJobShutdown";
    MantisScheduler schedulerMock = mock(MantisScheduler.class);
    MantisJobStore jobStoreMock = mock(MantisJobStore.class);
    try {
        ActorRef jobActor = JobTestHelper.submitSingleStageScalableJob(
                system, probe, clusterName, sInfo, schedulerMock, jobStoreMock, eventPublisher);
        assertEquals(2, workerRegistryV2.getNumRunningWorkers());
        jobActor.tell(new JobClusterProto.KillJobRequest(
                new JobId(clusterName, 1), "test reason", JobCompletedReason.Normal, "nj", probe.getRef()),
                probe.getRef());
        probe.expectMsgClass(JobClusterProto.KillJobResponse.class);
        Thread.sleep(1000);
        int cnt = 0;
        for (int i = 0; i < 100; i++) {
            cnt++;
            if (workerRegistryV2.getNumRunningWorkers() == 0) {
                break;
            }
        }
        assertTrue(cnt < 100);
        // assertEquals(0, WorkerRegistryV2.INSTANCE.getNumRunningWorkers());
    } catch (Exception e) {
        e.printStackTrace();
    }
}
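All five examples on this page build their policies through the same seven-argument StageScalingPolicy constructor. As a reference, here is a minimal sketch that isolates just that construction; the role of each argument (stage number, min, max, increment, decrement, cooldown seconds, strategy map) is inferred from the variable names in the autoscaler tests below, so treat the comments as an informed reading rather than authoritative documentation.

import java.util.Collections;
import java.util.Map;

import io.mantisrx.runtime.descriptor.StageScalingPolicy;

public class StageScalingPolicySketch {
    // Build a policy for stage 1 that scales between 0 and 10 workers,
    // one worker at a time, with no cooldown, driven by a CPU strategy.
    // Argument roles are inferred from the tests on this page.
    public static StageScalingPolicy cpuPolicy() {
        Map<StageScalingPolicy.ScalingReason, StageScalingPolicy.Strategy> strategies =
                Collections.singletonMap(
                        StageScalingPolicy.ScalingReason.CPU,
                        // reason, scale-down threshold, scale-up threshold, rolling count (null here, as above)
                        new StageScalingPolicy.Strategy(StageScalingPolicy.ScalingReason.CPU, 0.5, 0.75, null));
        return new StageScalingPolicy(
                1,           // stage number
                0,           // min workers
                10,          // max workers
                1,           // increment per scale-up
                1,           // decrement per scale-down
                0,           // cooldown in seconds
                strategies); // per-reason strategies
    }
}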
Use of io.mantisrx.runtime.descriptor.StageScalingPolicy in project mantis by Netflix.
The class JobAutoScalerTest, method testScaleDown:
@Test
public void testScaleDown() throws InterruptedException {
    final String jobId = "test-job-1";
    final int coolDownSec = 2;
    final int scalingStageNum = 1;
    final MantisMasterClientApi mockMasterClientApi = mock(MantisMasterClientApi.class);
    final Map<Integer, StageSchedulingInfo> schedulingInfoMap = new HashMap<>();
    final int numStage1Workers = 2;
    final int increment = 1;
    final int decrement = 1;
    final int min = 1;
    final int max = 5;
    final double scaleUpAbovePct = 45.0;
    final double scaleDownBelowPct = 15.0;
    final double workerMemoryMB = 512.0;
    final StageSchedulingInfo stage1SchedInfo = StageSchedulingInfo.builder()
            .numberOfInstances(numStage1Workers)
            .machineDefinition(new MachineDefinition(2, workerMemoryMB, 200, 1024, 2))
            .scalingPolicy(new StageScalingPolicy(scalingStageNum, min, max, increment, decrement, coolDownSec,
                    Collections.singletonMap(StageScalingPolicy.ScalingReason.Memory,
                            new StageScalingPolicy.Strategy(StageScalingPolicy.ScalingReason.Memory,
                                    scaleDownBelowPct, scaleUpAbovePct,
                                    new StageScalingPolicy.RollingCount(1, 2)))))
            .scalable(true)
            .build();
    schedulingInfoMap.put(scalingStageNum, stage1SchedInfo);
    when(mockMasterClientApi.scaleJobStage(eq(jobId), eq(scalingStageNum), eq(numStage1Workers - decrement), anyString()))
            .thenReturn(Observable.just(true));
    Context context = mock(Context.class);
    when(context.getWorkerMapObservable()).thenReturn(Observable.empty());
    final JobAutoScaler jobAutoScaler = new JobAutoScaler(jobId, new SchedulingInfo(schedulingInfoMap), mockMasterClientApi, context);
    jobAutoScaler.start();
    final Observer<JobAutoScaler.Event> jobAutoScalerObserver = jobAutoScaler.getObserver();
    // should trigger a scale down (below 15% scaleDown threshold)
    jobAutoScalerObserver.onNext(new JobAutoScaler.Event(
            StageScalingPolicy.ScalingReason.Memory, scalingStageNum,
            workerMemoryMB * (scaleDownBelowPct / 100.0 - 0.01), numStage1Workers, ""));
    verify(mockMasterClientApi, timeout(1000).times(1)).scaleJobStage(jobId, scalingStageNum, numStage1Workers - decrement,
            String.format("Memory with value %1$,.2f is below scaleDown threshold of %2$,.1f",
                    (scaleDownBelowPct / 100.0 - 0.01) * 100.0, scaleDownBelowPct));
    // should *not* trigger a scale down before cooldown period (below 15% scaleDown threshold)
    jobAutoScalerObserver.onNext(new JobAutoScaler.Event(
            StageScalingPolicy.ScalingReason.Memory, scalingStageNum,
            workerMemoryMB * (scaleDownBelowPct / 100.0 - 0.01), numStage1Workers - decrement, ""));
    jobAutoScalerObserver.onNext(new JobAutoScaler.Event(
            StageScalingPolicy.ScalingReason.Memory, scalingStageNum,
            workerMemoryMB * (scaleDownBelowPct / 100.0 - 0.01), numStage1Workers - decrement, ""));
    Thread.sleep(coolDownSec * 1000);
    if (numStage1Workers - decrement == min) {
        // should not trigger a scale down after cooldown period if numWorkers=min (below 15% scaleDown threshold)
        jobAutoScalerObserver.onNext(new JobAutoScaler.Event(
                StageScalingPolicy.ScalingReason.Memory, scalingStageNum,
                workerMemoryMB * (scaleDownBelowPct / 100.0 - 0.01), numStage1Workers - decrement, ""));
        verifyNoMoreInteractions(mockMasterClientApi);
    }
}
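Note that the event value fed to the autoscaler above is an absolute memory reading in MB, while the expected scale-down message reports it as a percentage of the worker's 512 MB allocation. A quick worked check of the numbers used in this test (plain arithmetic, not the autoscaler's actual code path):

public class ScaleDownThresholdCheck {
    public static void main(String[] args) {
        double workerMemoryMB = 512.0;
        double scaleDownBelowPct = 15.0;
        // Event value used in testScaleDown: just below the threshold.
        double eventValueMB = workerMemoryMB * (scaleDownBelowPct / 100.0 - 0.01); // 71.68 MB
        double usagePct = eventValueMB / workerMemoryMB * 100.0;                   // 14.0 %
        // Matches the expected message "Memory with value 14.00 is below scaleDown threshold of 15.0"
        System.out.printf("usage=%.2f%%, threshold=%.1f%%, scale down=%b%n",
                usagePct, scaleDownBelowPct, usagePct < scaleDownBelowPct);
    }
}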
Use of io.mantisrx.runtime.descriptor.StageScalingPolicy in project mantis by Netflix.
The class JobAutoScalerTest, method testScaleUp:
@Test
public void testScaleUp() throws InterruptedException {
    final String jobId = "test-job-1";
    final int coolDownSec = 2;
    final int scalingStageNum = 1;
    final MantisMasterClientApi mockMasterClientApi = mock(MantisMasterClientApi.class);
    final Map<Integer, StageSchedulingInfo> schedulingInfoMap = new HashMap<>();
    final int numStage1Workers = 1;
    final int increment = 1;
    final int decrement = 1;
    final int min = 1;
    final int max = 5;
    final double scaleUpAbovePct = 45.0;
    final double scaleDownBelowPct = 15.0;
    final double workerMemoryMB = 512.0;
    final StageSchedulingInfo stage1SchedInfo = StageSchedulingInfo.builder()
            .numberOfInstances(numStage1Workers)
            .machineDefinition(new MachineDefinition(2, workerMemoryMB, 200, 1024, 2))
            .scalingPolicy(new StageScalingPolicy(scalingStageNum, min, max, increment, decrement, coolDownSec,
                    Collections.singletonMap(StageScalingPolicy.ScalingReason.Memory,
                            new StageScalingPolicy.Strategy(StageScalingPolicy.ScalingReason.Memory,
                                    scaleDownBelowPct, scaleUpAbovePct,
                                    new StageScalingPolicy.RollingCount(1, 2)))))
            .scalable(true)
            .build();
    schedulingInfoMap.put(scalingStageNum, stage1SchedInfo);
    when(mockMasterClientApi.scaleJobStage(eq(jobId), eq(scalingStageNum), eq(numStage1Workers + increment), anyString()))
            .thenReturn(Observable.just(true));
    Context context = mock(Context.class);
    when(context.getWorkerMapObservable()).thenReturn(Observable.empty());
    final JobAutoScaler jobAutoScaler = new JobAutoScaler(jobId, new SchedulingInfo(schedulingInfoMap), mockMasterClientApi, context);
    jobAutoScaler.start();
    final Observer<JobAutoScaler.Event> jobAutoScalerObserver = jobAutoScaler.getObserver();
    // should trigger a scale up (above 45% scaleUp threshold)
    jobAutoScalerObserver.onNext(new JobAutoScaler.Event(
            StageScalingPolicy.ScalingReason.Memory, scalingStageNum,
            workerMemoryMB * (scaleUpAbovePct / 100.0 + 0.01), numStage1Workers, ""));
    verify(mockMasterClientApi, timeout(1000).times(1)).scaleJobStage(jobId, scalingStageNum, numStage1Workers + increment,
            String.format("Memory with value %1$,.2f exceeded scaleUp threshold of 45.0",
                    (scaleUpAbovePct / 100.0 + 0.01) * 100.0));
    // should *not* trigger a scale up before cooldown period (above 45% scaleUp threshold)
    jobAutoScalerObserver.onNext(new JobAutoScaler.Event(
            StageScalingPolicy.ScalingReason.Memory, scalingStageNum,
            workerMemoryMB * (scaleUpAbovePct / 100.0 + 0.01), numStage1Workers + increment, ""));
    jobAutoScalerObserver.onNext(new JobAutoScaler.Event(
            StageScalingPolicy.ScalingReason.Memory, scalingStageNum,
            workerMemoryMB * (scaleUpAbovePct / 100.0 + 0.01), numStage1Workers + increment, ""));
    Thread.sleep(coolDownSec * 1000);
    // Keep resending the autoscale event until the scaleJobStage request reaches the master:
    // there is a possible race between the test's cooldown sleep and the event being processed
    // before the cooldown expires.
    final CountDownLatch retryLatch = new CountDownLatch(1);
    when(mockMasterClientApi.scaleJobStage(eq(jobId), eq(scalingStageNum), eq(numStage1Workers + 2 * increment), anyString()))
            .thenAnswer(new Answer<Observable<Void>>() {
                @Override
                public Observable<Void> answer(InvocationOnMock invocation) throws Throwable {
                    retryLatch.countDown();
                    return Observable.just(null);
                }
            });
    do {
        logger.info("sending Job auto scale Event");
        // should trigger a scale up after cooldown period (above 45% scaleUp threshold)
        jobAutoScalerObserver.onNext(new JobAutoScaler.Event(
                StageScalingPolicy.ScalingReason.Memory, scalingStageNum,
                workerMemoryMB * (scaleUpAbovePct / 100.0 + 0.01), numStage1Workers + increment, ""));
    } while (!retryLatch.await(1, TimeUnit.SECONDS));
    verify(mockMasterClientApi, timeout(1000).times(1)).scaleJobStage(jobId, scalingStageNum, numStage1Workers + 2 * increment,
            String.format("Memory with value %1$,.2f exceeded scaleUp threshold of 45.0",
                    (scaleUpAbovePct / 100.0 + 0.01) * 100.0));
}
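Both autoscaler tests construct their single strategy the same way; the sketch below isolates that construction. The threshold arguments are percentages of the worker's memory allocation, matching the expected messages above. Reading RollingCount(1, 2) as "at least 1 qualifying event within a window of 2" is an assumption based only on the constructor arguments, not on documented semantics.

import io.mantisrx.runtime.descriptor.StageScalingPolicy;

public class MemoryStrategySketch {
    public static StageScalingPolicy.Strategy memoryStrategy() {
        double scaleDownBelowPct = 15.0;  // scale down when memory usage drops below 15%
        double scaleUpAbovePct = 45.0;    // scale up when memory usage rises above 45%
        return new StageScalingPolicy.Strategy(
                StageScalingPolicy.ScalingReason.Memory,
                scaleDownBelowPct,
                scaleUpAbovePct,
                // Assumed meaning: 1 qualifying event within a rolling window of 2.
                new StageScalingPolicy.RollingCount(1, 2));
    }
}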
Use of io.mantisrx.runtime.descriptor.StageScalingPolicy in project mantis by Netflix.
The class JobClusterTest, method testScaleStage:
@Test
public void testScaleStage() {
    TestKit probe = new TestKit(system);
    try {
        String clusterName = "testScaleStage";
        MantisScheduler schedulerMock = mock(MantisScheduler.class);
        MantisJobStore jobStoreMock = mock(MantisJobStore.class);
        final JobClusterDefinitionImpl fakeJobCluster = createFakeJobClusterDefn(clusterName);
        ActorRef jobClusterActor = system.actorOf(props(clusterName, jobStoreMock, schedulerMock, eventPublisher));
        jobClusterActor.tell(new JobClusterProto.InitializeJobClusterRequest(fakeJobCluster, user, probe.getRef()), probe.getRef());
        JobClusterProto.InitializeJobClusterResponse createResp = probe.expectMsgClass(JobClusterProto.InitializeJobClusterResponse.class);
        assertEquals(SUCCESS, createResp.responseCode);
        Map<StageScalingPolicy.ScalingReason, StageScalingPolicy.Strategy> smap = new HashMap<>();
        smap.put(StageScalingPolicy.ScalingReason.CPU,
                new StageScalingPolicy.Strategy(StageScalingPolicy.ScalingReason.CPU, 0.5, 0.75, null));
        smap.put(StageScalingPolicy.ScalingReason.DataDrop,
                new StageScalingPolicy.Strategy(StageScalingPolicy.ScalingReason.DataDrop, 0.0, 2.0, null));
        SchedulingInfo SINGLE_WORKER_SCHED_INFO = new SchedulingInfo.Builder()
                .numberOfStages(1)
                .multiWorkerScalableStageWithConstraints(1, DEFAULT_MACHINE_DEFINITION,
                        Lists.newArrayList(), Lists.newArrayList(),
                        new StageScalingPolicy(1, 1, 10, 1, 1, 1, smap))
                .build();
        final JobDefinition jobDefn = createJob(clusterName, 1, MantisJobDurationType.Transient, "USER_TYPE", SINGLE_WORKER_SCHED_INFO, Lists.newArrayList());
        String jobId = clusterName + "-1";
        jobClusterActor.tell(new SubmitJobRequest(clusterName, "user", Optional.ofNullable(jobDefn)), probe.getRef());
        SubmitJobResponse submitResponse = probe.expectMsgClass(SubmitJobResponse.class);
        JobTestHelper.sendLaunchedInitiatedStartedEventsToWorker(probe, jobClusterActor, jobId, 0, new WorkerId(clusterName, jobId, 0, 1));
        JobTestHelper.sendLaunchedInitiatedStartedEventsToWorker(probe, jobClusterActor, jobId, 1, new WorkerId(clusterName, jobId, 0, 2));
        JobTestHelper.getJobDetailsAndVerify(probe, jobClusterActor, jobId, BaseResponse.ResponseCode.SUCCESS, JobState.Launched);
        jobClusterActor.tell(new ScaleStageRequest(jobId, 1, 2, user, "No reason"), probe.getRef());
        ScaleStageResponse scaleResp = probe.expectMsgClass(ScaleStageResponse.class);
        System.out.println("scale Resp: " + scaleResp.message);
        assertEquals(SUCCESS, scaleResp.responseCode);
        assertEquals(2, scaleResp.getActualNumWorkers());
        verify(jobStoreMock, times(1)).storeNewJob(any());
        // initial workers
        verify(jobStoreMock, times(1)).storeNewWorkers(any(), any());
        // scaled-up worker
        verify(jobStoreMock, times(1)).storeNewWorker(any());
        verify(jobStoreMock, times(6)).updateWorker(any());
        verify(jobStoreMock, times(3)).updateJob(any());
        // initial workers plus the scaled-up worker
        verify(schedulerMock, times(3)).scheduleWorker(any());
    } catch (Exception e) {
        e.printStackTrace();
        fail();
    }
}
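The scaling policy built in this test allows between 1 and 10 workers for stage 1, so the ScaleStageRequest asking for 2 workers is granted and getActualNumWorkers() reports 2. The helper below is purely illustrative of that expectation, clamping a requested count to the policy's bounds; it is not Mantis's actual scaling logic.

public class ScaleRequestBoundsSketch {
    // Illustrative only: mirrors the expectation in testScaleStage
    // (request 2 workers with min=1, max=10 -> 2 granted).
    public static int grantedWorkers(int min, int max, int requestedWorkers) {
        return Math.max(min, Math.min(max, requestedWorkers));
    }

    public static void main(String[] args) {
        System.out.println(grantedWorkers(1, 10, 2)); // 2, matching scaleResp.getActualNumWorkers()
    }
}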
Use of io.mantisrx.runtime.descriptor.StageScalingPolicy in project mantis by Netflix.
The class JobTestLifecycle, method testJobSubmitWithMultipleStagesAndWorkers:
@Test
public void testJobSubmitWithMultipleStagesAndWorkers() {
    final TestKit probe = new TestKit(system);
    String clusterName = "testJobSubmitWithMultipleStagesAndWorkers";
    IJobClusterDefinition jobClusterDefn = JobTestHelper.generateJobClusterDefinition(clusterName);
    JobDefinition jobDefn;
    try {
        Map<StageScalingPolicy.ScalingReason, StageScalingPolicy.Strategy> smap = new HashMap<>();
        smap.put(StageScalingPolicy.ScalingReason.Memory,
                new StageScalingPolicy.Strategy(StageScalingPolicy.ScalingReason.Memory, 0.1, 0.6, null));
        SchedulingInfo.Builder builder = new SchedulingInfo.Builder()
                .numberOfStages(2)
                .multiWorkerScalableStageWithConstraints(2, new MachineDefinition(1, 1.24, 0.0, 1, 1), null, null,
                        new StageScalingPolicy(1, 1, 3, 1, 1, 60, smap))
                .multiWorkerScalableStageWithConstraints(3, new MachineDefinition(1, 1.24, 0.0, 1, 1), null, null,
                        new StageScalingPolicy(1, 1, 3, 1, 1, 60, smap));
        SchedulingInfo sInfo = builder.build();
        System.out.println("SchedulingInfo " + sInfo);
        jobDefn = JobTestHelper.generateJobDefinition(clusterName, sInfo);
        MantisScheduler schedulerMock = mock(MantisScheduler.class);
        MantisJobStore jobStoreMock = mock(MantisJobStore.class);
        MantisJobMetadataImpl mantisJobMetaData = new MantisJobMetadataImpl.Builder()
                .withJobId(new JobId(clusterName, 1))
                .withSubmittedAt(Instant.now())
                .withJobState(JobState.Accepted)
                .withNextWorkerNumToUse(1)
                .withJobDefinition(jobDefn)
                .build();
        final ActorRef jobActor = system.actorOf(JobActor.props(jobClusterDefn, mantisJobMetaData, jobStoreMock, schedulerMock, eventPublisher));
        jobActor.tell(new JobProto.InitJob(probe.getRef()), probe.getRef());
        JobProto.JobInitialized initMsg = probe.expectMsgClass(JobProto.JobInitialized.class);
        assertEquals(SUCCESS, initMsg.responseCode);
        String jobId = clusterName + "-1";
        jobActor.tell(new JobClusterManagerProto.GetJobDetailsRequest("nj", jobId), probe.getRef());
        GetJobDetailsResponse resp = probe.expectMsgClass(GetJobDetailsResponse.class);
        System.out.println("resp " + resp + " msg " + resp.message);
        assertEquals(SUCCESS, resp.responseCode);
        assertEquals(JobState.Accepted, resp.getJobMetadata().get().getState());
        int stageNo = 0;
        // send launched, initiated and started events for the first worker
        WorkerId workerId = new WorkerId(jobId, 0, 1);
        JobTestHelper.sendLaunchedInitiatedStartedEventsToWorker(probe, jobActor, jobId, stageNo, workerId);
        // check job status again
        jobActor.tell(new JobClusterManagerProto.GetJobDetailsRequest("nj", jobId), probe.getRef());
        GetJobDetailsResponse resp2 = probe.expectMsgClass(GetJobDetailsResponse.class);
        System.out.println("resp " + resp2 + " msg " + resp2.message);
        assertEquals(SUCCESS, resp2.responseCode);
        // Only 1 worker has started, so the job is still Accepted.
        assertEquals(JobState.Accepted, resp2.getJobMetadata().get().getState());
        // send launched events for the rest of the workers
        int nextWorkerNumber = 1;
        int stage = 0;
        Iterator<Map.Entry<Integer, StageSchedulingInfo>> it = sInfo.getStages().entrySet().iterator();
        while (it.hasNext()) {
            Map.Entry<Integer, StageSchedulingInfo> integerStageSchedulingInfoEntry = it.next();
            StageSchedulingInfo stageSchedulingInfo = integerStageSchedulingInfoEntry.getValue();
            System.out.println("Workers -> " + stageSchedulingInfo.getNumberOfInstances() + " in stage " + stage);
            for (int i = 0; i < stageSchedulingInfo.getNumberOfInstances(); i++) {
                WorkerId wId = new WorkerId(jobId, i, nextWorkerNumber++);
                System.out.println("Sending events for worker --> " + wId + " Stage " + stage);
                JobTestHelper.sendLaunchedInitiatedStartedEventsToWorker(probe, jobActor, jobId, stage, wId);
            }
            stage++;
        }
        // check job status again
        jobActor.tell(new JobClusterManagerProto.GetJobDetailsRequest("nj", jobId), probe.getRef());
        GetJobDetailsResponse resp3 = probe.expectMsgClass(GetJobDetailsResponse.class);
        System.out.println("resp " + resp3 + " msg " + resp3.message);
        assertEquals(SUCCESS, resp3.responseCode);
        // All workers have started, so the job should now be Launched.
        assertEquals(JobState.Launched, resp3.getJobMetadata().get().getState());
        verify(jobStoreMock, times(1)).storeNewJob(any());
        verify(jobStoreMock, times(1)).storeNewWorkers(any(), any());
        verify(jobStoreMock, times(19)).updateWorker(any());
        verify(jobStoreMock, times(3)).updateJob(any());
        // assertEquals(jobActor, probe.getLastSender());
    } catch (InvalidJobException e) {
        e.printStackTrace();
        fail();
    } catch (Exception e) {
        e.printStackTrace();
        fail();
    }
}
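The scheduling info in this last test declares two scalable stages with 2 and 3 workers, and the while loop drives every one of them through the launched/initiated/started sequence before the job flips from Accepted to Launched. A small sketch that sums the workers the test must start, using only accessors already exercised above (getStages() and getNumberOfInstances()):

import java.util.Map;

import io.mantisrx.runtime.descriptor.SchedulingInfo;
import io.mantisrx.runtime.descriptor.StageSchedulingInfo;

public class ExpectedWorkerCountSketch {
    // Count the per-stage workers the test has to start before the job can reach Launched.
    public static int totalWorkers(SchedulingInfo sInfo) {
        int total = 0;
        for (Map.Entry<Integer, StageSchedulingInfo> entry : sInfo.getStages().entrySet()) {
            total += entry.getValue().getNumberOfInstances();
        }
        return total;
    }
}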