Use of io.mantisrx.runtime.MachineDefinition in project mantis by Netflix.
The class JobAutoScalerTest, method testScaleDown.
@Test
public void testScaleDown() throws InterruptedException {
    final String jobId = "test-job-1";
    final int coolDownSec = 2;
    final int scalingStageNum = 1;
    final MantisMasterClientApi mockMasterClientApi = mock(MantisMasterClientApi.class);
    final Map<Integer, StageSchedulingInfo> schedulingInfoMap = new HashMap<>();
    final int numStage1Workers = 2;
    final int increment = 1;
    final int decrement = 1;
    final int min = 1;
    final int max = 5;
    final double scaleUpAbovePct = 45.0;
    final double scaleDownBelowPct = 15.0;
    final double workerMemoryMB = 512.0;

    final StageSchedulingInfo stage1SchedInfo = StageSchedulingInfo.builder()
        .numberOfInstances(numStage1Workers)
        .machineDefinition(new MachineDefinition(2, workerMemoryMB, 200, 1024, 2))
        .scalingPolicy(new StageScalingPolicy(scalingStageNum, min, max, increment, decrement, coolDownSec,
            Collections.singletonMap(
                StageScalingPolicy.ScalingReason.Memory,
                new StageScalingPolicy.Strategy(StageScalingPolicy.ScalingReason.Memory,
                    scaleDownBelowPct, scaleUpAbovePct,
                    new StageScalingPolicy.RollingCount(1, 2)))))
        .scalable(true)
        .build();
    schedulingInfoMap.put(scalingStageNum, stage1SchedInfo);

    when(mockMasterClientApi.scaleJobStage(eq(jobId), eq(scalingStageNum), eq(numStage1Workers - decrement), anyString()))
        .thenReturn(Observable.just(true));

    Context context = mock(Context.class);
    when(context.getWorkerMapObservable()).thenReturn(Observable.empty());

    final JobAutoScaler jobAutoScaler = new JobAutoScaler(jobId, new SchedulingInfo(schedulingInfoMap), mockMasterClientApi, context);
    jobAutoScaler.start();
    final Observer<JobAutoScaler.Event> jobAutoScalerObserver = jobAutoScaler.getObserver();

    // should trigger a scale down (below 15% scaleDown threshold)
    jobAutoScalerObserver.onNext(new JobAutoScaler.Event(StageScalingPolicy.ScalingReason.Memory, scalingStageNum,
        workerMemoryMB * (scaleDownBelowPct / 100.0 - 0.01), numStage1Workers, ""));
    verify(mockMasterClientApi, timeout(1000).times(1)).scaleJobStage(jobId, scalingStageNum, numStage1Workers - decrement,
        String.format("Memory with value %1$,.2f is below scaleDown threshold of %2$,.1f",
            (scaleDownBelowPct / 100.0 - 0.01) * 100.0, scaleDownBelowPct));

    // should *not* trigger a scale down before cooldown period (below 15% scaleDown threshold)
    jobAutoScalerObserver.onNext(new JobAutoScaler.Event(StageScalingPolicy.ScalingReason.Memory, scalingStageNum,
        workerMemoryMB * (scaleDownBelowPct / 100.0 - 0.01), numStage1Workers - decrement, ""));
    jobAutoScalerObserver.onNext(new JobAutoScaler.Event(StageScalingPolicy.ScalingReason.Memory, scalingStageNum,
        workerMemoryMB * (scaleDownBelowPct / 100.0 - 0.01), numStage1Workers - decrement, ""));

    Thread.sleep(coolDownSec * 1000);

    if (numStage1Workers - decrement == min) {
        // should not trigger a scale down after cooldown period if numWorkers=min (below 15% scaleDown threshold)
        jobAutoScalerObserver.onNext(new JobAutoScaler.Event(StageScalingPolicy.ScalingReason.Memory, scalingStageNum,
            workerMemoryMB * (scaleDownBelowPct / 100.0 - 0.01), numStage1Workers - decrement, ""));
        verifyNoMoreInteractions(mockMasterClientApi);
    }
}
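
A quick worked example of the event values driven through the observer above. This is a sketch under the assumption that the autoscaler treats the event value as memory used in MB against the machine definition's memory capacity; it reproduces the threshold arithmetic and the formatted reason string the test verifies.

    // Sketch of the scale-down threshold arithmetic (assumes the event value
    // is memory used in MB, measured against the 512 MB machine definition).
    double workerMemoryMB = 512.0;
    double scaleDownBelowPct = 15.0;
    double usedMB = workerMemoryMB * (scaleDownBelowPct / 100.0 - 0.01); // 512 * 0.14 = 71.68 MB
    double usagePct = usedMB / workerMemoryMB * 100.0;                   // 14.00%, just below 15.0%
    System.out.println(String.format(
        "Memory with value %1$,.2f is below scaleDown threshold of %2$,.1f",
        usagePct, scaleDownBelowPct));
    // prints: Memory with value 14.00 is below scaleDown threshold of 15.0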
Use of io.mantisrx.runtime.MachineDefinition in project mantis by Netflix.
The class JobAutoScalerTest, method testScaleUp.
@Test
public void testScaleUp() throws InterruptedException {
    final String jobId = "test-job-1";
    final int coolDownSec = 2;
    final int scalingStageNum = 1;
    final MantisMasterClientApi mockMasterClientApi = mock(MantisMasterClientApi.class);
    final Map<Integer, StageSchedulingInfo> schedulingInfoMap = new HashMap<>();
    final int numStage1Workers = 1;
    final int increment = 1;
    final int decrement = 1;
    final int min = 1;
    final int max = 5;
    final double scaleUpAbovePct = 45.0;
    final double scaleDownBelowPct = 15.0;
    final double workerMemoryMB = 512.0;

    final StageSchedulingInfo stage1SchedInfo = StageSchedulingInfo.builder()
        .numberOfInstances(numStage1Workers)
        .machineDefinition(new MachineDefinition(2, workerMemoryMB, 200, 1024, 2))
        .scalingPolicy(new StageScalingPolicy(scalingStageNum, min, max, increment, decrement, coolDownSec,
            Collections.singletonMap(
                StageScalingPolicy.ScalingReason.Memory,
                new StageScalingPolicy.Strategy(StageScalingPolicy.ScalingReason.Memory,
                    scaleDownBelowPct, scaleUpAbovePct,
                    new StageScalingPolicy.RollingCount(1, 2)))))
        .scalable(true)
        .build();
    schedulingInfoMap.put(scalingStageNum, stage1SchedInfo);

    when(mockMasterClientApi.scaleJobStage(eq(jobId), eq(scalingStageNum), eq(numStage1Workers + increment), anyString()))
        .thenReturn(Observable.just(true));

    Context context = mock(Context.class);
    when(context.getWorkerMapObservable()).thenReturn(Observable.empty());

    final JobAutoScaler jobAutoScaler = new JobAutoScaler(jobId, new SchedulingInfo(schedulingInfoMap), mockMasterClientApi, context);
    jobAutoScaler.start();
    final Observer<JobAutoScaler.Event> jobAutoScalerObserver = jobAutoScaler.getObserver();

    // should trigger a scale up (above 45% scaleUp threshold)
    jobAutoScalerObserver.onNext(new JobAutoScaler.Event(StageScalingPolicy.ScalingReason.Memory, scalingStageNum,
        workerMemoryMB * (scaleUpAbovePct / 100.0 + 0.01), numStage1Workers, ""));
    verify(mockMasterClientApi, timeout(1000).times(1)).scaleJobStage(jobId, scalingStageNum, numStage1Workers + increment,
        String.format("Memory with value %1$,.2f exceeded scaleUp threshold of 45.0",
            (scaleUpAbovePct / 100.0 + 0.01) * 100.0));

    // should *not* trigger a scale up before cooldown period (above 45% scaleUp threshold)
    jobAutoScalerObserver.onNext(new JobAutoScaler.Event(StageScalingPolicy.ScalingReason.Memory, scalingStageNum,
        workerMemoryMB * (scaleUpAbovePct / 100.0 + 0.01), numStage1Workers + increment, ""));
    jobAutoScalerObserver.onNext(new JobAutoScaler.Event(StageScalingPolicy.ScalingReason.Memory, scalingStageNum,
        workerMemoryMB * (scaleUpAbovePct / 100.0 + 0.01), numStage1Workers + increment, ""));

    Thread.sleep(coolDownSec * 1000);

    // Retry sending the autoscale event until the scaleJobStage request reaches the master:
    // there may be a race between the Thread.sleep for coolDownSec above and the previous
    // events being processed before the cooldown expired.
    final CountDownLatch retryLatch = new CountDownLatch(1);
    when(mockMasterClientApi.scaleJobStage(eq(jobId), eq(scalingStageNum), eq(numStage1Workers + 2 * increment), anyString()))
        .thenAnswer(new Answer<Observable<Void>>() {
            @Override
            public Observable<Void> answer(InvocationOnMock invocation) throws Throwable {
                retryLatch.countDown();
                return Observable.just(null);
            }
        });
    do {
        logger.info("sending Job auto scale Event");
        // should trigger a scale up after cooldown period (above 45% scaleUp threshold)
        jobAutoScalerObserver.onNext(new JobAutoScaler.Event(StageScalingPolicy.ScalingReason.Memory, scalingStageNum,
            workerMemoryMB * (scaleUpAbovePct / 100.0 + 0.01), numStage1Workers + increment, ""));
    } while (!retryLatch.await(1, TimeUnit.SECONDS));

    verify(mockMasterClientApi, timeout(1000).times(1)).scaleJobStage(jobId, scalingStageNum, numStage1Workers + 2 * increment,
        String.format("Memory with value %1$,.2f exceeded scaleUp threshold of 45.0",
            (scaleUpAbovePct / 100.0 + 0.01) * 100.0));
}
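
The CountDownLatch-plus-thenAnswer retry above is a general trick for synchronizing a test with asynchronous event processing. Below is a minimal, self-contained sketch of the same pattern; the Service interface and event payload are hypothetical, for illustration only.

    import java.util.concurrent.CountDownLatch;
    import java.util.concurrent.TimeUnit;
    import static org.mockito.Mockito.*;

    public class LatchRetryExample {
        // Hypothetical collaborator standing in for MantisMasterClientApi.
        interface Service { void handle(String event); }

        public static void main(String[] args) throws InterruptedException {
            Service service = mock(Service.class);
            CountDownLatch latch = new CountDownLatch(1);
            // Open the latch the moment the mock observes the expected call.
            doAnswer(invocation -> { latch.countDown(); return null; })
                .when(service).handle(anyString());
            do {
                // In the real test the event is processed asynchronously,
                // hence resending until the latch confirms delivery.
                service.handle("event");
            } while (!latch.await(1, TimeUnit.SECONDS));
            verify(service, atLeastOnce()).handle("event");
        }
    }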
Use of io.mantisrx.runtime.MachineDefinition in project mantis by Netflix.
The class JobTestLifecycle, method testListActiveWorkers.
@Test
public void testListActiveWorkers() {
    final TestKit probe = new TestKit(system);
    String clusterName = "testListActiveWorkers";
    IJobClusterDefinition jobClusterDefn = JobTestHelper.generateJobClusterDefinition(clusterName);
    JobDefinition jobDefn;
    try {
        SchedulingInfo sInfo = new SchedulingInfo.Builder()
            .numberOfStages(1)
            .multiWorkerStageWithConstraints(2, new MachineDefinition(1.0, 1.0, 1.0, 3),
                Lists.newArrayList(), Lists.newArrayList())
            .build();
        jobDefn = JobTestHelper.generateJobDefinition(clusterName, sInfo);
        MantisScheduler schedulerMock = mock(MantisScheduler.class);
        MantisJobStore jobStoreMock = mock(MantisJobStore.class);
        MantisJobMetadataImpl mantisJobMetaData = new MantisJobMetadataImpl.Builder()
            .withJobId(new JobId(clusterName, 2))
            .withSubmittedAt(Instant.now())
            .withJobState(JobState.Accepted)
            .withNextWorkerNumToUse(1)
            .withJobDefinition(jobDefn)
            .build();
        final ActorRef jobActor = system.actorOf(
            JobActor.props(jobClusterDefn, mantisJobMetaData, jobStoreMock, schedulerMock, eventPublisher));

        jobActor.tell(new JobProto.InitJob(probe.getRef()), probe.getRef());
        JobProto.JobInitialized initMsg = probe.expectMsgClass(JobProto.JobInitialized.class);
        assertEquals(SUCCESS, initMsg.responseCode);

        String jobId = clusterName + "-2";
        jobActor.tell(new JobClusterManagerProto.GetJobDetailsRequest("nj", jobId), probe.getRef());
        GetJobDetailsResponse resp = probe.expectMsgClass(GetJobDetailsResponse.class);
        System.out.println("resp " + resp + " msg " + resp.message);
        assertEquals(SUCCESS, resp.responseCode);
        assertEquals(JobState.Accepted, resp.getJobMetadata().get().getState());

        int stageNo = 1;
        // send Launched, Initiated and Started events (plus a heartbeat) for the first worker
        WorkerId workerId = new WorkerId(jobId, 0, 1);
        JobTestHelper.sendLaunchedInitiatedStartedEventsToWorker(probe, jobActor, jobId, stageNo, workerId);

        // check job status again
        jobActor.tell(new JobClusterManagerProto.GetJobDetailsRequest("nj", jobId), probe.getRef());
        GetJobDetailsResponse resp2 = probe.expectMsgClass(GetJobDetailsResponse.class);
        System.out.println("resp " + resp2 + " msg " + resp2.message);
        assertEquals(SUCCESS, resp2.responseCode);
        // only 1 worker has started, so the job is still Accepted
        assertEquals(JobState.Accepted, resp2.getJobMetadata().get().getState());

        // send the same lifecycle events for the second worker
        WorkerId workerId2 = new WorkerId(jobId, 1, 2);
        JobTestHelper.sendLaunchedInitiatedStartedEventsToWorker(probe, jobActor, jobId, stageNo, workerId2);

        // check job status again
        jobActor.tell(new JobClusterManagerProto.GetJobDetailsRequest("nj", jobId), probe.getRef());
        GetJobDetailsResponse resp3 = probe.expectMsgClass(GetJobDetailsResponse.class);
        System.out.println("resp " + resp3 + " msg " + resp3.message);
        assertEquals(SUCCESS, resp3.responseCode);
        // both workers have started, so the job should be Launched
        assertEquals(JobState.Launched, resp3.getJobMetadata().get().getState());

        jobActor.tell(new JobClusterManagerProto.ListWorkersRequest(new JobId(clusterName, 1)), probe.getRef());
        JobClusterManagerProto.ListWorkersResponse listWorkersResponse =
            probe.expectMsgClass(JobClusterManagerProto.ListWorkersResponse.class);
        assertEquals(2, listWorkersResponse.getWorkerMetadata().size());

        int cnt = 0;
        for (IMantisWorkerMetadata workerMeta : listWorkersResponse.getWorkerMetadata()) {
            if (workerMeta.getWorkerNumber() == 1 || workerMeta.getWorkerNumber() == 2) {
                cnt++;
            }
        }
        assertEquals(2, cnt);

        verify(jobStoreMock, times(1)).storeNewJob(any());
        verify(jobStoreMock, times(1)).storeNewWorkers(any(), any());
        verify(jobStoreMock, times(6)).updateWorker(any());
        verify(jobStoreMock, times(3)).updateJob(any());
    } catch (InvalidJobException e) {
        e.printStackTrace();
        fail();
    } catch (Exception e) {
        e.printStackTrace();
        fail();
    }
}
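
As a side note, the counting loop over worker numbers can be written more compactly; a minimal equivalent, assuming getWorkerMetadata() returns a standard java.util collection:

    // Equivalent to the cnt loop above, using streams.
    long cnt = listWorkersResponse.getWorkerMetadata().stream()
            .filter(w -> w.getWorkerNumber() == 1 || w.getWorkerNumber() == 2)
            .count();
    assertEquals(2, cnt);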
Use of io.mantisrx.runtime.MachineDefinition in project mantis by Netflix.
The class JobTestLifecycle, method testJobSubmitWithMultipleStagesAndWorkers.
@Test
public void testJobSubmitWithMultipleStagesAndWorkers() {
    final TestKit probe = new TestKit(system);
    String clusterName = "testJobSubmitWithMultipleStagesAndWorkers";
    IJobClusterDefinition jobClusterDefn = JobTestHelper.generateJobClusterDefinition(clusterName);
    JobDefinition jobDefn;
    try {
        Map<StageScalingPolicy.ScalingReason, StageScalingPolicy.Strategy> smap = new HashMap<>();
        smap.put(StageScalingPolicy.ScalingReason.Memory,
            new StageScalingPolicy.Strategy(StageScalingPolicy.ScalingReason.Memory, 0.1, 0.6, null));
        SchedulingInfo.Builder builder = new SchedulingInfo.Builder()
            .numberOfStages(2)
            .multiWorkerScalableStageWithConstraints(2, new MachineDefinition(1, 1.24, 0.0, 1, 1),
                null, null, new StageScalingPolicy(1, 1, 3, 1, 1, 60, smap))
            .multiWorkerScalableStageWithConstraints(3, new MachineDefinition(1, 1.24, 0.0, 1, 1),
                null, null, new StageScalingPolicy(1, 1, 3, 1, 1, 60, smap));
        SchedulingInfo sInfo = builder.build();
        System.out.println("SchedulingInfo " + sInfo);
        jobDefn = JobTestHelper.generateJobDefinition(clusterName, sInfo);
        MantisScheduler schedulerMock = mock(MantisScheduler.class);
        MantisJobStore jobStoreMock = mock(MantisJobStore.class);
        MantisJobMetadataImpl mantisJobMetaData = new MantisJobMetadataImpl.Builder()
            .withJobId(new JobId(clusterName, 1))
            .withSubmittedAt(Instant.now())
            .withJobState(JobState.Accepted)
            .withNextWorkerNumToUse(1)
            .withJobDefinition(jobDefn)
            .build();
        final ActorRef jobActor = system.actorOf(
            JobActor.props(jobClusterDefn, mantisJobMetaData, jobStoreMock, schedulerMock, eventPublisher));

        jobActor.tell(new JobProto.InitJob(probe.getRef()), probe.getRef());
        JobProto.JobInitialized initMsg = probe.expectMsgClass(JobProto.JobInitialized.class);
        assertEquals(SUCCESS, initMsg.responseCode);

        String jobId = clusterName + "-1";
        jobActor.tell(new JobClusterManagerProto.GetJobDetailsRequest("nj", jobId), probe.getRef());
        GetJobDetailsResponse resp = probe.expectMsgClass(GetJobDetailsResponse.class);
        System.out.println("resp " + resp + " msg " + resp.message);
        assertEquals(SUCCESS, resp.responseCode);
        assertEquals(JobState.Accepted, resp.getJobMetadata().get().getState());

        int stageNo = 0;
        // send Launched, Initiated and Started events (plus a heartbeat) for the first worker
        WorkerId workerId = new WorkerId(jobId, 0, 1);
        JobTestHelper.sendLaunchedInitiatedStartedEventsToWorker(probe, jobActor, jobId, stageNo, workerId);

        // check job status again
        jobActor.tell(new JobClusterManagerProto.GetJobDetailsRequest("nj", jobId), probe.getRef());
        GetJobDetailsResponse resp2 = probe.expectMsgClass(GetJobDetailsResponse.class);
        System.out.println("resp " + resp2 + " msg " + resp2.message);
        assertEquals(SUCCESS, resp2.responseCode);
        // only 1 worker has started, so the job is still Accepted
        assertEquals(JobState.Accepted, resp2.getJobMetadata().get().getState());

        // send lifecycle events for the rest of the workers
        int nextWorkerNumber = 1;
        int stage = 0;
        Iterator<Map.Entry<Integer, StageSchedulingInfo>> it = sInfo.getStages().entrySet().iterator();
        while (it.hasNext()) {
            Map.Entry<Integer, StageSchedulingInfo> integerStageSchedulingInfoEntry = it.next();
            StageSchedulingInfo stageSchedulingInfo = integerStageSchedulingInfoEntry.getValue();
            System.out.println("Workers -> " + stageSchedulingInfo.getNumberOfInstances() + " in stage " + stage);
            for (int i = 0; i < stageSchedulingInfo.getNumberOfInstances(); i++) {
                WorkerId wId = new WorkerId(jobId, i, nextWorkerNumber++);
                System.out.println("Sending events for worker --> " + wId + " Stage " + stage);
                JobTestHelper.sendLaunchedInitiatedStartedEventsToWorker(probe, jobActor, jobId, stage, wId);
            }
            stage++;
        }

        // check job status again
        jobActor.tell(new JobClusterManagerProto.GetJobDetailsRequest("nj", jobId), probe.getRef());
        GetJobDetailsResponse resp3 = probe.expectMsgClass(GetJobDetailsResponse.class);
        System.out.println("resp " + resp3 + " msg " + resp3.message);
        assertEquals(SUCCESS, resp3.responseCode);
        // all workers have started, so the job should be Launched
        assertEquals(JobState.Launched, resp3.getJobMetadata().get().getState());

        verify(jobStoreMock, times(1)).storeNewJob(any());
        verify(jobStoreMock, times(1)).storeNewWorkers(any(), any());
        verify(jobStoreMock, times(19)).updateWorker(any());
        verify(jobStoreMock, times(3)).updateJob(any());
    } catch (InvalidJobException e) {
        e.printStackTrace();
        fail();
    } catch (Exception e) {
        e.printStackTrace();
        fail();
    }
}
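
The nested loop above visits every worker of every stage. When reasoning about the expected verify counts, it can help to compute the worker total straight from the SchedulingInfo; a small sketch using only the accessors already seen in this test:

    // 2 workers in stage 1 + 3 workers in stage 2 = 5 workers in total.
    int totalWorkers = sInfo.getStages().values().stream()
            .mapToInt(StageSchedulingInfo::getNumberOfInstances)
            .sum();
    System.out.println("Total workers defined: " + totalWorkers); // 5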
Use of io.mantisrx.runtime.MachineDefinition in project mantis by Netflix.
The class JobTestLifecycle, method testHeartBeatEnforcement.
@Test
public void testHeartBeatEnforcement() {
    final TestKit probe = new TestKit(system);
    String clusterName = "testHeartBeatEnforcementCluster";
    IJobClusterDefinition jobClusterDefn = JobTestHelper.generateJobClusterDefinition(clusterName);
    JobDefinition jobDefn;
    try {
        SchedulingInfo sInfo = new SchedulingInfo.Builder()
            .numberOfStages(1)
            .multiWorkerStageWithConstraints(2, new MachineDefinition(1.0, 1.0, 1.0, 3),
                Lists.newArrayList(), Lists.newArrayList())
            .build();
        jobDefn = JobTestHelper.generateJobDefinition(clusterName, sInfo);
        MantisScheduler schedulerMock = mock(MantisScheduler.class);
        MantisJobStore jobStoreMock = mock(MantisJobStore.class);
        MantisJobMetadataImpl mantisJobMetaData = new MantisJobMetadataImpl.Builder()
            .withJobId(new JobId(clusterName, 2))
            .withSubmittedAt(Instant.now())
            .withJobState(JobState.Accepted)
            .withNextWorkerNumToUse(1)
            .withJobDefinition(jobDefn)
            .build();
        final ActorRef jobActor = system.actorOf(
            JobActor.props(jobClusterDefn, mantisJobMetaData, jobStoreMock, schedulerMock, eventPublisher));

        jobActor.tell(new JobProto.InitJob(probe.getRef()), probe.getRef());
        JobProto.JobInitialized initMsg = probe.expectMsgClass(JobProto.JobInitialized.class);
        assertEquals(SUCCESS, initMsg.responseCode);

        String jobId = clusterName + "-2";
        jobActor.tell(new JobClusterManagerProto.GetJobDetailsRequest("nj", jobId), probe.getRef());
        GetJobDetailsResponse resp = probe.expectMsgClass(GetJobDetailsResponse.class);
        System.out.println("resp " + resp + " msg " + resp.message);
        assertEquals(SUCCESS, resp.responseCode);
        assertEquals(JobState.Accepted, resp.getJobMetadata().get().getState());

        int stageNo = 1;
        WorkerId workerId = new WorkerId(jobId, 0, 1);
        // send Launched, Initiated and Started events plus a heartbeat for the first worker
        JobTestHelper.sendLaunchedInitiatedStartedEventsToWorker(probe, jobActor, jobId, stageNo, workerId);

        // check job status again
        jobActor.tell(new JobClusterManagerProto.GetJobDetailsRequest("nj", jobId), probe.getRef());
        GetJobDetailsResponse resp2 = probe.expectMsgClass(GetJobDetailsResponse.class);
        System.out.println("resp " + resp2 + " msg " + resp2.message);
        assertEquals(SUCCESS, resp2.responseCode);
        // only 1 worker has started, so the job is still Accepted
        assertEquals(JobState.Accepted, resp2.getJobMetadata().get().getState());

        // send the same lifecycle events for the second worker
        WorkerId workerId2 = new WorkerId(jobId, 1, 2);
        JobTestHelper.sendLaunchedInitiatedStartedEventsToWorker(probe, jobActor, jobId, stageNo, workerId2);

        // check job status again
        jobActor.tell(new JobClusterManagerProto.GetJobDetailsRequest("nj", jobId), probe.getRef());
        GetJobDetailsResponse resp3 = probe.expectMsgClass(GetJobDetailsResponse.class);
        System.out.println("resp " + resp3 + " msg " + resp3.message);
        assertEquals(SUCCESS, resp3.responseCode);
        // both workers have started, so the job should be Launched
        assertEquals(JobState.Launched, resp3.getJobMetadata().get().getState());

        JobTestHelper.sendHeartBeat(probe, jobActor, jobId, 1, workerId2);
        JobTestHelper.sendHeartBeat(probe, jobActor, jobId, 1, workerId);

        // run the heartbeat check at a simulated future time, when all last heartbeats should look stale
        Instant now = Instant.now();
        jobActor.tell(new JobProto.CheckHeartBeat(now.plusSeconds(240)), probe.getRef());
        Thread.sleep(1000);

        // 2 original submissions plus 2 resubmits triggered by the heartbeat timeouts
        verify(schedulerMock, times(4)).scheduleWorker(any());
        // 2 kills due to the resubmits
        verify(schedulerMock, times(2)).unscheduleAndTerminateWorker(any(), any());
    } catch (InvalidJobException e) {
        e.printStackTrace();
        fail();
    } catch (Exception e) {
        e.printStackTrace();
        fail();
    }
}
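
One design note: the fixed Thread.sleep(1000) before the verifications could likely be replaced with Mockito's timeout() verification mode, which the autoscaler tests earlier on this page already use; timeout() polls until the expected interaction count is reached instead of sleeping unconditionally. A sketch of that alternative:

    // Poll up to 1s for the expected interactions instead of a fixed sleep.
    verify(schedulerMock, timeout(1000).times(4)).scheduleWorker(any());
    verify(schedulerMock, timeout(1000).times(2)).unscheduleAndTerminateWorker(any(), any());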