use of io.mantisrx.common.WorkerPorts in project mantis by Netflix.
the class SchedulingService method launchTasks.
/**
 * Attempts to launch tasks given some number of leases from Mesos.
 *
 * When a task is launched successfully, the following will happen:
 *
 * 1. Emit a {@link WorkerLaunched} event to be handled by the corresponding actor.
 * 2. Makes a call to the underlying Mesos driver to launch the task.
 *
 * A task can fail to launch if:
 *
 * 1. It doesn't receive enough metadata for {@link WorkerPorts} to pass its preconditions.
 *    - No launch task request will be made for this assignment result.
 *    - Proactively unschedule the worker.
 * 2. It fails to emit a {@link WorkerLaunched} event.
 *    - The worker will get unscheduled for this launch task request.
 * 3. There are no launch tasks for this assignment result.
 *    - All of these leases are rejected.
 *    - Eventually, the underlying Mesos driver will decline offers since there are no launch task requests.
 *
 * Any per-request error reported by the virtual machine service is forwarded as a
 * {@link WorkerLaunchFailed} event.
 *
 * Note: host name, VM id and cluster attribute are always taken from the first lease
 * ({@code leases.get(0)}), so {@code leases} must be non-empty whenever {@code requests} is non-empty.
 *
 * @param requests collection of assignment results received by the scheduler.
 * @param leases   list of resource offers from Mesos.
 */
private void launchTasks(Collection<TaskAssignmentResult> requests, List<VirtualMachineLease> leases) {
    List<LaunchTaskRequest> launchTaskRequests = new ArrayList<>();
    for (TaskAssignmentResult assignmentResult : requests) {
        ScheduleRequest request = (ScheduleRequest) assignmentResult.getRequest();
        WorkerPorts workerPorts = null;
        try {
            workerPorts = new WorkerPorts(assignmentResult.getAssignedPorts());
        } catch (IllegalArgumentException | IllegalStateException e) {
            // The exception is the trailing argument without a placeholder of its own, so the
            // logger records it as the throwable instead of leaving an unfilled "{}" in the message.
            logger.error("problem launching tasks for assignment result {}", assignmentResult, e);
            numMissingWorkerPorts.increment();
        }

        // A worker only counts as launched when its ports are valid AND the launched event was routed.
        boolean launched = false;
        if (workerPorts != null) {
            launched = jobMessageRouter.routeWorkerEvent(
                new WorkerLaunched(
                    request.getWorkerId(),
                    request.getStageNum(),
                    leases.get(0).hostname(),
                    leases.get(0).getVMID(),
                    getAttribute(leases.get(0), slaveClusterAttributeName),
                    workerPorts));
        }

        if (launched) {
            launchTaskRequests.add(new LaunchTaskRequest(request, workerPorts));
        } else {
            // Same recovery for both failure modes: invalid ports or an un-routable launched event.
            unscheduleWorker(request.getWorkerId(), Optional.ofNullable(leases.get(0).hostname()));
        }
    }

    if (launchTaskRequests.isEmpty()) {
        for (VirtualMachineLease l : leases) {
            virtualMachineService.rejectLease(l);
        }
    }

    // launchTasks is still invoked when there is nothing to launch (see case 3 in the Javadoc).
    Map<ScheduleRequest, LaunchTaskException> launchErrors = virtualMachineService.launchTasks(launchTaskRequests, leases);
    for (TaskAssignmentResult result : requests) {
        final ScheduleRequest sre = (ScheduleRequest) result.getRequest();
        if (launchErrors.containsKey(sre)) {
            String errorMessage = getWorkerStringPrefix(sre.getStageNum(), sre.getWorkerId()) + " failed due to " + launchErrors.get(sre).getMessage();
            boolean success = jobMessageRouter.routeWorkerEvent(new WorkerLaunchFailed(sre.getWorkerId(), sre.getStageNum(), errorMessage));
            if (!success) {
                logger.warn("Failed to route WorkerLaunchFailed for {} (err {})", sre.getWorkerId(), errorMessage);
            }
        }
    }
}
use of io.mantisrx.common.WorkerPorts in project mantis by Netflix.
the class NoOpMantisJobOperations method convertMantisWorkerMetadataWriteableToMantisWorkerMetadata.
/**
 * Convert/Deserialize stored worker metadata into a {@link JobWorker}.
 *
 * The port list handed to {@link WorkerPorts} is assembled, in order, from the metrics, debug, console and
 * custom ports, followed by the first entry of {@code writeable.getPorts()} when that list is non-empty.
 * If {@link WorkerPorts} rejects the list, the problem is logged and the converted worker is built with
 * {@code null} worker ports.
 *
 * Legit cases for missing worker ports:
 *
 * 1. Loaded worker was in Accepted state (hasn't been assigned ports yet).
 * 2. Loaded worker was in Archived state but previously archived from Accepted state.
 *
 * Error cases for missing worker ports:
 *
 * 1. Loaded worker was in Non-Accepted state (data corruption).
 * 2. Loaded worker was in Archived state but previously was running or completed (data corruption, but same
 *    semantic as Legit Case 2 above).
 *
 * @param writeable      stored worker metadata to convert.
 * @param eventPublisher lifecycle event publisher handed to the converted worker.
 * @return the converted job worker; its worker ports may be {@code null} (see above).
 */
public static JobWorker convertMantisWorkerMetadataWriteableToMantisWorkerMetadata(MantisWorkerMetadata writeable, LifecycleEventPublisher eventPublisher) {
    if (logger.isDebugEnabled()) {
        logger.debug("DataFormatAdatper:converting worker {}", writeable);
    }

    final String jobId = writeable.getJobId();

    // Fixed-position ports first, then (optionally) the first of the remaining ports.
    final List<Integer> portList = new ArrayList<>(writeable.getNumberOfPorts());
    portList.add(writeable.getMetricsPort());
    portList.add(writeable.getDebugPort());
    portList.add(writeable.getConsolePort());
    portList.add(writeable.getCustomPort());
    if (!writeable.getPorts().isEmpty()) {
        portList.add(writeable.getPorts().get(0));
    }

    WorkerPorts assignedPorts;
    try {
        assignedPorts = new WorkerPorts(portList);
    } catch (IllegalArgumentException | IllegalStateException e) {
        logger.warn("problem loading worker {} for Job ID {}", writeable.getWorkerId(), jobId, e);
        // Fall through with no worker ports; the caller receives a worker without them.
        assignedPorts = null;
    }

    final JobWorker result = new JobWorker.Builder()
        .withJobId(jobId)
        .withAcceptedAt(writeable.getAcceptedAt())
        .withLaunchedAt(writeable.getLaunchedAt())
        .withStartingAt(writeable.getStartingAt())
        .withStartedAt(writeable.getStartedAt())
        .withCompletedAt(writeable.getCompletedAt())
        .withNumberOfPorts(portList.size())
        .withWorkerPorts(assignedPorts)
        .withResubmitCount(writeable.getTotalResubmitCount())
        .withResubmitOf(writeable.getResubmitOf())
        .withSlave(writeable.getSlave())
        .withSlaveID(writeable.getSlaveID())
        .withStageNum(writeable.getStageNum())
        .withState(convertMantisJobStateToWorkerState(writeable.getState()))
        .withWorkerIndex(writeable.getWorkerIndex())
        .withWorkerNumber(writeable.getWorkerNumber())
        .withJobCompletedReason(writeable.getReason())
        .withPreferredCluster(writeable.getCluster())
        .withLifecycleEventsPublisher(eventPublisher)
        .build();

    if (logger.isDebugEnabled()) {
        logger.debug("DataFormatAdatper:converted worker {}", result);
    }
    return result;
}
use of io.mantisrx.common.WorkerPorts in project mantis by Netflix.
the class JobClusterManagerTest method testBootStrapJobClustersAndJobs.
/**
 * Verifies that a freshly created {@code JobClustersManagerActor}, backed by the same (spied) job store,
 * restores previously created clusters and jobs when it is initialized.
 *
 * Scenario:
 * 1. Create one cluster without jobs and three clusters with one job each.
 * 2. Send launched / start-initiated / heartbeat events for job 1 and check it reports Launched.
 * 3. Kill job 2 and send its worker-terminated event (exercises the archive path).
 * 4. Stop the actor, create a new one on the same store and initialize it.
 * 5. Check cluster config, job states (job 1 Launched, job 2 Completed, job 3 Accepted), worker lookups,
 *    the last-submitted-job stream, and the expected scheduler / job store interactions.
 */
@Test
public void testBootStrapJobClustersAndJobs() {
    TestKit probe = new TestKit(system);
    JobTestHelper.deleteAllFiles();
    MantisJobStore jobStore = new MantisJobStore(new MantisStorageProviderAdapter(new io.mantisrx.server.master.store.SimpleCachedFileStorageProvider(), eventPublisher));
    MantisJobStore jobStoreSpied = Mockito.spy(jobStore);
    MantisScheduler schedulerMock = mock(MantisScheduler.class);
    ActorRef jobClusterManagerActor = system.actorOf(JobClustersManagerActor.props(jobStoreSpied, eventPublisher));
    jobClusterManagerActor.tell(new JobClusterManagerProto.JobClustersManagerInitialize(schedulerMock, false), probe.getRef());
    // Consume the initialize response so it does not interfere with later expectMsgClass calls.
    probe.expectMsgClass(Duration.of(10, ChronoUnit.MINUTES), JobClustersManagerInitializeResponse.class);
    List<String> clusterNames = Lists.newArrayList("testBootStrapJobClustersAndJobs1", "testBootStrapJobClustersAndJobs2", "testBootStrapJobClustersAndJobs3");
    String clusterWithNoJob = "testBootStrapJobClusterWithNoJob";
    createJobClusterAndAssert(jobClusterManagerActor, clusterWithNoJob);
    WorkerMigrationConfig migrationConfig = new WorkerMigrationConfig(MigrationStrategyEnum.PERCENTAGE, "{\"percentToMove\":60, \"intervalMs\":30000}");
    // Create 3 clusters and submit 1 job each
    for (String cluster : clusterNames) {
        createJobClusterAndAssert(jobClusterManagerActor, cluster, migrationConfig);
        submitJobAndAssert(jobClusterManagerActor, cluster);
        if (cluster.equals("testBootStrapJobClustersAndJobs1")) {
            // send worker events for job 1 so it goes to started state
            String jobId = "testBootStrapJobClustersAndJobs1-1";
            WorkerId workerId = new WorkerId(jobId, 0, 1);
            WorkerEvent launchedEvent = new WorkerLaunched(workerId, 0, "host1", "vm1", empty(), new WorkerPorts(Lists.newArrayList(8000, 9000, 9010, 9020, 9030)));
            jobClusterManagerActor.tell(launchedEvent, probe.getRef());
            WorkerEvent startInitEvent = new WorkerStatus(new Status(workerId.getJobId(), 1, workerId.getWorkerIndex(), workerId.getWorkerNum(), TYPE.INFO, "test START_INIT", MantisJobState.StartInitiated));
            jobClusterManagerActor.tell(startInitEvent, probe.getRef());
            WorkerEvent heartBeat = new WorkerHeartbeat(new Status(jobId, 1, workerId.getWorkerIndex(), workerId.getWorkerNum(), TYPE.HEARTBEAT, "", MantisJobState.Started));
            jobClusterManagerActor.tell(heartBeat, probe.getRef());
            // get Job status
            jobClusterManagerActor.tell(new GetJobDetailsRequest("user", JobId.fromId(jobId).get()), probe.getRef());
            GetJobDetailsResponse resp2 = probe.expectMsgClass(GetJobDetailsResponse.class);
            // Ensure its launched
            assertEquals(SUCCESS, resp2.responseCode);
            assertEquals(JobState.Launched, resp2.getJobMetadata().get().getState());
        }
    }
    // kill 1 of the jobs to test archive path
    JobClusterManagerProto.KillJobRequest killRequest = new JobClusterManagerProto.KillJobRequest("testBootStrapJobClustersAndJobs2-1", JobCompletedReason.Killed.toString(), "njoshi");
    jobClusterManagerActor.tell(killRequest, probe.getRef());
    JobClusterManagerProto.KillJobResponse killJobResponse = probe.expectMsgClass(JobClusterManagerProto.KillJobResponse.class);
    assertEquals(SUCCESS, killJobResponse.responseCode);
    JobTestHelper.sendWorkerTerminatedEvent(probe, jobClusterManagerActor, "testBootStrapJobClustersAndJobs2-1", new WorkerId("testBootStrapJobClustersAndJobs2-1", 0, 1));
    try {
        // Give the actor a moment to process the termination before it is stopped.
        Thread.sleep(500);
    } catch (InterruptedException e) {
        // Preserve the interrupt status instead of silently swallowing it.
        Thread.currentThread().interrupt();
    }
    // Stop job cluster Manager Actor
    system.stop(jobClusterManagerActor);
    // create new instance
    jobClusterManagerActor = system.actorOf(JobClustersManagerActor.props(jobStoreSpied, eventPublisher));
    // initialize it
    jobClusterManagerActor.tell(new JobClusterManagerProto.JobClustersManagerInitialize(schedulerMock, true), probe.getRef());
    JobClustersManagerInitializeResponse initializeResponse = probe.expectMsgClass(JobClustersManagerInitializeResponse.class);
    assertEquals(SUCCESS, initializeResponse.responseCode);
    // Get Cluster Config
    jobClusterManagerActor.tell(new GetJobClusterRequest("testBootStrapJobClustersAndJobs1"), probe.getRef());
    GetJobClusterResponse clusterResponse = probe.expectMsgClass(GetJobClusterResponse.class);
    assertEquals(SUCCESS, clusterResponse.responseCode);
    assertTrue(clusterResponse.getJobCluster().isPresent());
    WorkerMigrationConfig mConfig = clusterResponse.getJobCluster().get().getMigrationConfig();
    assertEquals(migrationConfig.getStrategy(), mConfig.getStrategy());
    // Compare the expected config string against the one restored from the store.
    assertEquals(migrationConfig.getConfigString(), mConfig.getConfigString());
    // get Job status
    jobClusterManagerActor.tell(new GetJobDetailsRequest("user", JobId.fromId("testBootStrapJobClustersAndJobs1-1").get()), probe.getRef());
    GetJobDetailsResponse resp2 = probe.expectMsgClass(GetJobDetailsResponse.class);
    // Ensure its launched
    System.out.println("Resp2 -> " + resp2.message);
    assertEquals(SUCCESS, resp2.responseCode);
    assertEquals(JobState.Launched, resp2.getJobMetadata().get().getState());
    // 1 jobs should be in completed state
    jobClusterManagerActor.tell(new GetJobDetailsRequest("user", JobId.fromId("testBootStrapJobClustersAndJobs2-1").get()), probe.getRef());
    resp2 = probe.expectMsgClass(Duration.of(10, ChronoUnit.MINUTES), GetJobDetailsResponse.class);
    // Ensure its completed
    assertEquals(SUCCESS, resp2.responseCode);
    assertEquals(JobState.Completed, resp2.getJobMetadata().get().getState());
    jobClusterManagerActor.tell(new GetJobDetailsRequest("user", JobId.fromId("testBootStrapJobClustersAndJobs3-1").get()), probe.getRef());
    resp2 = probe.expectMsgClass(Duration.of(10, ChronoUnit.MINUTES), GetJobDetailsResponse.class);
    // Ensure its Accepted
    assertEquals(SUCCESS, resp2.responseCode);
    assertEquals(JobState.Accepted, resp2.getJobMetadata().get().getState());
    try {
        Optional<JobWorker> workerByIndex = resp2.getJobMetadata().get().getWorkerByIndex(1, 0);
        assertTrue(workerByIndex.isPresent());
        Optional<IMantisStageMetadata> stageMetadata = resp2.getJobMetadata().get().getStageMetadata(1);
        assertTrue(stageMetadata.isPresent());
        JobWorker workerByIndex1 = stageMetadata.get().getWorkerByIndex(0);
        System.out.println("Got worker by index : " + workerByIndex1);
        Optional<JobWorker> worker = resp2.getJobMetadata().get().getWorkerByNumber(1);
        assertTrue(worker.isPresent());
    } catch (io.mantisrx.server.master.persistence.exceptions.InvalidJobException e) {
        // A lookup failure means the restored job metadata is broken; do not let the test pass silently.
        e.printStackTrace();
        fail("Unexpected InvalidJobException: " + e.getMessage());
    }
    jobClusterManagerActor.tell(new GetLastSubmittedJobIdStreamRequest("testBootStrapJobClustersAndJobs1"), probe.getRef());
    GetLastSubmittedJobIdStreamResponse lastSubmittedJobIdStreamResponse = probe.expectMsgClass(Duration.of(10, ChronoUnit.MINUTES), GetLastSubmittedJobIdStreamResponse.class);
    lastSubmittedJobIdStreamResponse.getjobIdBehaviorSubject().get().take(1).toBlocking().subscribe((jId) -> {
        assertEquals(new JobId("testBootStrapJobClustersAndJobs1", 1), jId);
    });
    jobClusterManagerActor.tell(new GetJobClusterRequest(clusterWithNoJob), probe.getRef());
    GetJobClusterResponse jobClusterResponse = probe.expectMsgClass(Duration.of(10, ChronoUnit.MINUTES), GetJobClusterResponse.class);
    assertEquals(SUCCESS, jobClusterResponse.responseCode);
    assertTrue(jobClusterResponse.getJobCluster().isPresent());
    assertEquals(clusterWithNoJob, jobClusterResponse.getJobCluster().get().getName());
    // 1 running worker
    verify(schedulerMock, timeout(100_000).times(1)).initializeRunningWorker(any(), any());
    // 4 scheduleWorker calls in total on the scheduler mock shared by both actor instances
    // (presumably the 3 initial submissions plus 1 re-schedule after the restart -- TODO confirm)
    verify(schedulerMock, timeout(100_000).times(4)).scheduleWorker(any());
    try {
        Mockito.verify(jobStoreSpied).loadAllArchivedJobsAsync();
        Mockito.verify(jobStoreSpied).loadAllActiveJobs();
        Mockito.verify(jobStoreSpied).loadAllCompletedJobs();
        Mockito.verify(jobStoreSpied).archiveWorker(any());
        Mockito.verify(jobStoreSpied).archiveJob(any());
    } catch (IOException e) {
        e.printStackTrace();
        fail();
    }
}
use of io.mantisrx.common.WorkerPorts in project mantis by Netflix.
the class JobTestHelper method sendWorkerLaunchedEvent.
/**
 * Sends a {@link WorkerLaunched} event for the given worker to the job actor, using the probe as sender.
 *
 * The event always carries the fixed host "host1", VM id "vm1", no cluster attribute and the
 * ports 8000, 9000, 9010, 9020, 9030.
 *
 * @param probe     test kit whose ref is used as the sender of the message.
 * @param jobActor  actor that receives the launched event.
 * @param workerId2 id of the worker reported as launched.
 * @param stageNo   stage number of the worker.
 */
public static void sendWorkerLaunchedEvent(final TestKit probe, final ActorRef jobActor, WorkerId workerId2, int stageNo) {
    final WorkerPorts fixedPorts = new WorkerPorts(Lists.newArrayList(8000, 9000, 9010, 9020, 9030));
    jobActor.tell(
        new WorkerLaunched(workerId2, stageNo, "host1", "vm1", Optional.empty(), fixedPorts),
        probe.getRef());
}
use of io.mantisrx.common.WorkerPorts in project mantis by Netflix.
the class VirtualMachineWorkerServiceLocalImpl method createExecuteStageRequest.
/**
 * Builds an {@link ExecuteStageRequest} from {@code workerInfo} plus a set of hard-coded values
 * (job jar URL, ports, parameters and scheduling info) and wraps it together with a new
 * {@code PublishSubject<Boolean>}.
 *
 * Only stage 1 is registered in the scheduling info; a stage 0 definition is built but not added.
 *
 * @return the wrapped execute stage request.
 * @throws MalformedURLException if the hard-coded job jar URL cannot be parsed.
 */
private WrappedExecuteStageRequest createExecuteStageRequest() throws MalformedURLException {
    // TODO make ExecuteStageRequest params configurable
    final long startReportTimeoutSecs = 5;
    final URL jarLocation = new URL("file:/Users/nmahilani/Projects/Mantis/mantis-sdk/examples/sine-function/build/distributions/sine-function-1.0.zip");
    final List<Integer> stagePorts = Arrays.asList(31015, 31013, 31014);
    final List<Parameter> jobParams = Collections.singletonList(new Parameter("useRandom", "true"));
    final int instanceCount = 1;

    // Stage 0 definition is created but currently not put into the scheduling map.
    final StageSchedulingInfo stage0Info = StageSchedulingInfo.builder()
        .numberOfInstances(instanceCount)
        .machineDefinition(MachineDefinitions.micro())
        .build();

    // Stage 1: scalable, with a single memory-based scaling strategy.
    final StageScalingPolicy.Strategy memoryStrategy = new StageScalingPolicy.Strategy(
        StageScalingPolicy.ScalingReason.Memory, 15.0, 25.0, new StageScalingPolicy.RollingCount(1, 2));
    final StageScalingPolicy stage1Scaling = new StageScalingPolicy(
        1, 1, 5, 1, 1, 30,
        Collections.singletonMap(StageScalingPolicy.ScalingReason.Memory, memoryStrategy));
    final StageSchedulingInfo stage1Info = StageSchedulingInfo.builder()
        .numberOfInstances(instanceCount)
        .machineDefinition(new MachineDefinition(2, 300, 200, 1024, 2))
        .scalingPolicy(stage1Scaling)
        .scalable(true)
        .build();

    final Map<Integer, StageSchedulingInfo> stagesByNumber = new HashMap<>();
    stagesByNumber.put(1, stage1Info);
    final SchedulingInfo schedulingInfo = new SchedulingInfo(stagesByNumber);

    final WorkerPorts workerPorts = new WorkerPorts(Arrays.asList(7151, 7152, 7153, 7154, 7155));
    final ExecuteStageRequest request = new ExecuteStageRequest(
        workerInfo.getJobName(),
        workerInfo.getJobId(),
        workerInfo.getWorkerIndex(),
        workerInfo.getWorkerNumber(),
        jarLocation,
        workerInfo.getStageNumber(),
        workerInfo.getNumStages(),
        stagePorts,
        startReportTimeoutSecs,
        workerInfo.getMetricsPort(),
        jobParams,
        schedulingInfo,
        MantisJobDurationType.Transient,
        0L,
        0L,
        workerPorts);

    final PublishSubject<Boolean> requestSubject = PublishSubject.create();
    return new WrappedExecuteStageRequest(requestSubject, request);
}
Aggregations