Search in sources :

Example 1 with WorkerHeartbeat

use of io.mantisrx.master.jobcluster.job.worker.WorkerHeartbeat in project mantis by Netflix.

the class JobRoute method getJobRoutes.

private Route getJobRoutes() {
    return route(path(STATUS_ENDPOINT, () -> post(() -> decodeRequest(() -> entity(Unmarshaller.entityToString(), req -> {
        if (logger.isDebugEnabled()) {
            logger.debug("/api/postjobstatus called {}", req);
        }
        try {
            workerHeartbeatStatusPOST.increment();
            PostJobStatusRequest postJobStatusRequest = Jackson.fromJSON(req, PostJobStatusRequest.class);
            WorkerEvent workerStatusRequest = createWorkerStatusRequest(postJobStatusRequest);
            if (workerStatusRequest instanceof WorkerHeartbeat) {
                if (!ConfigurationProvider.getConfig().isHeartbeatProcessingEnabled()) {
                    // skip heartbeat processing
                    if (logger.isTraceEnabled()) {
                        logger.trace("skipped heartbeat event {}", workerStatusRequest);
                    }
                    workerHeartbeatSkipped.increment();
                    return complete(StatusCodes.OK);
                }
            }
            return completeWithFuture(jobRouteHandler.workerStatus(workerStatusRequest).thenApply(this::toHttpResponse));
        } catch (IOException e) {
            logger.warn("Error handling job status {}", req, e);
            return complete(StatusCodes.BAD_REQUEST, "{\"error\": \"invalid JSON payload to post job status\"}");
        }
    })))), pathPrefix(API_JOBS, () -> route(post(() -> route(path(KILL_ENDPOINT, () -> decodeRequest(() -> entity(Unmarshaller.entityToString(), req -> {
        logger.debug("/api/jobs/kill called {}", req);
        try {
            final KillJobRequest killJobRequest = Jackson.fromJSON(req, KillJobRequest.class);
            return completeWithFuture(jobRouteHandler.kill(killJobRequest).thenApply(resp -> {
                if (resp.responseCode == BaseResponse.ResponseCode.SUCCESS) {
                    return new JobClusterManagerProto.KillJobResponse(resp.requestId, resp.responseCode, resp.getState(), "[\"" + resp.getJobId().getId() + " Killed\"]", resp.getJobId(), resp.getUser());
                } else if (resp.responseCode == BaseResponse.ResponseCode.CLIENT_ERROR) {
                    // for backwards compatibility with old master
                    return new JobClusterManagerProto.KillJobResponse(resp.requestId, BaseResponse.ResponseCode.SUCCESS, resp.getState(), "[\"" + resp.message + " \"]", resp.getJobId(), resp.getUser());
                }
                return resp;
            }).thenApply(this::toHttpResponse));
        } catch (IOException e) {
            logger.warn("Error on job kill {}", req, e);
            return complete(StatusCodes.BAD_REQUEST, "{\"error\": \"invalid json payload to kill job\"}");
        }
    }))), path(RESUBMIT_WORKER_ENDPOINT, () -> decodeRequest(() -> entity(Unmarshaller.entityToString(), req -> {
        logger.debug("/api/jobs/resubmitWorker called {}", req);
        try {
            final ResubmitWorkerRequest resubmitWorkerRequest = Jackson.fromJSON(req, ResubmitWorkerRequest.class);
            return completeWithFuture(jobRouteHandler.resubmitWorker(resubmitWorkerRequest).thenApply(this::toHttpResponse));
        } catch (IOException e) {
            logger.warn("Error on worker resubmit {}", req, e);
            return complete(StatusCodes.BAD_REQUEST, "{\"error\": \"invalid json payload to resubmit worker\"}");
        }
    }))), path(SCALE_STAGE_ENDPOINT, () -> decodeRequest(() -> entity(Unmarshaller.entityToString(), req -> {
        logger.debug("/api/jobs/scaleStage called {}", req);
        try {
            ScaleStageRequest scaleStageRequest = Jackson.fromJSON(req, ScaleStageRequest.class);
            int numWorkers = scaleStageRequest.getNumWorkers();
            int maxWorkersPerStage = ConfigurationProvider.getConfig().getMaxWorkersPerStage();
            if (numWorkers > maxWorkersPerStage) {
                logger.warn("rejecting ScaleStageRequest {} with invalid num workers", scaleStageRequest);
                return complete(StatusCodes.BAD_REQUEST, "{\"error\": \"num workers must be less than " + maxWorkersPerStage + "\"}");
            }
            return completeWithFuture(jobRouteHandler.scaleStage(scaleStageRequest).thenApply(this::toHttpResponse));
        } catch (IOException e) {
            logger.warn("Error scaling stage {}", req, e);
            return complete(StatusCodes.BAD_REQUEST, "{\"error\": \"invalid json payload to scale stage " + e.getMessage() + "\"}");
        }
    }))))), get(() -> route(// - optional labels.op query param - default value is 'or' if not specified (other possible value is 'and'
    path(segment("list"), () -> {
        jobListGET.increment();
        return jobListRoute(Optional.empty());
    }), path(segment("list").slash("matchinglabels"), () -> {
        jobListLabelMatchGET.increment();
        return jobListRoute(Optional.empty());
    }), path(segment("list").slash(PathMatchers.segment()), (jobId) -> {
        logger.debug("/api/jobs/list/{} called", jobId);
        jobListJobIdGET.increment();
        return completeAsync(jobRouteHandler.getJobDetails(new JobClusterManagerProto.GetJobDetailsRequest("masterAPI", jobId)), resp -> {
            Optional<MantisJobMetadataView> mantisJobMetadataView = resp.getJobMetadata().map(metaData -> new MantisJobMetadataView(metaData, Collections.emptyList(), Collections.emptyList(), Collections.emptyList(), Collections.emptyList(), false));
            return completeOK(mantisJobMetadataView, Jackson.marshaller());
        });
    }), path(segment("list").slash("matching").slash(PathMatchers.segment()), (regex) -> {
        jobListRegexGET.increment();
        return jobListRoute(Optional.ofNullable(regex).filter(r -> !r.isEmpty()));
    }), path(segment("archived").slash(PathMatchers.segment()), (jobId) -> parameterOptional(StringUnmarshallers.INTEGER, "limit", (limit) -> {
        jobArchivedWorkersGET.increment();
        Optional<JobId> jobIdO = JobId.fromId(jobId);
        if (jobIdO.isPresent()) {
            ListArchivedWorkersRequest req = new ListArchivedWorkersRequest(jobIdO.get(), limit.orElse(DEFAULT_LIST_ARCHIVED_WORKERS_LIMIT));
            return alwaysCache(cache, requestUriKeyer, () -> extractUri(uri -> completeAsync(jobRouteHandler.listArchivedWorkers(req), resp -> {
                List<MantisWorkerMetadataWritable> workers = resp.getWorkerMetadata().stream().map(wm -> DataFormatAdapter.convertMantisWorkerMetadataToMantisWorkerMetadataWritable(wm)).collect(Collectors.toList());
                return completeOK(workers, Jackson.marshaller());
            })));
        } else {
            return complete(StatusCodes.BAD_REQUEST, "error: 'archived/<jobId>' request must include a valid jobId");
        }
    })), path(segment("archived"), () -> {
        jobArchivedWorkersGETInvalid.increment();
        return complete(StatusCodes.BAD_REQUEST, "error: 'archived' Request must include jobId");
    }))))));
}
Also used : JobId(io.mantisrx.server.master.domain.JobId) Uri(akka.http.javadsl.model.Uri) MasterConfiguration(io.mantisrx.server.master.config.MasterConfiguration) Arrays(java.util.Arrays) JavaPartialFunction(akka.japi.JavaPartialFunction) Cache(akka.http.caching.javadsl.Cache) JobRouteHandler(io.mantisrx.master.api.akka.route.handlers.JobRouteHandler) LoggerFactory(org.slf4j.LoggerFactory) RequestContext(akka.http.javadsl.server.RequestContext) PathMatchers.segment(akka.http.javadsl.server.PathMatchers.segment) StringUnmarshallers(akka.http.javadsl.unmarshalling.StringUnmarshallers) PathMatcher0(akka.http.javadsl.server.PathMatcher0) ListArchivedWorkersRequest(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.ListArchivedWorkersRequest) JobClusterProtoAdapter(io.mantisrx.master.api.akka.route.proto.JobClusterProtoAdapter) Unmarshaller(akka.http.javadsl.unmarshalling.Unmarshaller) Metrics(io.mantisrx.common.metrics.Metrics) ResubmitWorkerRequest(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.ResubmitWorkerRequest) PostJobStatusRequest(io.mantisrx.server.core.PostJobStatusRequest) Jackson(io.mantisrx.master.api.akka.route.Jackson) BaseResponse(io.mantisrx.master.jobcluster.proto.BaseResponse) ScaleStageRequest(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.ScaleStageRequest) HttpMethods(akka.http.javadsl.model.HttpMethods) Collectors(java.util.stream.Collectors) ExceptionHandler(akka.http.javadsl.server.ExceptionHandler) List(java.util.List) JobRouteUtils.createListJobIdsRequest(io.mantisrx.master.api.akka.route.utils.JobRouteUtils.createListJobIdsRequest) CachingDirectives.alwaysCache(akka.http.javadsl.server.directives.CachingDirectives.alwaysCache) ActorSystem(akka.actor.ActorSystem) ConfigurationProvider(io.mantisrx.server.master.config.ConfigurationProvider) Optional(java.util.Optional) DataFormatAdapter(io.mantisrx.server.master.domain.DataFormatAdapter) MantisJobMetadataView(io.mantisrx.master.jobcluster.job.MantisJobMetadataView) MantisWorkerMetadataWritable(io.mantisrx.server.master.store.MantisWorkerMetadataWritable) JobRouteUtils.createListJobsRequest(io.mantisrx.master.api.akka.route.utils.JobRouteUtils.createListJobsRequest) Route(akka.http.javadsl.server.Route) MetricsRegistry(io.mantisrx.common.metrics.MetricsRegistry) StatusCodes(akka.http.javadsl.model.StatusCodes) Function(java.util.function.Function) PathMatchers(akka.http.javadsl.server.PathMatchers) WorkerEvent(io.mantisrx.server.master.scheduler.WorkerEvent) KillJobRequest(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.KillJobRequest) Counter(io.mantisrx.common.metrics.Counter) Logger(org.slf4j.Logger) RouteResult(akka.http.javadsl.server.RouteResult) HttpHeader(akka.http.javadsl.model.HttpHeader) HttpRequest(akka.http.javadsl.model.HttpRequest) IOException(java.io.IOException) DEFAULT_LIST_ARCHIVED_WORKERS_LIMIT(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.ListArchivedWorkersRequest.DEFAULT_LIST_ARCHIVED_WORKERS_LIMIT) JobClusterManagerProto(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto) JobRouteUtils.createWorkerStatusRequest(io.mantisrx.master.api.akka.route.utils.JobRouteUtils.createWorkerStatusRequest) Collections(java.util.Collections) WorkerHeartbeat(io.mantisrx.master.jobcluster.job.worker.WorkerHeartbeat) Optional(java.util.Optional) PostJobStatusRequest(io.mantisrx.server.core.PostJobStatusRequest) IOException(java.io.IOException) ScaleStageRequest(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.ScaleStageRequest) WorkerHeartbeat(io.mantisrx.master.jobcluster.job.worker.WorkerHeartbeat) MantisJobMetadataView(io.mantisrx.master.jobcluster.job.MantisJobMetadataView) WorkerEvent(io.mantisrx.server.master.scheduler.WorkerEvent) ResubmitWorkerRequest(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.ResubmitWorkerRequest) KillJobRequest(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.KillJobRequest) MantisWorkerMetadataWritable(io.mantisrx.server.master.store.MantisWorkerMetadataWritable) ListArchivedWorkersRequest(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.ListArchivedWorkersRequest) JobId(io.mantisrx.server.master.domain.JobId) JobClusterManagerProto(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto)

Example 2 with WorkerHeartbeat

use of io.mantisrx.master.jobcluster.job.worker.WorkerHeartbeat in project mantis by Netflix.

the class JobClusterTest method testZombieWorkerKilledOnMessage.

@Test
public void testZombieWorkerKilledOnMessage() {
    String clusterName = "testZombieWorkerKilledOnMessage";
    TestKit probe = new TestKit(system);
    MantisScheduler schedulerMock = mock(MantisScheduler.class);
    MantisJobStore jobStoreMock = mock(MantisJobStore.class);
    final JobClusterDefinitionImpl fakeJobCluster = createFakeJobClusterDefn(clusterName);
    ActorRef jobClusterActor = system.actorOf(props(clusterName, jobStoreMock, schedulerMock, eventPublisher));
    jobClusterActor.tell(new JobClusterProto.InitializeJobClusterRequest(fakeJobCluster, user, probe.getRef()), probe.getRef());
    JobClusterProto.InitializeJobClusterResponse createResp = probe.expectMsgClass(JobClusterProto.InitializeJobClusterResponse.class);
    assertEquals(SUCCESS, createResp.responseCode);
    try {
        String jobId = clusterName + "-1";
        WorkerId workerId = new WorkerId(clusterName, jobId, 0, 1);
        WorkerEvent heartBeat2 = new WorkerHeartbeat(new Status(jobId, 1, workerId.getWorkerIndex(), workerId.getWorkerNum(), TYPE.HEARTBEAT, "", MantisJobState.Started, System.currentTimeMillis()));
        jobClusterActor.tell(heartBeat2, probe.getRef());
        jobClusterActor.tell(new GetJobClusterRequest(clusterName), probe.getRef());
        GetJobClusterResponse resp = probe.expectMsgClass(GetJobClusterResponse.class);
        assertEquals(clusterName, resp.getJobCluster().get().getName());
        verify(schedulerMock, times(1)).unscheduleAndTerminateWorker(workerId, empty());
    } catch (Exception e) {
        e.printStackTrace();
        fail();
    }
}
Also used : Status(io.mantisrx.server.core.Status) JobClusterProto(io.mantisrx.master.jobcluster.proto.JobClusterProto) GetJobClusterResponse(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.GetJobClusterResponse) ActorRef(akka.actor.ActorRef) MantisScheduler(io.mantisrx.server.master.scheduler.MantisScheduler) Matchers.anyString(org.mockito.Matchers.anyString) TestKit(akka.testkit.javadsl.TestKit) WorkerId(io.mantisrx.server.core.domain.WorkerId) InvalidJobException(io.mantisrx.runtime.command.InvalidJobException) WorkerHeartbeat(io.mantisrx.master.jobcluster.job.worker.WorkerHeartbeat) MantisJobStore(io.mantisrx.server.master.persistence.MantisJobStore) WorkerEvent(io.mantisrx.server.master.scheduler.WorkerEvent) GetJobClusterRequest(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.GetJobClusterRequest) Test(org.junit.Test)

Example 3 with WorkerHeartbeat

use of io.mantisrx.master.jobcluster.job.worker.WorkerHeartbeat in project mantis by Netflix.

the class JobClusterManagerTest method testBootStrapJobClustersAndJobs.

@Test
public void testBootStrapJobClustersAndJobs() {
    TestKit probe = new TestKit(system);
    JobTestHelper.deleteAllFiles();
    MantisJobStore jobStore = new MantisJobStore(new MantisStorageProviderAdapter(new io.mantisrx.server.master.store.SimpleCachedFileStorageProvider(), eventPublisher));
    MantisJobStore jobStoreSpied = Mockito.spy(jobStore);
    MantisScheduler schedulerMock = mock(MantisScheduler.class);
    ActorRef jobClusterManagerActor = system.actorOf(JobClustersManagerActor.props(jobStoreSpied, eventPublisher));
    jobClusterManagerActor.tell(new JobClusterManagerProto.JobClustersManagerInitialize(schedulerMock, false), probe.getRef());
    JobClustersManagerInitializeResponse iResponse = probe.expectMsgClass(Duration.of(10, ChronoUnit.MINUTES), JobClustersManagerInitializeResponse.class);
    List<String> clusterNames = Lists.newArrayList("testBootStrapJobClustersAndJobs1", "testBootStrapJobClustersAndJobs2", "testBootStrapJobClustersAndJobs3");
    String clusterWithNoJob = "testBootStrapJobClusterWithNoJob";
    createJobClusterAndAssert(jobClusterManagerActor, clusterWithNoJob);
    WorkerMigrationConfig migrationConfig = new WorkerMigrationConfig(MigrationStrategyEnum.PERCENTAGE, "{\"percentToMove\":60, \"intervalMs\":30000}");
    // Create 3 clusters and submit 1 job each
    for (String cluster : clusterNames) {
        createJobClusterAndAssert(jobClusterManagerActor, cluster, migrationConfig);
        submitJobAndAssert(jobClusterManagerActor, cluster);
        if (cluster.equals("testBootStrapJobClustersAndJobs1")) {
            // send worker events for job 1 so it goes to started state
            String jobId = "testBootStrapJobClustersAndJobs1-1";
            WorkerId workerId = new WorkerId(jobId, 0, 1);
            WorkerEvent launchedEvent = new WorkerLaunched(workerId, 0, "host1", "vm1", empty(), new WorkerPorts(Lists.newArrayList(8000, 9000, 9010, 9020, 9030)));
            jobClusterManagerActor.tell(launchedEvent, probe.getRef());
            WorkerEvent startInitEvent = new WorkerStatus(new Status(workerId.getJobId(), 1, workerId.getWorkerIndex(), workerId.getWorkerNum(), TYPE.INFO, "test START_INIT", MantisJobState.StartInitiated));
            jobClusterManagerActor.tell(startInitEvent, probe.getRef());
            WorkerEvent heartBeat = new WorkerHeartbeat(new Status(jobId, 1, workerId.getWorkerIndex(), workerId.getWorkerNum(), TYPE.HEARTBEAT, "", MantisJobState.Started));
            jobClusterManagerActor.tell(heartBeat, probe.getRef());
            // get Job status
            jobClusterManagerActor.tell(new GetJobDetailsRequest("user", JobId.fromId(jobId).get()), probe.getRef());
            GetJobDetailsResponse resp2 = probe.expectMsgClass(GetJobDetailsResponse.class);
            // Ensure its launched
            assertEquals(SUCCESS, resp2.responseCode);
            assertEquals(JobState.Launched, resp2.getJobMetadata().get().getState());
        }
    }
    // kill 1 of the jobs to test archive path
    JobClusterManagerProto.KillJobRequest killRequest = new JobClusterManagerProto.KillJobRequest("testBootStrapJobClustersAndJobs2-1", JobCompletedReason.Killed.toString(), "njoshi");
    jobClusterManagerActor.tell(killRequest, probe.getRef());
    JobClusterManagerProto.KillJobResponse killJobResponse = probe.expectMsgClass(JobClusterManagerProto.KillJobResponse.class);
    assertEquals(SUCCESS, killJobResponse.responseCode);
    JobTestHelper.sendWorkerTerminatedEvent(probe, jobClusterManagerActor, "testBootStrapJobClustersAndJobs2-1", new WorkerId("testBootStrapJobClustersAndJobs2-1", 0, 1));
    try {
        Thread.sleep(500);
    } catch (InterruptedException e) {
        e.printStackTrace();
    }
    // Stop job cluster Manager Actor
    system.stop(jobClusterManagerActor);
    // create new instance
    jobClusterManagerActor = system.actorOf(JobClustersManagerActor.props(jobStoreSpied, eventPublisher));
    // initialize it
    jobClusterManagerActor.tell(new JobClusterManagerProto.JobClustersManagerInitialize(schedulerMock, true), probe.getRef());
    JobClustersManagerInitializeResponse initializeResponse = probe.expectMsgClass(JobClustersManagerInitializeResponse.class);
    // probe.expectMsgClass(Duration.of(10, ChronoUnit.MINUTES),JobClusterManagerProto.JobClustersManagerInitializeResponse.class);
    // probe.expectMsgClass(JobClusterManagerProto.JobClustersManagerInitializeResponse.class);
    assertEquals(SUCCESS, initializeResponse.responseCode);
    // Get Cluster Config
    jobClusterManagerActor.tell(new GetJobClusterRequest("testBootStrapJobClustersAndJobs1"), probe.getRef());
    GetJobClusterResponse clusterResponse = probe.expectMsgClass(GetJobClusterResponse.class);
    assertEquals(SUCCESS, clusterResponse.responseCode);
    assertTrue(clusterResponse.getJobCluster().isPresent());
    WorkerMigrationConfig mConfig = clusterResponse.getJobCluster().get().getMigrationConfig();
    assertEquals(migrationConfig.getStrategy(), mConfig.getStrategy());
    assertEquals(migrationConfig.getConfigString(), migrationConfig.getConfigString());
    // get Job status
    jobClusterManagerActor.tell(new GetJobDetailsRequest("user", JobId.fromId("testBootStrapJobClustersAndJobs1-1").get()), probe.getRef());
    GetJobDetailsResponse resp2 = probe.expectMsgClass(GetJobDetailsResponse.class);
    // Ensure its launched
    System.out.println("Resp2 -> " + resp2.message);
    assertEquals(SUCCESS, resp2.responseCode);
    assertEquals(JobState.Launched, resp2.getJobMetadata().get().getState());
    // 1 jobs should be in completed state
    jobClusterManagerActor.tell(new GetJobDetailsRequest("user", JobId.fromId("testBootStrapJobClustersAndJobs2-1").get()), probe.getRef());
    resp2 = probe.expectMsgClass(Duration.of(10, ChronoUnit.MINUTES), GetJobDetailsResponse.class);
    // Ensure its completed
    assertEquals(SUCCESS, resp2.responseCode);
    assertEquals(JobState.Completed, resp2.getJobMetadata().get().getState());
    jobClusterManagerActor.tell(new GetJobDetailsRequest("user", JobId.fromId("testBootStrapJobClustersAndJobs3-1").get()), probe.getRef());
    resp2 = probe.expectMsgClass(Duration.of(10, ChronoUnit.MINUTES), GetJobDetailsResponse.class);
    // Ensure its Accepted
    assertEquals(SUCCESS, resp2.responseCode);
    assertEquals(JobState.Accepted, resp2.getJobMetadata().get().getState());
    try {
        Optional<JobWorker> workerByIndex = resp2.getJobMetadata().get().getWorkerByIndex(1, 0);
        assertTrue(workerByIndex.isPresent());
        Optional<IMantisStageMetadata> stageMetadata = resp2.getJobMetadata().get().getStageMetadata(1);
        assertTrue(stageMetadata.isPresent());
        JobWorker workerByIndex1 = stageMetadata.get().getWorkerByIndex(0);
        System.out.println("Got worker by index : " + workerByIndex1);
        Optional<JobWorker> worker = resp2.getJobMetadata().get().getWorkerByNumber(1);
        assertTrue(worker.isPresent());
    } catch (io.mantisrx.server.master.persistence.exceptions.InvalidJobException e) {
        e.printStackTrace();
    }
    jobClusterManagerActor.tell(new GetLastSubmittedJobIdStreamRequest("testBootStrapJobClustersAndJobs1"), probe.getRef());
    GetLastSubmittedJobIdStreamResponse lastSubmittedJobIdStreamResponse = probe.expectMsgClass(Duration.of(10, ChronoUnit.MINUTES), GetLastSubmittedJobIdStreamResponse.class);
    lastSubmittedJobIdStreamResponse.getjobIdBehaviorSubject().get().take(1).toBlocking().subscribe((jId) -> {
        assertEquals(new JobId("testBootStrapJobClustersAndJobs1", 1), jId);
    });
    jobClusterManagerActor.tell(new GetJobClusterRequest(clusterWithNoJob), probe.getRef());
    GetJobClusterResponse jobClusterResponse = probe.expectMsgClass(Duration.of(10, ChronoUnit.MINUTES), GetJobClusterResponse.class);
    assertEquals(SUCCESS, jobClusterResponse.responseCode);
    assertTrue(jobClusterResponse.getJobCluster().isPresent());
    assertEquals(clusterWithNoJob, jobClusterResponse.getJobCluster().get().getName());
    // 1 running worker
    verify(schedulerMock, timeout(100_1000).times(1)).initializeRunningWorker(any(), any());
    // 2 worker schedule requests
    verify(schedulerMock, timeout(100_000).times(4)).scheduleWorker(any());
    try {
        Mockito.verify(jobStoreSpied).loadAllArchivedJobsAsync();
        Mockito.verify(jobStoreSpied).loadAllActiveJobs();
        Mockito.verify(jobStoreSpied).loadAllCompletedJobs();
        Mockito.verify(jobStoreSpied).archiveWorker(any());
        Mockito.verify(jobStoreSpied).archiveJob(any());
    } catch (IOException e) {
        e.printStackTrace();
        fail();
    }
}
Also used : GetJobClusterResponse(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.GetJobClusterResponse) GetLastSubmittedJobIdStreamResponse(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.GetLastSubmittedJobIdStreamResponse) ActorRef(akka.actor.ActorRef) GetJobDetailsRequest(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.GetJobDetailsRequest) MantisScheduler(io.mantisrx.server.master.scheduler.MantisScheduler) WorkerMigrationConfig(io.mantisrx.runtime.WorkerMigrationConfig) GetJobDetailsResponse(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.GetJobDetailsResponse) WorkerHeartbeat(io.mantisrx.master.jobcluster.job.worker.WorkerHeartbeat) WorkerEvent(io.mantisrx.server.master.scheduler.WorkerEvent) WorkerStatus(io.mantisrx.master.jobcluster.job.worker.WorkerStatus) GetJobClusterRequest(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.GetJobClusterRequest) WorkerLaunched(io.mantisrx.server.master.scheduler.WorkerLaunched) JobId(io.mantisrx.server.master.domain.JobId) JobClusterManagerProto(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto) JobClustersManagerInitializeResponse(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.JobClustersManagerInitializeResponse) Status(io.mantisrx.server.core.Status) WorkerStatus(io.mantisrx.master.jobcluster.job.worker.WorkerStatus) TestKit(akka.testkit.javadsl.TestKit) IOException(java.io.IOException) WorkerId(io.mantisrx.server.core.domain.WorkerId) JobWorker(io.mantisrx.master.jobcluster.job.worker.JobWorker) MantisStorageProviderAdapter(io.mantisrx.server.master.persistence.MantisStorageProviderAdapter) GetLastSubmittedJobIdStreamRequest(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.GetLastSubmittedJobIdStreamRequest) MantisJobStore(io.mantisrx.server.master.persistence.MantisJobStore) WorkerPorts(io.mantisrx.common.WorkerPorts) Test(org.junit.Test)

Example 4 with WorkerHeartbeat

use of io.mantisrx.master.jobcluster.job.worker.WorkerHeartbeat in project mantis by Netflix.

the class JobClusterManagerTest method testBootstrapJobClusterAndJobsWithCorruptedWorkerPorts.

/**
 * Case for a master leader re-election when a new master re-hydrates corrupted job worker metadata.
 */
@Test
public void testBootstrapJobClusterAndJobsWithCorruptedWorkerPorts() throws IOException, io.mantisrx.server.master.persistence.exceptions.InvalidJobException {
    TestKit probe = new TestKit(system);
    JobTestHelper.deleteAllFiles();
    MantisJobStore jobStore = new MantisJobStore(new MantisStorageProviderAdapter(new io.mantisrx.server.master.store.SimpleCachedFileStorageProvider(), eventPublisher));
    MantisJobStore jobStoreSpied = Mockito.spy(jobStore);
    MantisScheduler schedulerMock = mock(MantisScheduler.class);
    ActorRef jobClusterManagerActor = system.actorOf(JobClustersManagerActor.props(jobStoreSpied, eventPublisher));
    jobClusterManagerActor.tell(new JobClusterManagerProto.JobClustersManagerInitialize(schedulerMock, false), probe.getRef());
    probe.expectMsgClass(Duration.of(10, ChronoUnit.MINUTES), JobClustersManagerInitializeResponse.class);
    String jobClusterName = "testBootStrapJobClustersAndJobs1";
    WorkerMigrationConfig migrationConfig = new WorkerMigrationConfig(MigrationStrategyEnum.PERCENTAGE, "{\"percentToMove\":60, \"intervalMs\":30000}");
    createJobClusterAndAssert(jobClusterManagerActor, jobClusterName, migrationConfig);
    submitJobAndAssert(jobClusterManagerActor, jobClusterName);
    String jobId = "testBootStrapJobClustersAndJobs1-1";
    WorkerId workerId = new WorkerId(jobId, 0, 1);
    WorkerEvent launchedEvent = new WorkerLaunched(workerId, 0, "host1", "vm1", empty(), new WorkerPorts(Lists.newArrayList(8000, 9000, 9010, 9020, 9030)));
    jobClusterManagerActor.tell(launchedEvent, probe.getRef());
    WorkerEvent startInitEvent = new WorkerStatus(new Status(workerId.getJobId(), 1, workerId.getWorkerIndex(), workerId.getWorkerNum(), TYPE.INFO, "test START_INIT", MantisJobState.StartInitiated));
    jobClusterManagerActor.tell(startInitEvent, probe.getRef());
    WorkerEvent heartBeat = new WorkerHeartbeat(new Status(jobId, 1, workerId.getWorkerIndex(), workerId.getWorkerNum(), TYPE.HEARTBEAT, "", MantisJobState.Started));
    jobClusterManagerActor.tell(heartBeat, probe.getRef());
    // get Job status
    jobClusterManagerActor.tell(new GetJobDetailsRequest("user", JobId.fromId(jobId).get()), probe.getRef());
    GetJobDetailsResponse resp2 = probe.expectMsgClass(GetJobDetailsResponse.class);
    // Ensure its launched
    assertEquals(SUCCESS, resp2.responseCode);
    JobWorker worker = new JobWorker.Builder().withWorkerIndex(0).withWorkerNumber(1).withJobId(jobId).withStageNum(1).withNumberOfPorts(5).withWorkerPorts(null).withState(WorkerState.Started).withLifecycleEventsPublisher(eventPublisher).build();
    jobStoreSpied.updateWorker(worker.getMetadata());
    // Stop job cluster Manager Actor
    system.stop(jobClusterManagerActor);
    // create new instance
    jobClusterManagerActor = system.actorOf(JobClustersManagerActor.props(jobStoreSpied, eventPublisher));
    // initialize it
    jobClusterManagerActor.tell(new JobClusterManagerProto.JobClustersManagerInitialize(schedulerMock, true), probe.getRef());
    JobClustersManagerInitializeResponse initializeResponse = probe.expectMsgClass(JobClustersManagerInitializeResponse.class);
    assertEquals(SUCCESS, initializeResponse.responseCode);
    WorkerId newWorkerId = new WorkerId(jobId, 0, 11);
    launchedEvent = new WorkerLaunched(newWorkerId, 0, "host1", "vm1", empty(), new WorkerPorts(Lists.newArrayList(8000, 9000, 9010, 9020, 9030)));
    jobClusterManagerActor.tell(launchedEvent, probe.getRef());
    // Get Cluster Config
    jobClusterManagerActor.tell(new GetJobClusterRequest("testBootStrapJobClustersAndJobs1"), probe.getRef());
    GetJobClusterResponse clusterResponse = probe.expectMsgClass(GetJobClusterResponse.class);
    assertEquals(SUCCESS, clusterResponse.responseCode);
    assertTrue(clusterResponse.getJobCluster().isPresent());
    WorkerMigrationConfig mConfig = clusterResponse.getJobCluster().get().getMigrationConfig();
    assertEquals(migrationConfig.getStrategy(), mConfig.getStrategy());
    assertEquals(migrationConfig.getConfigString(), migrationConfig.getConfigString());
    // get Job status
    jobClusterManagerActor.tell(new GetJobDetailsRequest("user", JobId.fromId("testBootStrapJobClustersAndJobs1-1").get()), probe.getRef());
    resp2 = probe.expectMsgClass(GetJobDetailsResponse.class);
    // Ensure its launched
    assertEquals(SUCCESS, resp2.responseCode);
    assertEquals(JobState.Launched, resp2.getJobMetadata().get().getState());
    IMantisWorkerMetadata mantisWorkerMetadata = resp2.getJobMetadata().get().getWorkerByIndex(1, 0).get().getMetadata();
    assertNotNull(mantisWorkerMetadata.getWorkerPorts());
    assertEquals(11, mantisWorkerMetadata.getWorkerNumber());
    assertEquals(1, mantisWorkerMetadata.getTotalResubmitCount());
    jobClusterManagerActor.tell(new GetLastSubmittedJobIdStreamRequest("testBootStrapJobClustersAndJobs1"), probe.getRef());
    GetLastSubmittedJobIdStreamResponse lastSubmittedJobIdStreamResponse = probe.expectMsgClass(Duration.of(10, ChronoUnit.MINUTES), GetLastSubmittedJobIdStreamResponse.class);
    lastSubmittedJobIdStreamResponse.getjobIdBehaviorSubject().get().take(1).toBlocking().subscribe((jId) -> {
        assertEquals(new JobId("testBootStrapJobClustersAndJobs1", 1), jId);
    });
    // Two schedules: one for the initial success, one for a resubmit from corrupted worker ports.
    verify(schedulerMock, times(2)).scheduleWorker(any());
    // One unschedule from corrupted worker ID 1 (before the resubmit).
    verify(schedulerMock, times(1)).unscheduleAndTerminateWorker(eq(workerId), any());
    try {
        Mockito.verify(jobStoreSpied).loadAllArchivedJobsAsync();
        Mockito.verify(jobStoreSpied).loadAllActiveJobs();
        Mockito.verify(jobStoreSpied).loadAllCompletedJobs();
        Mockito.verify(jobStoreSpied).archiveWorker(any());
    } catch (IOException e) {
        e.printStackTrace();
        fail();
    }
}
Also used : GetJobClusterResponse(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.GetJobClusterResponse) GetLastSubmittedJobIdStreamResponse(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.GetLastSubmittedJobIdStreamResponse) ActorRef(akka.actor.ActorRef) GetJobDetailsRequest(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.GetJobDetailsRequest) MantisScheduler(io.mantisrx.server.master.scheduler.MantisScheduler) WorkerMigrationConfig(io.mantisrx.runtime.WorkerMigrationConfig) GetJobDetailsResponse(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.GetJobDetailsResponse) WorkerHeartbeat(io.mantisrx.master.jobcluster.job.worker.WorkerHeartbeat) WorkerEvent(io.mantisrx.server.master.scheduler.WorkerEvent) WorkerStatus(io.mantisrx.master.jobcluster.job.worker.WorkerStatus) IMantisWorkerMetadata(io.mantisrx.master.jobcluster.job.worker.IMantisWorkerMetadata) GetJobClusterRequest(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.GetJobClusterRequest) WorkerLaunched(io.mantisrx.server.master.scheduler.WorkerLaunched) JobId(io.mantisrx.server.master.domain.JobId) JobClusterManagerProto(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto) JobClustersManagerInitializeResponse(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.JobClustersManagerInitializeResponse) Status(io.mantisrx.server.core.Status) WorkerStatus(io.mantisrx.master.jobcluster.job.worker.WorkerStatus) TestKit(akka.testkit.javadsl.TestKit) IOException(java.io.IOException) WorkerId(io.mantisrx.server.core.domain.WorkerId) JobWorker(io.mantisrx.master.jobcluster.job.worker.JobWorker) MantisStorageProviderAdapter(io.mantisrx.server.master.persistence.MantisStorageProviderAdapter) GetLastSubmittedJobIdStreamRequest(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.GetLastSubmittedJobIdStreamRequest) MantisJobStore(io.mantisrx.server.master.persistence.MantisJobStore) WorkerPorts(io.mantisrx.common.WorkerPorts) Test(org.junit.Test)

Example 5 with WorkerHeartbeat

use of io.mantisrx.master.jobcluster.job.worker.WorkerHeartbeat in project mantis by Netflix.

the class JobTestHelper method sendHeartBeat.

// public static void sendLaunchedInitiatedStartedEventsToWorker(final TestKit probe, final ActorRef jobActor, String jobId,
// int stageNo, WorkerId workerId2) {
// sendLaunchedInitiatedStartedEventsToWorker(probe, jobActor, jobId, stageNo, workerId2, System.currentTimeMillis() + 1000);
// }
public static void sendHeartBeat(final TestKit probe, final ActorRef jobActor, String jobId, int stageNo, WorkerId workerId2, long time) {
    WorkerEvent heartBeat2 = new WorkerHeartbeat(new Status(jobId, stageNo, workerId2.getWorkerIndex(), workerId2.getWorkerNum(), TYPE.HEARTBEAT, "", MantisJobState.Started, time));
    jobActor.tell(heartBeat2, probe.getRef());
}
Also used : WorkerStatus(io.mantisrx.master.jobcluster.job.worker.WorkerStatus) Status(io.mantisrx.server.core.Status) WorkerHeartbeat(io.mantisrx.master.jobcluster.job.worker.WorkerHeartbeat) WorkerEvent(io.mantisrx.server.master.scheduler.WorkerEvent)

Aggregations

WorkerHeartbeat (io.mantisrx.master.jobcluster.job.worker.WorkerHeartbeat)6 WorkerEvent (io.mantisrx.server.master.scheduler.WorkerEvent)6 Status (io.mantisrx.server.core.Status)5 ActorRef (akka.actor.ActorRef)3 TestKit (akka.testkit.javadsl.TestKit)3 WorkerPorts (io.mantisrx.common.WorkerPorts)3 WorkerStatus (io.mantisrx.master.jobcluster.job.worker.WorkerStatus)3 JobClusterManagerProto (io.mantisrx.master.jobcluster.proto.JobClusterManagerProto)3 GetJobClusterRequest (io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.GetJobClusterRequest)3 GetJobClusterResponse (io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.GetJobClusterResponse)3 WorkerId (io.mantisrx.server.core.domain.WorkerId)3 JobId (io.mantisrx.server.master.domain.JobId)3 MantisJobStore (io.mantisrx.server.master.persistence.MantisJobStore)3 MantisScheduler (io.mantisrx.server.master.scheduler.MantisScheduler)3 IOException (java.io.IOException)3 Test (org.junit.Test)3 WorkerLaunched (io.mantisrx.server.master.scheduler.WorkerLaunched)2 ActorSystem (akka.actor.ActorSystem)1 Cache (akka.http.caching.javadsl.Cache)1 HttpHeader (akka.http.javadsl.model.HttpHeader)1