Search in sources:

Example 1 with SLA

use of io.mantisrx.server.master.domain.SLA in project mantis by Netflix.

the class JobClusterActor method onJobClusterUpdateSLA.

/**
 * Applies a new SLA to this job cluster and reports the outcome to the requester.
 *
 * <p>Builds an {@code SLA} from the min, max, cron spec and cron policy carried by the request,
 * derives an updated cluster definition from the current one, and passes the resulting metadata
 * to {@code updateAndSaveJobCluster}. Any existing cron manager is destroyed and replaced by a
 * new one created for the new SLA. The cluster keeps its current disabled flag unless the
 * request asks for a force-enable, in which case the flag is cleared.
 *
 * <p>The sender always receives an {@code UpdateJobClusterSLAResponse}: {@code SUCCESS} when the
 * update went through, {@code CLIENT_ERROR} when an {@code IllegalArgumentException} was raised,
 * and {@code SERVER_ERROR} for any other exception. An audit event is published on success only.
 *
 * @param slaRequest the SLA update request; also supplies the request id echoed in the response
 */
@Override
public void onJobClusterUpdateSLA(UpdateJobClusterSLARequest slaRequest) {
    if (logger.isTraceEnabled()) {
        logger.trace("Enter onJobClusterUpdateSLA {}", slaRequest);
    }
    // Capture the sender before doing any work so every reply goes to the original requester.
    final ActorRef replyTo = getSender();
    try {
        final SLA requestedSla = new SLA(
                slaRequest.getMin(),
                slaRequest.getMax(),
                slaRequest.getCronSpec(),
                slaRequest.getCronPolicy());
        final JobClusterDefinitionImpl definitionWithNewSla = new JobClusterDefinitionImpl.Builder()
                .from(jobClusterMetadata.getJobClusterDefinition())
                .withSla(requestedSla)
                .build();

        // A force-enable request clears the disabled flag; otherwise the flag is carried over.
        final boolean wasDisabled = jobClusterMetadata.isDisabled();
        final boolean reEnable = slaRequest.isForceEnable() && jobClusterMetadata.isDisabled();
        final boolean disabledAfterUpdate = wasDisabled && !reEnable;

        final IJobClusterMetadata updatedMetadata = new JobClusterMetadataImpl.Builder()
                .withIsDisabled(disabledAfterUpdate)
                .withLastJobCount(jobClusterMetadata.getLastJobCount())
                .withJobClusterDefinition(definitionWithNewSla)
                .build();
        updateAndSaveJobCluster(updatedMetadata);

        // Replace the cron manager so it is driven by the new SLA.
        if (cronManager != null) {
            cronManager.destroyCron();
        }
        this.cronManager = new CronManager(name, getSelf(), requestedSla);

        final UpdateJobClusterSLAResponse success =
                new UpdateJobClusterSLAResponse(slaRequest.requestId, SUCCESS, name + " SLA updated");
        replyTo.tell(success, getSelf());

        final LifecycleEventsProto.AuditEvent auditEvent = new LifecycleEventsProto.AuditEvent(
                LifecycleEventsProto.AuditEvent.AuditEventType.JOB_CLUSTER_UPDATE,
                jobClusterMetadata.getJobClusterDefinition().getName(),
                name + " SLA update");
        eventPublisher.publishAuditEvent(auditEvent);
    } catch (IllegalArgumentException badArgument) {
        logger.error("Invalid arguement job cluster not updated ", badArgument);
        final String reason = name + " Job cluster SLA updation failed " + badArgument.getMessage();
        replyTo.tell(new UpdateJobClusterSLAResponse(slaRequest.requestId, CLIENT_ERROR, reason), getSelf());
    } catch (Exception failure) {
        logger.error("job cluster not updated ", failure);
        final String reason = name + " Job cluster SLA updation failed " + failure.getMessage();
        replyTo.tell(new UpdateJobClusterSLAResponse(slaRequest.requestId, SERVER_ERROR, reason), getSelf());
    }
    if (logger.isTraceEnabled()) {
        logger.trace("Exit onJobClusterUpdateSLA {}", slaRequest);
    }
}
Also used : ActorRef(akka.actor.ActorRef) JobClusterDefinitionImpl(io.mantisrx.server.master.domain.JobClusterDefinitionImpl) SLA(io.mantisrx.server.master.domain.SLA) TriggerNotFoundException(com.netflix.fenzo.triggers.exceptions.TriggerNotFoundException) SchedulerException(com.netflix.fenzo.triggers.exceptions.SchedulerException) JobClusterAlreadyExistsException(io.mantisrx.server.master.persistence.exceptions.JobClusterAlreadyExistsException) UpdateJobClusterSLAResponse(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.UpdateJobClusterSLAResponse) LifecycleEventsProto(io.mantisrx.master.events.LifecycleEventsProto)

Example 2 with SLA

use of io.mantisrx.server.master.domain.SLA in project mantis by Netflix.

the class JobClusterActor method initRunningJobs.

/**
 * Bootstraps the jobs listed in the initialization request and then switches this actor to its
 * initialized behavior.
 *
 * <p>Each job in {@code initReq.jobList} is handled as follows:
 * <ul>
 *   <li>a job in a terminal state is handed to
 *       {@code jobManager.persistToCompletedJobAndArchiveJobTables} and is not bootstrapped;</li>
 *   <li>a non-terminal job without scheduling info is logged as an error and skipped;</li>
 *   <li>every other job is passed to {@code jobManager.bootstrapJob}.</li>
 * </ul>
 *
 * <p>Once the stream completes, the last job id is published on {@code jobIdSubmissionSubject}
 * (only when the job list is non-empty), the bookkeeping timer is started, the actor becomes
 * {@code initializedBehavior}, and a {@code SUCCESS} response is sent to {@code sender}.
 *
 * <p>NOTE(review): if the stream terminates with an error, the error is only logged. No response
 * is sent to {@code sender} and the actor does not switch behavior. Confirm this is intended.
 *
 * @param initReq initialization request carrying the active job list, the completed job list,
 *                the last job number and the cluster definition
 * @param sender  actor that receives the initialization response
 */
private void initRunningJobs(JobClusterProto.InitializeJobClusterRequest initReq, ActorRef sender) {
    List<CompletedJob> completedJobsList = initReq.completedJobsList;
    List<IMantisJobMetadata> jobList = initReq.jobList;
    logger.info("In _initJobs for cluster {}: {} activeJobs and {} completedJobs", name, jobList.size(), completedJobsList.size());
    if (logger.isDebugEnabled()) {
        logger.debug("In _initJobs for cluster {} activeJobs -> {} and completedJobs -> {}", name, jobList, completedJobsList);
    }
    Observable.from(jobList).flatMap((jobMeta) -> {
        if (JobState.isTerminalState(jobMeta.getState())) {
            // Job already finished: move it out of the active set instead of bootstrapping it.
            jobManager.persistToCompletedJobAndArchiveJobTables(jobMeta);
            return Observable.empty();
        }
        if (jobMeta.getSchedulingInfo() == null) {
            logger.error("Scheduling info is null for active job {} in cluster {}. Skipping bootstrap", jobMeta.getJobId(), name);
            return Observable.empty();
        }
        return Observable.just(jobMeta);
    }).flatMap((jobMeta) -> jobManager.bootstrapJob((MantisJobMetadataImpl) jobMeta, this.jobClusterMetadata)).subscribe((jobInited) -> {
        logger.info("Job Id {} initialized with code {}", jobInited.jobId, jobInited.responseCode);
    }, (error) -> {
        // Pass the throwable as the trailing argument so the stack trace is logged as well.
        logger.warn("Exception initializing jobs {}", error.getMessage(), error);
    }, () -> {
        if (!jobList.isEmpty()) {
            JobId lastJobId = new JobId(this.name, initReq.lastJobNumber);
            this.jobIdSubmissionSubject.onNext(lastJobId);
        }
        setBookkeepingTimer(BOOKKEEPING_INTERVAL_SECS);
        getContext().become(initializedBehavior);
        logger.info("Job Cluster {} initialized", this.name);
        sender.tell(new JobClusterProto.InitializeJobClusterResponse(initReq.requestId, SUCCESS, String.format("JobCluster %s initialized successfully", initReq.jobClusterDefinition.getName()), initReq.jobClusterDefinition.getName(), initReq.requestor), getSelf());
    });
}
Also used : JobId(io.mantisrx.server.master.domain.JobId) Terminated(akka.actor.Terminated) Optional.of(java.util.Optional.of) MantisJobStore(io.mantisrx.server.master.persistence.MantisJobStore) Action1(rx.functions.Action1) TriggerOperator(com.netflix.fenzo.triggers.TriggerOperator) GetLastSubmittedJobIdStreamResponse(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.GetLastSubmittedJobIdStreamResponse) JobActor(io.mantisrx.master.jobcluster.job.JobActor) ListArchivedWorkersRequest(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.ListArchivedWorkersRequest) ActorRef(akka.actor.ActorRef) UpdateJobClusterLabelsResponse(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.UpdateJobClusterLabelsResponse) UpdateJobClusterArtifactResponse(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.UpdateJobClusterArtifactResponse) Duration(java.time.Duration) Map(java.util.Map) CronPolicy(io.mantisrx.server.master.domain.IJobClusterDefinition.CronPolicy) Schedulers(rx.schedulers.Schedulers) Metrics(io.mantisrx.common.metrics.Metrics) Preconditions(com.netflix.spectator.impl.Preconditions) DisableJobClusterResponse(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.DisableJobClusterResponse) ListCompletedJobsInClusterResponse(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.ListCompletedJobsInClusterResponse) JobDefinition(io.mantisrx.server.master.domain.JobDefinition) Set(java.util.Set) ListCompletedJobsInClusterRequest(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.ListCompletedJobsInClusterRequest) ScaleStageRequest(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.ScaleStageRequest) JobStartedEvent(io.mantisrx.master.jobcluster.proto.JobClusterProto.JobStartedEvent) SubmitJobRequest(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.SubmitJobRequest) CompletionStage(java.util.concurrent.CompletionStage) 
UpdateJobClusterWorkerMigrationStrategyRequest(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.UpdateJobClusterWorkerMigrationStrategyRequest) TriggerNotFoundException(com.netflix.fenzo.triggers.exceptions.TriggerNotFoundException) JobHelper(io.mantisrx.master.jobcluster.job.JobHelper) ListJobIdsRequest(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.ListJobIdsRequest) BehaviorSubject(rx.subjects.BehaviorSubject) ListWorkersRequest(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.ListWorkersRequest) GetJobDefinitionUpdatedFromJobActorRequest(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.GetJobDefinitionUpdatedFromJobActorRequest) Optional.empty(java.util.Optional.empty) SERVER_ERROR(io.mantisrx.master.jobcluster.proto.BaseResponse.ResponseCode.SERVER_ERROR) CronTrigger(com.netflix.fenzo.triggers.CronTrigger) MetricsRegistry(io.mantisrx.common.metrics.MetricsRegistry) JobConstraints(io.mantisrx.runtime.JobConstraints) LabelUtils(com.mantisrx.common.utils.LabelUtils) TreeSet(java.util.TreeSet) ArrayList(java.util.ArrayList) ListJobIdsResponse(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.ListJobIdsResponse) SupervisorStrategy(akka.actor.SupervisorStrategy) JobClusterDefinitionImpl(io.mantisrx.server.master.domain.JobClusterDefinitionImpl) WorkerEvent(io.mantisrx.server.master.scheduler.WorkerEvent) GetLastSubmittedJobIdStreamRequest(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.GetLastSubmittedJobIdStreamRequest) Label(io.mantisrx.common.Label) EnableJobClusterRequest(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.EnableJobClusterRequest) GetJobClusterRequest(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.GetJobClusterRequest) UpdateJobClusterRequest(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.UpdateJobClusterRequest) MANTIS_MASTER_USER(io.mantisrx.master.StringConstants.MANTIS_MASTER_USER) 
CLIENT_ERROR_NOT_FOUND(io.mantisrx.master.jobcluster.proto.BaseResponse.ResponseCode.CLIENT_ERROR_NOT_FOUND) UpdateJobClusterWorkerMigrationStrategyResponse(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.UpdateJobClusterWorkerMigrationStrategyResponse) CompletedJob(io.mantisrx.server.master.domain.JobClusterDefinitionImpl.CompletedJob) IMantisJobMetadata(io.mantisrx.master.jobcluster.job.IMantisJobMetadata) KillJobResponse(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.KillJobResponse) SchedulerException(com.netflix.fenzo.triggers.exceptions.SchedulerException) IMantisWorkerMetadata(io.mantisrx.master.jobcluster.job.worker.IMantisWorkerMetadata) ListJobsResponse(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.ListJobsResponse) EnableJobClusterResponse(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.EnableJobClusterResponse) IJobClusterDefinition(io.mantisrx.server.master.domain.IJobClusterDefinition) JobState(io.mantisrx.master.jobcluster.job.JobState) GetJobSchedInfoRequest(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.GetJobSchedInfoRequest) LoggerFactory(org.slf4j.LoggerFactory) GaugeCallback(io.mantisrx.common.metrics.spectator.GaugeCallback) StageSchedulingInfo(io.mantisrx.runtime.descriptor.StageSchedulingInfo) JobCompletedReason(io.mantisrx.server.core.JobCompletedReason) InvalidJobRequest(io.mantisrx.server.master.InvalidJobRequest) MantisScheduler(io.mantisrx.server.master.scheduler.MantisScheduler) ListJobsRequest(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.ListJobsRequest) CLIENT_ERROR(io.mantisrx.master.jobcluster.proto.BaseResponse.ResponseCode.CLIENT_ERROR) GetJobClusterResponse(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.GetJobClusterResponse) JobClusterProto(io.mantisrx.master.jobcluster.proto.JobClusterProto) GetLatestJobDiscoveryInfoResponse(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.GetLatestJobDiscoveryInfoResponse) 
GetJobDetailsResponse(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.GetJobDetailsResponse) BasicTag(com.netflix.spectator.api.BasicTag) JobProto(io.mantisrx.master.jobcluster.proto.JobProto) ResubmitWorkerResponse(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.ResubmitWorkerResponse) ConstraintsEvaluators(io.mantisrx.server.master.ConstraintsEvaluators) DisableJobClusterRequest(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.DisableJobClusterRequest) ResubmitWorkerRequest(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.ResubmitWorkerRequest) GetJobDefinitionUpdatedFromJobActorResponse(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.GetJobDefinitionUpdatedFromJobActorResponse) UpdateJobClusterLabelsRequest(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.UpdateJobClusterLabelsRequest) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) SLA(io.mantisrx.server.master.domain.SLA) Instant(java.time.Instant) Collectors(java.util.stream.Collectors) GetJobSchedInfoResponse(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.GetJobSchedInfoResponse) Objects(java.util.Objects) List(java.util.List) PatternsCS.ask(akka.pattern.PatternsCS.ask) GetJobDetailsRequest(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.GetJobDetailsRequest) SubmitJobResponse(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.SubmitJobResponse) ConfigurationProvider(io.mantisrx.server.master.config.ConfigurationProvider) Optional(java.util.Optional) Props(akka.actor.Props) UpdateJobClusterSLARequest(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.UpdateJobClusterSLARequest) MantisJobMetadataView(io.mantisrx.master.jobcluster.job.MantisJobMetadataView) ScaleStageResponse(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.ScaleStageResponse) UpdateJobClusterResponse(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.UpdateJobClusterResponse) 
JobIdInfo(io.mantisrx.master.api.akka.route.proto.JobClusterProtoAdapter.JobIdInfo) KillJobRequest(io.mantisrx.master.jobcluster.proto.JobClusterProto.KillJobRequest) ListArchivedWorkersResponse(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.ListArchivedWorkersResponse) MantisJobMetadataImpl(io.mantisrx.master.jobcluster.job.MantisJobMetadataImpl) JobSla(io.mantisrx.runtime.JobSla) HashMap(java.util.HashMap) UpdateJobClusterSLAResponse(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.UpdateJobClusterSLAResponse) MantisActorSupervisorStrategy(io.mantisrx.master.akka.MantisActorSupervisorStrategy) ConcurrentMap(java.util.concurrent.ConcurrentMap) Observable(rx.Observable) HashSet(java.util.HashSet) LifecycleEventsProto(io.mantisrx.master.events.LifecycleEventsProto) DeleteJobClusterResponse(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.DeleteJobClusterResponse) ListWorkersResponse(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.ListWorkersResponse) AbstractActorWithTimers(akka.actor.AbstractActorWithTimers) JobClusterAlreadyExistsException(io.mantisrx.server.master.persistence.exceptions.JobClusterAlreadyExistsException) Counter(io.mantisrx.common.metrics.Counter) Logger(org.slf4j.Logger) Iterator(java.util.Iterator) Optional.ofNullable(java.util.Optional.ofNullable) ListJobCriteria(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.ListJobCriteria) GetLatestJobDiscoveryInfoRequest(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.GetLatestJobDiscoveryInfoRequest) JobClusterConfig(io.mantisrx.server.master.domain.JobClusterConfig) SUCCESS(io.mantisrx.master.jobcluster.proto.BaseResponse.ResponseCode.SUCCESS) JobClusterManagerProto(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto) Lists(io.mantisrx.shaded.com.google.common.collect.Lists) UpdateJobClusterArtifactRequest(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.UpdateJobClusterArtifactRequest) Collections(java.util.Collections) 
LifecycleEventPublisher(io.mantisrx.master.events.LifecycleEventPublisher) JobClustersManagerInitializeResponse(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.JobClustersManagerInitializeResponse) MetricGroupId(io.mantisrx.common.metrics.spectator.MetricGroupId) JobClusterProto(io.mantisrx.master.jobcluster.proto.JobClusterProto) CompletedJob(io.mantisrx.server.master.domain.JobClusterDefinitionImpl.CompletedJob) IMantisJobMetadata(io.mantisrx.master.jobcluster.job.IMantisJobMetadata) JobId(io.mantisrx.server.master.domain.JobId)

Example 3 with SLA

use of io.mantisrx.server.master.domain.SLA in project mantis by Netflix.

the class SLAEnforcerTest method slaMinTest.

/**
 * Checks how many jobs {@code enforceSLAMin} asks to launch for an SLA with min 2 and max 10,
 * given different combinations of active and already-launched job counts.
 */
@Test
public void slaMinTest() {
    final SLAEnforcer enforcer = new SLAEnforcer(new SLA(2, 10, null, null));

    // Two active jobs already satisfy a minimum of two: nothing to launch.
    assertEquals(0, enforcer.enforceSLAMin(2, 0));

    // One active plus one launched also reaches the minimum: nothing to launch.
    assertEquals(0, enforcer.enforceSLAMin(1, 1));

    // One active and none launched leaves a gap of one: launch a single job.
    assertEquals(1, enforcer.enforceSLAMin(1, 0));
}
Also used : SLA(io.mantisrx.server.master.domain.SLA) Test(org.junit.Test)

Example 4 with SLA

use of io.mantisrx.server.master.domain.SLA in project mantis by Netflix.

the class SLAEnforcerTest method testSorting.

/**
 * Verifies that {@code SLAEnforcer.sortJobsByIdDesc} returns the jobs ordered by job id,
 * independent of the order in which they were supplied and of their state.
 *
 * <p>The order asserted here is "cname-1" through "cname-4".
 */
@Test
public void testSorting() {
    Instant now = Instant.now();
    // Deliberately shuffled input: job numbers 3, 1, 4, 2.
    List<JobInfo> jobList = Lists.newArrayList(new JobInfo(new JobId("cname", 3), null, now.getMillis(), null, JobState.Accepted, null), new JobInfo(new JobId("cname", 1), null, now.getMillis(), null, JobState.Accepted, null), new JobInfo(new JobId("cname", 4), null, now.getMillis(), null, JobState.Launched, null), new JobInfo(new JobId("cname", 2), null, now.getMillis(), null, JobState.Launched, null));
    int min = 1;
    int max = 1;
    SLA sla = new SLA(min, max, null, null);
    SLAEnforcer slaEnforcer = new SLAEnforcer(sla);
    Set<JobInfo> sortJobsByIdDesc = slaEnforcer.sortJobsByIdDesc(jobList);
    String[] expectedOrder = { "cname-1", "cname-2", "cname-3", "cname-4" };
    JobInfo[] jobIdArray = sortJobsByIdDesc.toArray(new JobInfo[0]);
    // Guard against a vacuous pass: the loop below is bounded by the actual result, so a
    // truncated or empty result would otherwise go unnoticed.
    assertEquals(expectedOrder.length, jobIdArray.length);
    for (int i = 0; i < jobIdArray.length; i++) {
        assertEquals(expectedOrder[i], jobIdArray[i].jobId.getId());
    }
}
Also used : JobInfo(io.mantisrx.master.jobcluster.JobClusterActor.JobInfo) Instant(org.joda.time.Instant) SLA(io.mantisrx.server.master.domain.SLA) JobId(io.mantisrx.server.master.domain.JobId) Test(org.junit.Test)

Example 5 with SLA

use of io.mantisrx.server.master.domain.SLA in project mantis by Netflix.

the class SLAEnforcerTest method slaMaxDefaultsTest.

/**
 * Verifies that {@code enforceSLAMax} selects no jobs for deletion when there is nothing to
 * enforce: first with an enforcer built without an SLA, then with an SLA whose max is 0.
 *
 * <p>Any exception thrown by the enforcer propagates to the test runner, so a failure is
 * reported together with its stack trace.
 */
@Test
public void slaMaxDefaultsTest() {
    Instant now = Instant.now();
    int min = 0;
    int max = 0;
    SLA sla = new SLA(min, max, null, null);

    // sla not set nothing to enforce
    SLAEnforcer slaEnf = new SLAEnforcer(null);
    List<JobId> jobsToDelete = slaEnf.enforceSLAMax(newMixedStateJobs(now));
    assertTrue(jobsToDelete.isEmpty());

    // sla max is 0 nothing to enforce
    slaEnf = new SLAEnforcer(sla);
    jobsToDelete = slaEnf.enforceSLAMax(newMixedStateJobs(now));
    assertTrue(jobsToDelete.isEmpty());
}

/** Builds a fresh list of four "cname" jobs (numbers 1-4) alternating between Accepted and Launched. */
private static List<JobInfo> newMixedStateJobs(Instant now) {
    return Lists.newArrayList(new JobInfo(new JobId("cname", 1), null, now.getMillis(), null, JobState.Accepted, null), new JobInfo(new JobId("cname", 2), null, now.getMillis(), null, JobState.Launched, null), new JobInfo(new JobId("cname", 3), null, now.getMillis(), null, JobState.Accepted, null), new JobInfo(new JobId("cname", 4), null, now.getMillis(), null, JobState.Launched, null));
}
Also used : JobInfo(io.mantisrx.master.jobcluster.JobClusterActor.JobInfo) Instant(org.joda.time.Instant) SLA(io.mantisrx.server.master.domain.SLA) JobId(io.mantisrx.server.master.domain.JobId) Test(org.junit.Test)

Aggregations

SLA (io.mantisrx.server.master.domain.SLA)13 Test (org.junit.Test)10 JobId (io.mantisrx.server.master.domain.JobId)7 JobInfo (io.mantisrx.master.jobcluster.JobClusterActor.JobInfo)6 Instant (org.joda.time.Instant)6 SchedulerException (com.netflix.fenzo.triggers.exceptions.SchedulerException)3 TriggerNotFoundException (com.netflix.fenzo.triggers.exceptions.TriggerNotFoundException)3 JobClusterAlreadyExistsException (io.mantisrx.server.master.persistence.exceptions.JobClusterAlreadyExistsException)3 ActorRef (akka.actor.ActorRef)2 LifecycleEventsProto (io.mantisrx.master.events.LifecycleEventsProto)2 IMantisJobMetadata (io.mantisrx.master.jobcluster.job.IMantisJobMetadata)2 UpdateJobClusterSLAResponse (io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.UpdateJobClusterSLAResponse)2 JobClusterDefinitionImpl (io.mantisrx.server.master.domain.JobClusterDefinitionImpl)2 AbstractActorWithTimers (akka.actor.AbstractActorWithTimers)1 Props (akka.actor.Props)1 SupervisorStrategy (akka.actor.SupervisorStrategy)1 Terminated (akka.actor.Terminated)1 PatternsCS.ask (akka.pattern.PatternsCS.ask)1 LabelUtils (com.mantisrx.common.utils.LabelUtils)1 CronTrigger (com.netflix.fenzo.triggers.CronTrigger)1