Search in sources :

Example 1 with KillJobRequest

use of io.mantisrx.master.jobcluster.proto.JobClusterProto.KillJobRequest in project mantis by Netflix.

The method onEnforceSLARequest of the class JobClusterActor.

/**
 * Handles an SLA enforcement request for this job cluster.
 *
 * <p>Increments the enforcement counter and queries the job manager for jobs stuck in
 * the initializing, accepted and terminating states. If the cluster has no SLA the
 * method returns at that point. Otherwise it asks the SLA enforcer how many jobs must
 * be launched to satisfy the minimum; when that number is positive it sends that many
 * {@code SubmitJobRequest}s to this actor, and when it is not it asks the enforcer which
 * jobs exceed the maximum and sends a {@code KillJobRequest} to this actor for each.
 *
 * @param request the enforcement request; supplies the time of enforcement and an
 *                optional job definition that is forwarded with any submitted job
 */
@Override
public void onEnforceSLARequest(JobClusterProto.EnforceSLARequest request) {
    if (logger.isTraceEnabled()) {
        // Two arguments need two placeholders; without the second one the request is dropped from the log.
        logger.trace("Enter onEnforceSLA for JobCluster {} with request {}", this.name, request);
    }
    numSLAEnforcementExecutions.increment();
    long now = request.timeOfEnforcement.toEpochMilli();
    // NOTE(review): the three results below are not read anywhere in this method. The calls are
    // kept because the job manager lookups may have side effects (e.g. logging) — TODO confirm.
    List<JobInfo> pendingInitializationJobsPriorToCutoff = jobManager.getJobActorsStuckInInit(now, getExpirePendingInitializeDelayMs());
    List<JobInfo> jobsStuckInAcceptedList = jobManager.getJobsStuckInAccepted(now, getExpireAcceptedDelayMs());
    // NOTE(review): the terminating lookup reuses the "accepted" expiry delay — confirm this is intended.
    List<JobInfo> jobsStuckInTerminatingList = jobManager.getJobsStuckInTerminating(now, getExpireAcceptedDelayMs());
    if (!slaEnforcer.hasSLA()) {
        // Nothing to enforce; note that the exit trace below is not emitted on this path.
        return;
    }
    int activeJobsCount = jobManager.activeJobsCount();
    int acceptedJobsCount = jobManager.acceptedJobsCount();
    // enforcing min
    int noOfJobsToLaunch = slaEnforcer.enforceSLAMin(activeJobsCount, acceptedJobsCount);
    if (noOfJobsToLaunch > 0) {
        logger.info("Submitting {} jobs for job name {} as active count is {} and accepted count is {}", noOfJobsToLaunch, name, activeJobsCount, acceptedJobsCount);
        // Submit as the user of the supplied job definition when there is one, else as the master user.
        String user = MANTIS_MASTER_USER;
        if (request.jobDefinitionOp.isPresent()) {
            user = request.jobDefinitionOp.get().getUser();
        }
        for (int i = 0; i < noOfJobsToLaunch; i++) {
            getSelf().tell(new SubmitJobRequest(name, user, true, request.jobDefinitionOp), getSelf());
        }
    } else {
        // enforce max.
        List<JobInfo> listOfJobs = new ArrayList<>(activeJobsCount + acceptedJobsCount);
        listOfJobs.addAll(jobManager.getActiveJobsList());
        listOfJobs.addAll(jobManager.getAcceptedJobsList());
        // The enforcer receives a read-only view and returns the ids of the jobs to terminate.
        List<JobId> jobsToKill = slaEnforcer.enforceSLAMax(Collections.unmodifiableList(listOfJobs));
        for (JobId jobId : jobsToKill) {
            logger.info("Request termination for job {}", jobId);
            getSelf().tell(new KillJobRequest(jobId, "SLA enforcement", JobCompletedReason.Killed, MANTIS_MASTER_USER, ActorRef.noSender()), getSelf());
        }
    }
    if (logger.isTraceEnabled()) {
        logger.trace("Exit onEnforceSLA for JobCluster {}", name);
    }
}
Also used : KillJobRequest(io.mantisrx.master.jobcluster.proto.JobClusterProto.KillJobRequest) SubmitJobRequest(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.SubmitJobRequest) ArrayList(java.util.ArrayList) JobId(io.mantisrx.server.master.domain.JobId)

Example 2 with KillJobRequest

use of io.mantisrx.master.jobcluster.proto.JobClusterProto.KillJobRequest in project mantis by Netflix.

The method onJobClusterDisable of the class JobClusterActor.

/**
 * Disables this job cluster in response to a {@code DisableJobClusterRequest}.
 *
 * <p>On the success path, in order: builds a copy of the cluster metadata with the
 * disabled flag set, writes it to the job store, adopts it as the current metadata,
 * destroys the cron, switches the actor to its disabled behavior, sends a
 * {@code KillJobRequest} to the actor of every accepted and active job, cancels the
 * bookkeeping timer, publishes an audit event, and replies to the sender with a
 * success response. If any step throws, the error is logged and a server-error
 * response is sent to the sender instead.
 *
 * @param req the disable request; supplies the request id echoed in the response and
 *            the user recorded on each kill request
 */
@Override
public void onJobClusterDisable(final DisableJobClusterRequest req) {
    if (logger.isTraceEnabled()) {
        logger.trace("Enter onJobClusterDisable {}", req);
    }
    // Capture the sender up front so both the success and failure replies go to the same actor.
    final ActorRef replyTo = getSender();
    try {
        final IJobClusterMetadata disabledMetadata = new JobClusterMetadataImpl.Builder()
                .withIsDisabled(true)
                .withLastJobCount(this.jobClusterMetadata.getLastJobCount())
                .withJobClusterDefinition((JobClusterDefinitionImpl) this.jobClusterMetadata.getJobClusterDefinition())
                .build();

        // Write the disabled state to the store before adopting it locally.
        jobStore.updateJobCluster(disabledMetadata);
        this.jobClusterMetadata = disabledMetadata;
        cronManager.destroyCron();

        // From here on the actor handles messages with its disabled behavior.
        getContext().become(disabledBehavior);

        // Every non-terminal job (accepted first, then active) gets a kill request.
        final List<JobInfo> nonTerminalJobs = new ArrayList<>();
        nonTerminalJobs.addAll(jobManager.getAcceptedJobsList());
        nonTerminalJobs.addAll(jobManager.getActiveJobsList());
        for (JobInfo job : nonTerminalJobs) {
            final KillJobRequest killRequest = new KillJobRequest(
                    job.jobId,
                    "Job cluster disabled",
                    JobCompletedReason.Killed,
                    req.getUser(),
                    ActorRef.noSender());
            job.jobActor.tell(killRequest, getSelf());
        }

        // Stop the SLA check timer.
        getTimers().cancel(BOOKKEEPING_TIMER_KEY);

        final LifecycleEventsProto.AuditEvent auditEvent = new LifecycleEventsProto.AuditEvent(
                LifecycleEventsProto.AuditEvent.AuditEventType.JOB_CLUSTER_DISABLED,
                disabledMetadata.getJobClusterDefinition().getName(),
                name + " disabled");
        eventPublisher.publishAuditEvent(auditEvent);

        final DisableJobClusterResponse success =
                new DisableJobClusterResponse(req.requestId, SUCCESS, String.format("%s disabled", name));
        replyTo.tell(success, getSelf());
        numJobClusterDisable.increment();
        // NOTE(review): the message text below contains a typo; it is runtime output and is kept verbatim.
        logger.info("Job Cluster {} is disabbled", this.name);
    } catch (Exception e) {
        final String errorMsg = "Exception disabling cluster " + name + " due to " + e.getMessage();
        logger.error(errorMsg, e);
        final DisableJobClusterResponse failure =
                new DisableJobClusterResponse(req.requestId, SERVER_ERROR, errorMsg);
        replyTo.tell(failure, getSelf());
        numJobClusterDisableErrors.increment();
    }
    if (logger.isTraceEnabled()) {
        logger.trace("Exit onJobClusterDisable");
    }
}
Also used : ActorRef(akka.actor.ActorRef) JobClusterDefinitionImpl(io.mantisrx.server.master.domain.JobClusterDefinitionImpl) ArrayList(java.util.ArrayList) TriggerNotFoundException(com.netflix.fenzo.triggers.exceptions.TriggerNotFoundException) SchedulerException(com.netflix.fenzo.triggers.exceptions.SchedulerException) JobClusterAlreadyExistsException(io.mantisrx.server.master.persistence.exceptions.JobClusterAlreadyExistsException) DisableJobClusterResponse(io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.DisableJobClusterResponse) KillJobRequest(io.mantisrx.master.jobcluster.proto.JobClusterProto.KillJobRequest) LifecycleEventsProto(io.mantisrx.master.events.LifecycleEventsProto)

Aggregations

KillJobRequest (io.mantisrx.master.jobcluster.proto.JobClusterProto.KillJobRequest)2 ArrayList (java.util.ArrayList)2 ActorRef (akka.actor.ActorRef)1 SchedulerException (com.netflix.fenzo.triggers.exceptions.SchedulerException)1 TriggerNotFoundException (com.netflix.fenzo.triggers.exceptions.TriggerNotFoundException)1 LifecycleEventsProto (io.mantisrx.master.events.LifecycleEventsProto)1 DisableJobClusterResponse (io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.DisableJobClusterResponse)1 SubmitJobRequest (io.mantisrx.master.jobcluster.proto.JobClusterManagerProto.SubmitJobRequest)1 JobClusterDefinitionImpl (io.mantisrx.server.master.domain.JobClusterDefinitionImpl)1 JobId (io.mantisrx.server.master.domain.JobId)1 JobClusterAlreadyExistsException (io.mantisrx.server.master.persistence.exceptions.JobClusterAlreadyExistsException)1