Search in sources :

Example 16 with ResourceJobManager

Use of org.apache.airavata.model.appcatalog.computeresource.ResourceJobManager in the Apache Airavata project.

The method execute of the class DefaultJobSubmissionTask.

/**
 * Submits the batch job for the given task to the remote cluster and records the outcome.
 *
 * <p>The method creates the job file, submits it through the process's job-submission
 * {@link RemoteCluster}, and stores exit code, stdout and stderr on the {@link JobModel}.
 * Three outcomes are then handled:
 * <ul>
 *   <li>non-zero exit code or a reported submission failure: the job and error models are
 *       saved, the task is marked {@code FAILED}, the status is published and the method
 *       returns immediately;</li>
 *   <li>a job id was returned: the job is saved as {@code SUBMITTED}, promoted to
 *       {@code QUEUED} when {@code verifyJobSubmissionByJobId} succeeds, and gateway usage
 *       reporting is executed when enabled on the compute resource;</li>
 *   <li>no job id was returned: {@code verifyJobSubmission} is tried up to three times with a
 *       growing back-off (10s, 20s) between attempts to recover the id; if none is found the
 *       task is marked {@code FAILED}.</li>
 * </ul>
 * Exceptions are not propagated: each one is logged, turned into a {@code FAILED} status and
 * attached to the task model as an {@link ErrorModel}.
 *
 * @param taskContext context of the task being executed; its parent process context supplies
 *                    the job model, remote cluster and working directory
 * @return the resulting task status, which has also been set on {@code taskContext} and
 *         passed to {@code GFacUtils.saveAndPublishTaskStatus}
 */
@Override
public TaskStatus execute(TaskContext taskContext) {
    // Optimistic default: set to completed; every failure path below overrides the state.
    TaskStatus taskStatus = new TaskStatus(TaskState.COMPLETED);
    // Remembers an interrupt received during the retry back-off. The flag is restored only
    // after the final status has been saved, so the save itself runs undisturbed.
    boolean interrupted = false;
    try {
        ProcessContext processContext = taskContext.getParentProcessContext();
        JobModel jobModel = processContext.getJobModel();
        jobModel.setTaskId(taskContext.getTaskId());
        RemoteCluster remoteCluster = processContext.getJobSubmissionRemoteCluster();
        GroovyMap groovyMap = GFacUtils.createGroovyMap(processContext, taskContext);
        groovyMap.getStringValue(Script.JOB_NAME).ifPresent(jobModel::setJobName);
        ResourceJobManager resourceJobManager = GFacUtils.getResourceJobManager(processContext);
        JobManagerConfiguration jConfig = null;
        if (resourceJobManager != null) {
            jConfig = Factory.getJobManagerConfiguration(resourceJobManager);
        }
        JobStatus jobStatus = new JobStatus();
        File jobFile = GFacUtils.createJobFile(groovyMap, taskContext, jConfig);
        if (jobFile != null && jobFile.exists()) {
            jobModel.setJobDescription(FileUtils.readFileToString(jobFile));
            JobSubmissionOutput jobSubmissionOutput = remoteCluster.submitBatchJob(jobFile.getPath(), processContext.getWorkingDir());
            int exitCode = jobSubmissionOutput.getExitCode();
            jobModel.setExitCode(exitCode);
            jobModel.setStdErr(jobSubmissionOutput.getStdErr());
            jobModel.setStdOut(jobSubmissionOutput.getStdOut());
            String jobId = jobSubmissionOutput.getJobId();
            String experimentId = taskContext.getExperimentId();
            if (exitCode != 0 || jobSubmissionOutput.isJobSubmissionFailed()) {
                // Submission failed outright: persist the failure and return early.
                jobModel.setJobId(DEFAULT_JOB_ID);
                if (jobSubmissionOutput.isJobSubmissionFailed()) {
                    List<JobStatus> statusList = new ArrayList<>();
                    statusList.add(new JobStatus(JobState.FAILED));
                    statusList.get(0).setReason(jobSubmissionOutput.getFailureReason());
                    jobModel.setJobStatuses(statusList);
                    GFacUtils.saveJobModel(processContext, jobModel);
                    log.error("expId: {}, processid: {}, taskId: {} :- Job submission failed for job name {}", experimentId, taskContext.getProcessId(), taskContext.getTaskId(), jobModel.getJobName());
                    ErrorModel errorModel = new ErrorModel();
                    errorModel.setUserFriendlyMessage(jobSubmissionOutput.getFailureReason());
                    errorModel.setActualErrorMessage(jobSubmissionOutput.getFailureReason());
                    GFacUtils.saveExperimentError(processContext, errorModel);
                    GFacUtils.saveProcessError(processContext, errorModel);
                    GFacUtils.saveTaskError(taskContext, errorModel);
                    taskStatus.setState(TaskState.FAILED);
                    taskStatus.setReason("Job submission command didn't return a jobId");
                    taskStatus.setTimeOfStateChange(AiravataUtils.getCurrentTimestamp().getTime());
                    taskContext.setTaskStatus(taskStatus);
                } else {
                    String msg;
                    GFacUtils.saveJobModel(processContext, jobModel);
                    ErrorModel errorModel = new ErrorModel();
                    // Integer.MIN_VALUE is treated here as "no valid exit code was obtained".
                    if (exitCode != Integer.MIN_VALUE) {
                        msg = "expId:" + processContext.getProcessModel().getExperimentId() + ", processId:" + processContext.getProcessId() + ", taskId: " + taskContext.getTaskId() + " return non zero exit code:" + exitCode + "  for JobName:" + jobModel.getJobName() + ", with failure reason : " + jobSubmissionOutput.getFailureReason() + " Hence changing job state to Failed.";
                        errorModel.setActualErrorMessage(jobSubmissionOutput.getFailureReason());
                    } else {
                        msg = "expId:" + processContext.getProcessModel().getExperimentId() + ", processId:" + processContext.getProcessId() + ", taskId: " + taskContext.getTaskId() + " doesn't  return valid job submission exit code for JobName:" + jobModel.getJobName() + ", with failure reason : stdout ->" + jobSubmissionOutput.getStdOut() + " stderr -> " + jobSubmissionOutput.getStdErr() + " Hence changing job state to Failed.";
                        errorModel.setActualErrorMessage(msg);
                    }
                    log.error(msg);
                    errorModel.setUserFriendlyMessage(msg);
                    GFacUtils.saveExperimentError(processContext, errorModel);
                    GFacUtils.saveProcessError(processContext, errorModel);
                    GFacUtils.saveTaskError(taskContext, errorModel);
                    taskStatus.setState(TaskState.FAILED);
                    taskStatus.setReason(msg);
                    taskStatus.setTimeOfStateChange(AiravataUtils.getCurrentTimestamp().getTime());
                    taskContext.setTaskStatus(taskStatus);
                }
                try {
                    GFacUtils.saveAndPublishTaskStatus(taskContext);
                } catch (GFacException e) {
                    log.error("Error while saving task status", e);
                }
                return taskStatus;
            } else if (jobId != null && !jobId.isEmpty()) {
                // Submission returned a job id: record SUBMITTED, then QUEUED once verified.
                jobModel.setJobId(jobId);
                GFacUtils.saveJobModel(processContext, jobModel);
                jobStatus.setJobState(JobState.SUBMITTED);
                ComputeResourceDescription computeResourceDescription = taskContext.getParentProcessContext().getComputeResourceDescription();
                jobStatus.setReason("Successfully Submitted to " + computeResourceDescription.getHostName());
                jobStatus.setTimeOfStateChange(AiravataUtils.getCurrentTimestamp().getTime());
                jobModel.setJobStatuses(Arrays.asList(jobStatus));
                GFacUtils.saveJobStatus(taskContext.getParentProcessContext(), jobModel);
                if (verifyJobSubmissionByJobId(remoteCluster, jobId)) {
                    jobStatus.setJobState(JobState.QUEUED);
                    jobStatus.setReason("Verification step succeeded");
                    jobStatus.setTimeOfStateChange(AiravataUtils.getCurrentTimestamp().getTime());
                    jobModel.setJobStatuses(Arrays.asList(jobStatus));
                    GFacUtils.saveJobStatus(taskContext.getParentProcessContext(), jobModel);
                }
                // doing gateway reporting
                if (computeResourceDescription.isGatewayUsageReporting()) {
                    String loadCommand = computeResourceDescription.getGatewayUsageModuleLoadCommand();
                    String usageExecutable = computeResourceDescription.getGatewayUsageExecutable();
                    ExperimentModel experiment = (ExperimentModel) taskContext.getParentProcessContext().getExperimentCatalog().get(ExperimentCatalogModelType.EXPERIMENT, experimentId);
                    String username = experiment.getUserName() + "@" + taskContext.getParentProcessContext().getUsageReportingGatewayId();
                    // NOTE(review): username and jobId are concatenated unescaped into a command
                    // line; confirm these values cannot contain shell metacharacters.
                    RawCommandInfo rawCommandInfo = new RawCommandInfo(loadCommand + " && " + usageExecutable + " -gateway_user " + username + " -submit_time \"`date '+%F %T %:z'`\"  -jobid " + jobId);
                    remoteCluster.execute(rawCommandInfo);
                }
                taskStatus = new TaskStatus(TaskState.COMPLETED);
                taskStatus.setReason("Submitted job to compute resource");
                taskStatus.setTimeOfStateChange(AiravataUtils.getCurrentTimestamp().getTime());
            } else {
                // Zero exit code but no job id: try to recover the id through verification.
                final int maxVerificationTries = 3;
                for (int attempt = 1; attempt <= maxVerificationTries; attempt++) {
                    String verifyJobId = verifyJobSubmission(remoteCluster, jobModel);
                    if (verifyJobId != null && !verifyJobId.isEmpty()) {
                        // JobStatus either changed from SUBMITTED to QUEUED or directly to QUEUED
                        jobId = verifyJobId;
                        jobModel.setJobId(jobId);
                        GFacUtils.saveJobModel(processContext, jobModel);
                        jobStatus.setJobState(JobState.QUEUED);
                        jobStatus.setReason("Verification step succeeded");
                        jobStatus.setTimeOfStateChange(AiravataUtils.getCurrentTimestamp().getTime());
                        jobModel.setJobStatuses(Arrays.asList(jobStatus));
                        GFacUtils.saveJobStatus(taskContext.getParentProcessContext(), jobModel);
                        taskStatus.setState(TaskState.COMPLETED);
                        taskStatus.setReason("Submitted job to compute resource");
                        taskStatus.setTimeOfStateChange(AiravataUtils.getCurrentTimestamp().getTime());
                        break;
                    }
                    // Back off only when another attempt follows; waiting after the last
                    // attempt would just delay the failure report.
                    if (attempt < maxVerificationTries) {
                        log.info("Verify step return invalid jobId, retry verification step in {} secs", attempt * 10);
                        Thread.sleep(attempt * 10000L);
                    }
                }
            }
            if (jobId == null || jobId.isEmpty()) {
                jobModel.setJobId(DEFAULT_JOB_ID);
                GFacUtils.saveJobModel(processContext, jobModel);
                String msg = "expId:" + processContext.getProcessModel().getExperimentId() + " Couldn't find " + "remote jobId for JobName:" + jobModel.getJobName() + ", both submit and verify steps " + "doesn't return a valid JobId. " + "Hence changing experiment state to Failed";
                log.error(msg);
                ErrorModel errorModel = new ErrorModel();
                errorModel.setUserFriendlyMessage(msg);
                errorModel.setActualErrorMessage(msg);
                GFacUtils.saveExperimentError(processContext, errorModel);
                GFacUtils.saveProcessError(processContext, errorModel);
                GFacUtils.saveTaskError(taskContext, errorModel);
                taskStatus.setState(TaskState.FAILED);
                taskStatus.setReason("Couldn't find job id in both submitted and verified steps");
                taskStatus.setTimeOfStateChange(AiravataUtils.getCurrentTimestamp().getTime());
            } else {
                GFacUtils.saveJobModel(processContext, jobModel);
            }
        } else {
            // No usable job file: fail the task, stamped and logged like the other failure paths.
            String reason = (jobFile == null) ? "JobFile is null" : "Job file doesn't exist";
            log.error("Job submission aborted for taskId {}: {}", taskContext.getTaskId(), reason);
            taskStatus.setState(TaskState.FAILED);
            taskStatus.setReason(reason);
            taskStatus.setTimeOfStateChange(AiravataUtils.getCurrentTimestamp().getTime());
        }
    } catch (AppCatalogException e) {
        recordJobSubmissionException(taskContext, taskStatus, "Error while instantiating app catalog", e);
    } catch (ApplicationSettingsException e) {
        recordJobSubmissionException(taskContext, taskStatus, "Error occurred while creating job descriptor", e);
    } catch (GFacException e) {
        recordJobSubmissionException(taskContext, taskStatus, "Error occurred while submitting the job", e);
    } catch (IOException e) {
        recordJobSubmissionException(taskContext, taskStatus, "Error while reading the content of the job file", e);
    } catch (InterruptedException e) {
        // Do not lose the interruption; it is re-asserted once the status has been saved.
        interrupted = true;
        recordJobSubmissionException(taskContext, taskStatus, "Error occurred while verifying the job submission", e);
    } catch (Throwable e) {
        recordJobSubmissionException(taskContext, taskStatus, "JobSubmission failed", e);
    }
    taskContext.setTaskStatus(taskStatus);
    try {
        GFacUtils.saveAndPublishTaskStatus(taskContext);
    } catch (GFacException e) {
        log.error("Error while saving task status", e);
    } finally {
        if (interrupted) {
            Thread.currentThread().interrupt();
        }
    }
    return taskStatus;
}

/**
 * Records an exception raised during job submission: logs it, marks {@code taskStatus} as
 * {@code FAILED} with {@code msg} as the reason and the current timestamp, and attaches an
 * {@link ErrorModel} carrying the exception message to the task model.
 *
 * @param taskContext context whose task model receives the error
 * @param taskStatus  status object to mark as failed
 * @param msg         user-friendly description of what failed
 * @param cause       the exception that was caught
 */
private void recordJobSubmissionException(TaskContext taskContext, TaskStatus taskStatus, String msg, Throwable cause) {
    log.error(msg, cause);
    taskStatus.setState(TaskState.FAILED);
    taskStatus.setReason(msg);
    taskStatus.setTimeOfStateChange(AiravataUtils.getCurrentTimestamp().getTime());
    ErrorModel errorModel = new ErrorModel();
    errorModel.setActualErrorMessage(cause.getMessage());
    errorModel.setUserFriendlyMessage(msg);
    taskContext.getTaskModel().setTaskErrors(Arrays.asList(errorModel));
}
Also used : ApplicationSettingsException(org.apache.airavata.common.exception.ApplicationSettingsException) ComputeResourceDescription(org.apache.airavata.model.appcatalog.computeresource.ComputeResourceDescription) RemoteCluster(org.apache.airavata.gfac.core.cluster.RemoteCluster) ArrayList(java.util.ArrayList) ExperimentModel(org.apache.airavata.model.experiment.ExperimentModel) IOException(java.io.IOException) ProcessContext(org.apache.airavata.gfac.core.context.ProcessContext) JobSubmissionOutput(org.apache.airavata.gfac.core.cluster.JobSubmissionOutput) AppCatalogException(org.apache.airavata.registry.cpi.AppCatalogException) ResourceJobManager(org.apache.airavata.model.appcatalog.computeresource.ResourceJobManager) ErrorModel(org.apache.airavata.model.commons.ErrorModel) JobModel(org.apache.airavata.model.job.JobModel) File(java.io.File) RawCommandInfo(org.apache.airavata.gfac.core.cluster.RawCommandInfo)

Aggregations

ResourceJobManager (org.apache.airavata.model.appcatalog.computeresource.ResourceJobManager)16 ComputeResourceDescription (org.apache.airavata.model.appcatalog.computeresource.ComputeResourceDescription)12 ApplicationDeploymentDescription (org.apache.airavata.model.appcatalog.appdeployment.ApplicationDeploymentDescription)11 ApplicationModule (org.apache.airavata.model.appcatalog.appdeployment.ApplicationModule)11 ApplicationInterfaceDescription (org.apache.airavata.model.appcatalog.appinterface.ApplicationInterfaceDescription)11 SSHJobSubmission (org.apache.airavata.model.appcatalog.computeresource.SSHJobSubmission)11 File (java.io.File)3 IOException (java.io.IOException)3 ApplicationSettingsException (org.apache.airavata.common.exception.ApplicationSettingsException)3 JobSubmissionOutput (org.apache.airavata.gfac.core.cluster.JobSubmissionOutput)3 RemoteCluster (org.apache.airavata.gfac.core.cluster.RemoteCluster)3 ProcessContext (org.apache.airavata.gfac.core.context.ProcessContext)3 ErrorModel (org.apache.airavata.model.commons.ErrorModel)3 JobModel (org.apache.airavata.model.job.JobModel)3 AppCatalogException (org.apache.airavata.registry.cpi.AppCatalogException)3 JobManagerCommand (org.apache.airavata.model.appcatalog.computeresource.JobManagerCommand)2 LOCALSubmission (org.apache.airavata.model.appcatalog.computeresource.LOCALSubmission)2 JobStatus (org.apache.airavata.model.status.JobStatus)2 TaskStatus (org.apache.airavata.model.status.TaskStatus)2 ArrayList (java.util.ArrayList)1