use of org.apache.airavata.gfac.core.cluster.RemoteCluster in project airavata by apache.
the class Factory method getDataMovementRemoteCluster.
public static RemoteCluster getDataMovementRemoteCluster(ProcessContext processContext) throws GFacException, AiravataException, CredentialStoreException {
String storageResourceId = processContext.getStorageResourceId();
DataMovementProtocol dataMovementProtocol = processContext.getDataMovementProtocol();
String key = new StringBuilder(processContext.getComputeResourceLoginUserName()).append(':').append(dataMovementProtocol.name()).append(':').append(storageResourceId).append(":").append(processContext.getStorageResourceCredentialToken()).toString();
RemoteCluster remoteCluster = remoteClusterMap.get(key);
if (remoteCluster == null) {
JobManagerConfiguration jobManagerConfiguration = getJobManagerConfiguration(processContext.getResourceJobManager());
if (dataMovementProtocol == DataMovementProtocol.LOCAL) {
remoteCluster = new LocalRemoteCluster(processContext.getStorageResourceServerInfo(), jobManagerConfiguration, null);
} else if (dataMovementProtocol == DataMovementProtocol.SCP) {
remoteCluster = new HPCRemoteCluster(processContext.getStorageResourceServerInfo(), jobManagerConfiguration, Factory.getStorageSSHKeyAuthentication(processContext));
} else {
throw new GFacException("No remote cluster implementation map to job data movement protocol " + dataMovementProtocol.name());
}
remoteClusterMap.put(key, remoteCluster);
} else {
AuthenticationInfo authentication = remoteCluster.getAuthentication();
if (authentication instanceof SSHKeyAuthentication) {
SSHKeyAuthentication sshKeyAuthentication = (SSHKeyAuthentication) authentication;
if (!sshKeyAuthentication.getUserName().equals(processContext.getStorageResourceLoginUserName())) {
JobManagerConfiguration jobManagerConfiguration = getJobManagerConfiguration(processContext.getResourceJobManager());
dataMovementProtocol = processContext.getDataMovementProtocol();
if (dataMovementProtocol == DataMovementProtocol.SCP) {
remoteCluster = new HPCRemoteCluster(processContext.getStorageResourceServerInfo(), jobManagerConfiguration, Factory.getStorageSSHKeyAuthentication(processContext));
}
}
}
}
return remoteCluster;
}
use of org.apache.airavata.gfac.core.cluster.RemoteCluster in project airavata by apache.
the class Factory method getJobSubmissionRemoteCluster.
/**
* Factory class manage reomete cluster map, this will solve too many connections/ sessions issues with cluster
* communications.
* @param processContext
* @return
* @throws GFacException
* @throws AppCatalogException
* @throws AiravataException
*/
public static RemoteCluster getJobSubmissionRemoteCluster(ProcessContext processContext) throws GFacException, AppCatalogException, AiravataException, CredentialStoreException {
String computeResourceId = processContext.getComputeResourceId();
JobSubmissionProtocol jobSubmissionProtocol = processContext.getJobSubmissionProtocol();
String key = new StringBuilder(processContext.getComputeResourceLoginUserName()).append(':').append(jobSubmissionProtocol.name()).append(':').append(computeResourceId).append(':').append(processContext.getComputeResourceCredentialToken()).toString();
RemoteCluster remoteCluster = remoteClusterMap.get(key);
if (remoteCluster == null) {
JobManagerConfiguration jobManagerConfiguration = getJobManagerConfiguration(processContext.getResourceJobManager());
if (jobSubmissionProtocol == JobSubmissionProtocol.LOCAL || jobSubmissionProtocol == JobSubmissionProtocol.LOCAL_FORK) {
remoteCluster = new LocalRemoteCluster(processContext.getComputeResourceServerInfo(), jobManagerConfiguration, null);
} else if (jobSubmissionProtocol == JobSubmissionProtocol.SSH || jobSubmissionProtocol == JobSubmissionProtocol.SSH_FORK || jobSubmissionProtocol == JobSubmissionProtocol.CLOUD) {
remoteCluster = new HPCRemoteCluster(processContext.getComputeResourceServerInfo(), jobManagerConfiguration, Factory.getComputerResourceSSHKeyAuthentication(processContext));
} else {
throw new GFacException("No remote cluster implementation map to job submission protocol " + jobSubmissionProtocol.name());
}
remoteClusterMap.put(key, remoteCluster);
} else {
AuthenticationInfo authentication = remoteCluster.getAuthentication();
if (authentication instanceof SSHKeyAuthentication) {
SSHKeyAuthentication sshKeyAuthentication = (SSHKeyAuthentication) authentication;
if (!sshKeyAuthentication.getUserName().equals(processContext.getComputeResourceLoginUserName())) {
JobManagerConfiguration jobManagerConfiguration = getJobManagerConfiguration(processContext.getResourceJobManager());
if (jobSubmissionProtocol == JobSubmissionProtocol.SSH || jobSubmissionProtocol == JobSubmissionProtocol.SSH_FORK) {
remoteCluster = new HPCRemoteCluster(processContext.getComputeResourceServerInfo(), jobManagerConfiguration, Factory.getComputerResourceSSHKeyAuthentication(processContext));
}
}
}
}
return remoteCluster;
}
use of org.apache.airavata.gfac.core.cluster.RemoteCluster in project airavata by apache.
the class DefaultJobSubmissionTask method cancel.
@Override
public JobStatus cancel(TaskContext taskcontext) throws TaskException {
ProcessContext processContext = taskcontext.getParentProcessContext();
RemoteCluster remoteCluster = processContext.getJobSubmissionRemoteCluster();
JobModel jobModel = processContext.getJobModel();
int retryCount = 0;
if (jobModel != null) {
if (processContext.getProcessState() == ProcessState.EXECUTING) {
while (jobModel.getJobId() == null) {
log.info("Cancellation pause {} secs until process get jobId", pauseTimeInSec);
try {
Thread.sleep(waitForProcessIdmillis);
} catch (InterruptedException e) {
// ignore
}
}
}
try {
JobStatus oldJobStatus = remoteCluster.getJobStatus(jobModel.getJobId());
while (oldJobStatus == null && retryCount <= 5) {
retryCount++;
Thread.sleep(retryCount * 1000);
oldJobStatus = remoteCluster.getJobStatus(jobModel.getJobId());
}
if (oldJobStatus != null) {
oldJobStatus = remoteCluster.cancelJob(jobModel.getJobId());
return oldJobStatus;
} else {
throw new TaskException("Cancel operation failed, Job status couldn't find in resource, JobId " + jobModel.getJobId());
}
} catch (GFacException | InterruptedException e) {
throw new TaskException("Error while cancelling job " + jobModel.getJobId(), e);
}
} else {
throw new TaskException("Couldn't complete cancel operation, JobModel is null in ProcessContext.");
}
}
use of org.apache.airavata.gfac.core.cluster.RemoteCluster in project airavata by apache.
the class EnvironmentSetupTask method execute.
@Override
public TaskStatus execute(TaskContext taskContext) {
TaskStatus status = new TaskStatus(TaskState.COMPLETED);
try {
RemoteCluster remoteCluster = taskContext.getParentProcessContext().getJobSubmissionRemoteCluster();
remoteCluster.makeDirectory(taskContext.getParentProcessContext().getWorkingDir());
status.setReason("Successfully created environment");
} catch (GFacException e) {
String msg = "Error while environment setup";
log.error(msg, e);
status.setState(TaskState.FAILED);
status.setReason(msg);
ErrorModel errorModel = new ErrorModel();
errorModel.setActualErrorMessage(e.getMessage());
errorModel.setUserFriendlyMessage(msg);
taskContext.getTaskModel().setTaskErrors(Arrays.asList(errorModel));
}
return status;
}
use of org.apache.airavata.gfac.core.cluster.RemoteCluster in project airavata by apache.
the class ForkJobSubmissionTask method execute.
@Override
public TaskStatus execute(TaskContext taskContext) {
TaskStatus taskStatus = new TaskStatus(TaskState.CREATED);
try {
ProcessContext processContext = taskContext.getParentProcessContext();
JobModel jobModel = processContext.getJobModel();
jobModel.setTaskId(taskContext.getTaskId());
RemoteCluster remoteCluster = processContext.getJobSubmissionRemoteCluster();
GroovyMap groovyMap = GFacUtils.createGroovyMap(processContext, taskContext);
jobModel.setJobName(groovyMap.get(Script.JOB_NAME).toString());
ResourceJobManager resourceJobManager = GFacUtils.getResourceJobManager(processContext);
JobManagerConfiguration jConfig = null;
if (resourceJobManager != null) {
jConfig = Factory.getJobManagerConfiguration(resourceJobManager);
}
JobStatus jobStatus = new JobStatus();
File jobFile = GFacUtils.createJobFile(groovyMap, taskContext, jConfig);
if (jobFile != null && jobFile.exists()) {
jobModel.setJobDescription(FileUtils.readFileToString(jobFile));
JobSubmissionOutput jobSubmissionOutput = remoteCluster.submitBatchJob(jobFile.getPath(), processContext.getWorkingDir());
jobModel.setExitCode(jobSubmissionOutput.getExitCode());
jobModel.setStdErr(jobSubmissionOutput.getStdErr());
jobModel.setStdOut(jobSubmissionOutput.getStdOut());
String jobId = jobSubmissionOutput.getJobId();
if (jobId != null && !jobId.isEmpty()) {
jobModel.setJobId(jobId);
GFacUtils.saveJobModel(processContext, jobModel);
jobStatus.setJobState(JobState.SUBMITTED);
jobStatus.setReason("Successfully Submitted to " + taskContext.getParentProcessContext().getComputeResourceDescription().getHostName());
jobStatus.setTimeOfStateChange(AiravataUtils.getCurrentTimestamp().getTime());
jobModel.setJobStatuses(Arrays.asList(jobStatus));
GFacUtils.saveJobStatus(taskContext.getParentProcessContext(), jobModel);
taskStatus = new TaskStatus(TaskState.COMPLETED);
taskStatus.setReason("Submitted job to compute resource");
}
if (jobId == null || jobId.isEmpty()) {
String msg = "expId:" + processContext.getProcessModel().getExperimentId() + " Couldn't find " + "remote jobId for JobName:" + jobModel.getJobName() + ", both submit and verify steps " + "doesn't return a valid JobId. " + "Hence changing experiment state to Failed";
log.error(msg);
ErrorModel errorModel = new ErrorModel();
errorModel.setActualErrorMessage(msg);
errorModel.setCreationTime(AiravataUtils.getCurrentTimestamp().getTime());
GFacUtils.saveExperimentError(processContext, errorModel);
GFacUtils.saveProcessError(processContext, errorModel);
GFacUtils.saveTaskError(taskContext, errorModel);
taskStatus.setState(TaskState.FAILED);
taskStatus.setReason("Couldn't find job id in both submitted and verified steps");
} else {
GFacUtils.saveJobModel(processContext, jobModel);
}
} else {
taskStatus.setState(TaskState.FAILED);
if (jobFile == null) {
taskStatus.setReason("JobFile is null");
} else {
taskStatus.setReason("Job file doesn't exist");
}
}
} catch (ApplicationSettingsException e) {
String msg = "Error occurred while creating job descriptor";
log.error(msg, e);
taskStatus.setState(TaskState.FAILED);
taskStatus.setReason(msg);
ErrorModel errorModel = new ErrorModel();
errorModel.setActualErrorMessage(e.getMessage());
errorModel.setUserFriendlyMessage(msg);
taskContext.getTaskModel().setTaskErrors(Arrays.asList(errorModel));
} catch (AppCatalogException e) {
String msg = "Error while instantiating app catalog";
log.error(msg, e);
taskStatus.setState(TaskState.FAILED);
taskStatus.setReason(msg);
ErrorModel errorModel = new ErrorModel();
errorModel.setActualErrorMessage(e.getMessage());
errorModel.setUserFriendlyMessage(msg);
taskContext.getTaskModel().setTaskErrors(Arrays.asList(errorModel));
} catch (GFacException e) {
String msg = "Error occurred while submitting the job";
log.error(msg, e);
taskStatus.setState(TaskState.FAILED);
taskStatus.setReason(msg);
ErrorModel errorModel = new ErrorModel();
errorModel.setActualErrorMessage(e.getMessage());
errorModel.setUserFriendlyMessage(msg);
taskContext.getTaskModel().setTaskErrors(Arrays.asList(errorModel));
} catch (IOException e) {
String msg = "Error while reading the content of the job file";
log.error(msg, e);
taskStatus.setState(TaskState.FAILED);
taskStatus.setReason(msg);
ErrorModel errorModel = new ErrorModel();
errorModel.setActualErrorMessage(e.getMessage());
errorModel.setUserFriendlyMessage(msg);
taskContext.getTaskModel().setTaskErrors(Arrays.asList(errorModel));
}
return taskStatus;
}
Aggregations