use of org.ow2.proactive.scheduler.job.InternalJob in project scheduling by ow2-proactive.
the class LiveJobs method restartAllInErrorTasks.
/**
 * Asks for a restart of every task of the given job, then refreshes the job state.
 * <p>
 * The job is locked for the whole operation and unlocked in all cases. Each task
 * returned by {@code getTasks()} is passed to {@code restartInErrorTask}; a task that
 * turns out to be unknown is logged on the three loggers and does not interrupt the
 * loop. Afterwards the job status is re-evaluated, the job and its tasks are persisted
 * and a {@code JOB_RESTARTED_FROM_ERROR} event is published.
 *
 * @param jobId identifier of the job whose tasks must be restarted
 * @return {@code Boolean.FALSE} when the job could not be locked ({@code lockJob}
 *         returned {@code null}), {@code Boolean.TRUE} otherwise
 */
public Boolean restartAllInErrorTasks(JobId jobId) {
    JobData lockedJobData = lockJob(jobId);
    if (lockedJobData == null) {
        return Boolean.FALSE;
    }

    try {
        InternalJob lockedJob = lockedJobData.job;

        for (TaskState state : lockedJob.getTasks()) {
            try {
                restartInErrorTask(jobId, state.getName());
            } catch (UnknownTaskException unknownTask) {
                // keep going with the remaining tasks, the failure is only reported
                logger.error("", unknownTask);
                jlogger.error(jobId, "", unknownTask);
                tlogger.error(state.getId(), "", unknownTask);
            }
        }

        setJobStatusToInErrorIfNotPaused(lockedJob);
        dbManager.updateJobAndTasksState(lockedJob);
        updateJobInSchedulerState(lockedJob, SchedulerEvent.JOB_RESTARTED_FROM_ERROR);

        return Boolean.TRUE;
    } finally {
        lockedJobData.unlock();
    }
}
use of org.ow2.proactive.scheduler.job.InternalJob in project scheduling by ow2-proactive.
the class LiveJobs method jobRecovered.
/**
 * Registers a recovered job and re-registers those of its tasks that are still running.
 * <p>
 * The job is stored in {@code jobs} under its id. Every task whose status is
 * {@code TaskStatus.RUNNING} is logged and added to {@code runningTasksData}, keyed by
 * its wrapped task id, together with the job owner, the job credentials and the
 * launcher taken from the task's executer information.
 *
 * @param job the job that has been recovered
 */
void jobRecovered(InternalJob job) {
    jobs.put(job.getId(), new JobData(job));

    for (InternalTask recoveredTask : job.getITasks()) {
        if (recoveredTask.getStatus() != TaskStatus.RUNNING) {
            // only running tasks need to be tracked again
            continue;
        }

        String recoveryMessage = "Recover task " + recoveredTask.getId() + " (" + recoveredTask.getName() +
                                 ") of job " + job.getId() + " (" + job.getName() + ")";
        logger.info(recoveryMessage);

        runningTasksData.put(TaskIdWrapper.wrap(recoveredTask.getId()),
                             new RunningTaskData(recoveredTask,
                                                 job.getOwner(),
                                                 job.getCredentials(),
                                                 recoveredTask.getExecuterInformation().getLauncher()));
    }
}
use of org.ow2.proactive.scheduler.job.InternalJob in project scheduling by ow2-proactive.
the class LiveJobs method updateTasksInSchedulerState.
/**
 * Notifies the listener that each of the given tasks went from running to finished.
 * <p>
 * For every task id, the task is looked up in the job, a copy of its task info is
 * made, and a {@code TASK_RUNNING_TO_FINISHED} notification carrying that copy is
 * sent on behalf of the job owner. An id that the job does not know is logged and
 * skipped.
 *
 * @param job           the job owning the tasks
 * @param tasksToUpdate identifiers of the tasks to publish
 */
private void updateTasksInSchedulerState(InternalJob job, Set<TaskId> tasksToUpdate) {
    for (TaskId taskId : tasksToUpdate) {
        try {
            InternalTask internalTask = job.getTask(taskId);
            TaskInfoImpl currentInfo = (TaskInfoImpl) internalTask.getTaskInfo();
            // publish a copy rather than the task's own info object
            TaskInfo infoCopy = new TaskInfoImpl(currentInfo);
            listener.taskStateUpdated(job.getOwner(),
                                      new NotificationData<>(SchedulerEvent.TASK_RUNNING_TO_FINISHED, infoCopy));
        } catch (UnknownTaskException unknownTask) {
            logger.error(unknownTask);
        }
    }
}
use of org.ow2.proactive.scheduler.job.InternalJob in project scheduling by ow2-proactive.
the class SchedulingMethodImpl method selectAndStartTasks.
/**
 * Walks the task list returned by the policy and tries to start tasks on the free resources.
 * <p>
 * The list is consumed in batches whose size is requested from the current number of
 * free resources. For each batch, groups of compatible tasks are extracted, nodes are
 * requested for them via getRMNodes, and createExecution is called for each task the
 * policy accepts. Processing stops when no free resource is left, when the list is
 * exhausted, or when activeObjectCreationRetryTimeNumber reaches 0.
 *
 * @param currentPolicy the policy asked (isTaskExecutable) whether a task may run on the obtained nodes
 * @param jobMap job descriptors indexed by job id, used to find the job owning each task
 * @param freeResources URLs of the free nodes; modified in place: URLs of obtained nodes are
 *                      removed, URLs of nodes given back through releaseNodes are added again
 * @param fullListOfTaskRetrievedFromPolicy every eligible task returned by the policy
 * @return the number of tasks for which createExecution returned true
 */
private int selectAndStartTasks(Policy currentPolicy, Map<JobId, JobDescriptor> jobMap, Set<String> freeResources, LinkedList<EligibleTaskDescriptor> fullListOfTaskRetrievedFromPolicy) {
int numberOfTaskStarted = 0;
VariableBatchSizeIterator progressiveIterator = new VariableBatchSizeIterator(fullListOfTaskRetrievedFromPolicy);
// outer loop: one iteration per batch of tasks taken from the policy list
while (progressiveIterator.hasMoreElements() && !freeResources.isEmpty()) {
// copy of the next batch; this local list is drained by the loops below
LinkedList<EligibleTaskDescriptor> taskRetrievedFromPolicy = new LinkedList<>(progressiveIterator.getNextElements(freeResources.size()));
if (logger.isDebugEnabled()) {
loggingEligibleTasksDetails(fullListOfTaskRetrievedFromPolicy, taskRetrievedFromPolicy);
}
updateVariablesForTasksToSchedule(taskRetrievedFromPolicy);
for (EligibleTaskDescriptor etd : taskRetrievedFromPolicy) {
// load and initialize the executable container
loadAndInit(((EligibleTaskDescriptorImpl) etd).getInternal());
}
// batch loop: one iteration per group of compatible tasks extracted from the batch
while (!taskRetrievedFromPolicy.isEmpty()) {
if (freeResources.isEmpty()) {
break;
}
// get the next compatible tasks from the whole returned policy tasks
LinkedList<EligibleTaskDescriptor> tasksToSchedule = new LinkedList<>();
int neededResourcesNumber = 0;
while (!taskRetrievedFromPolicy.isEmpty() && neededResourcesNumber == 0) {
// the loop will search for the next compatible tasks until it finds something
neededResourcesNumber = getNextcompatibleTasks(jobMap, taskRetrievedFromPolicy, freeResources.size(), tasksToSchedule);
}
if (logger.isDebugEnabled()) {
logger.debug("tasksToSchedule : " + tasksToSchedule);
}
// NOTE(review): unlike the debug call above, this one is not guarded by isDebugEnabled,
// so the message string is concatenated on every pass
logger.debug("required number of nodes : " + neededResourcesNumber);
// nothing to schedule in what remains of this batch: leave the batch loop
if (neededResourcesNumber == 0 || tasksToSchedule.isEmpty()) {
break;
}
NodeSet nodeSet = getRMNodes(jobMap, neededResourcesNumber, tasksToSchedule, freeResources);
if (nodeSet != null) {
// the obtained nodes are no longer counted as free
freeResources.removeAll(nodeSet.getAllNodesUrls());
}
// start selected tasks
Node node = null;
InternalJob currentJob = null;
try {
while (nodeSet != null && !nodeSet.isEmpty()) {
// one task descriptor is consumed per iteration, whether or not it ends up started
EligibleTaskDescriptor taskDescriptor = tasksToSchedule.removeFirst();
currentJob = ((JobDescriptorImpl) jobMap.get(taskDescriptor.getJobId())).getInternal();
InternalTask internalTask = ((EligibleTaskDescriptorImpl) taskDescriptor).getInternal();
// a task refused by the policy is simply skipped
if (currentPolicy.isTaskExecutable(nodeSet, taskDescriptor)) {
// create launcher and try to start the task
node = nodeSet.get(0);
if (createExecution(nodeSet, node, currentJob, internalTask, taskDescriptor)) {
numberOfTaskStarted++;
}
}
// if every task that should be launched have been removed
if (tasksToSchedule.isEmpty()) {
// get back unused nodes to the RManager
if (!nodeSet.isEmpty()) {
releaseNodes(currentJob, nodeSet);
freeResources.addAll(nodeSet.getAllNodesUrls());
}
// and leave the loop
break;
}
}
} catch (ActiveObjectCreationException e1) {
// Something went wrong with the active object creation (createLauncher)
logger.warn("An exception occured while creating the task launcher.", e1);
// so try to get back every remaining nodes to the resource manager
// NOTE(review): currentJob is the last job assigned in the loop above; it is still null
// if the failure happened before the first assignment
try {
releaseNodes(currentJob, nodeSet);
freeResources.addAll(nodeSet.getAllNodesUrls());
} catch (Exception e2) {
logger.info("Unable to get back the nodeSet to the RM", e2);
}
// consume one retry; when none is left, leave the batch loop (the check after that
// loop then leaves the outer loop as well)
if (--activeObjectCreationRetryTimeNumber == 0) {
break;
}
} catch (Exception e1) {
// if we are here, it is that something happened while launching the current task.
logger.warn("An exception occured while starting task.", e1);
// so try to get back every remaining nodes to the resource manager
try {
releaseNodes(currentJob, nodeSet);
freeResources.addAll(nodeSet.getAllNodesUrls());
} catch (Exception e2) {
logger.info("Unable to get back the nodeSet to the RM", e2);
}
}
}
// stop taking new batches when there is no free resource left
if (freeResources.isEmpty()) {
break;
}
// stop taking new batches when the launcher creation retries are exhausted
if (activeObjectCreationRetryTimeNumber == 0) {
break;
}
}
return numberOfTaskStarted;
}
use of org.ow2.proactive.scheduler.job.InternalJob in project scheduling by ow2-proactive.
the class SchedulingMethodImpl method createExecution.
/**
 * Create launcher and try to start the task.
 * <p>
 * The job is locked at the beginning of the call and unlocked in all cases before
 * returning. The task is only started when the lock was obtained, the task is not
 * paused and {@code nodeSet} holds at least as many nodes as the task needs.
 * <p>
 * On success the node used for the launcher is removed from {@code nodeSet}; for a
 * parallel task all remaining nodes are moved out of {@code nodeSet} as well.
 *
 * @param nodeSet the node set containing every available nodes that can be used for execution
 * @param node the node on which to start the task
 * @param job the job that owns the task to be started
 * @param task the task to be started
 * @param taskDescriptor the descriptor of the task to be started
 * @return {@code true} if the start action was submitted and the task was finalized as
 *         started, {@code false} if the preconditions above were not met
 * @throws Exception any failure raised while creating the launcher or submitting the
 *         start action; a best-effort release of the involved nodes is attempted first,
 *         and a failure of that release is attached to the thrown exception as a
 *         suppressed exception
 */
protected boolean createExecution(NodeSet nodeSet, Node node, InternalJob job, InternalTask task,
        TaskDescriptor taskDescriptor) throws Exception {
    TaskLauncher launcher = null;
    LiveJobs.JobData jobData = null;
    try {
        jobData = schedulingService.lockJob(job.getId());

        // enough nodes, task is not paused, and the job could be locked
        if (nodeSet.size() >= task.getNumberOfNodesNeeded() && (task.getStatus() != TaskStatus.PAUSED) &&
            (jobData != null)) {
            // start dataspace app for this job
            DataSpaceServiceStarter dsStarter = schedulingService.getInfrastructure().getDataSpaceServiceStarter();
            job.startDataSpaceApplication(dsStarter.getNamingService(), ImmutableList.of(task));

            NodeSet nodes = new NodeSet();
            try {
                // create launcher
                launcher = task.createLauncher(node);

                // launcher creation succeeded: reset the retry budget
                activeObjectCreationRetryTimeNumber = ACTIVEOBJECT_CREATION_RETRY_TIME_NUMBER;

                nodeSet.remove(0);

                // we will need to update this code once topology will be allowed for single-node task
                if (task.isParallel()) {
                    nodes = new NodeSet(nodeSet);
                    task.getExecuterInformation().addNodes(nodes);
                    nodeSet.clear();
                }

                // set nodes in the executable container
                task.getExecutableContainer().setNodes(nodes);

                tlogger.debug(task.getId(), "deploying");

                // above 500 parent tasks, it is worth adjusting.
                int parentCount = taskDescriptor.getParents().size();
                if (parentCount > 500) {
                    dotaskActionTimeout = (int) (parentCount / 500.0 *
                                                 PASchedulerProperties.SCHEDULER_STARTTASK_TIMEOUT.getValueAsInt());
                } else {
                    // reset the dotaskActionTimeout to its default value otherwise.
                    dotaskActionTimeout = PASchedulerProperties.SCHEDULER_STARTTASK_TIMEOUT.getValueAsInt();
                }

                boolean taskRecoverable = getRMProxiesManager().getRmProxy().areNodesRecoverable(nodes);
                String terminateNotificationNodeURL = PAActiveObject.getActiveObjectNode(terminateNotification)
                                                                    .getNodeInformation()
                                                                    .getURL();
                TaskRecoveryData taskRecoveryData = new TaskRecoveryData(terminateNotificationNodeURL,
                                                                         taskRecoverable);

                threadPool.submitWithTimeout(new TimedDoTaskAction(job,
                                                                   taskDescriptor,
                                                                   launcher,
                                                                   schedulingService,
                                                                   terminateNotification,
                                                                   corePrivateKey,
                                                                   taskRecoveryData),
                                             dotaskActionTimeout,
                                             TimeUnit.MILLISECONDS);

                // we advertise here that the task is started, however
                // this is not entirely true: the only thing we are sure
                // about at this point is that we submitted to the thread
                // pool the action that will call the "doTask" of the task
                // launcher. There is thus a small gap here where the task
                // is seen as started whereas it is not yet started. We
                // cannot easily move the task started notification because
                // 1) it makes the job lock acquisition less predictable
                // (because the TimeDoTaskAction will have to compete with
                // the SchedulingMethodImpl)
                // and more importantly 2) the
                // SchedulingMethodImpl#createExecution may happen to be
                // called a second time for the task that is currently being
                // started by the TimedDoTaskAction.
                finalizeStarting(job, task, node, launcher);
                return true;
            } catch (Exception t) {
                try {
                    // if there was a problem, free nodeSet for multi-nodes task
                    nodes.add(node);
                    releaseNodes(job, nodes);
                } catch (Throwable ni) {
                    // The release stays best-effort, but its failure is no longer lost:
                    // it travels with the original exception so that whoever logs the
                    // latter also sees why the nodes could not be given back.
                    if (ni != t) {
                        t.addSuppressed(ni);
                    }
                }
                throw t;
            }
        } else {
            return false;
        }
    } finally {
        if (jobData != null) {
            jobData.unlock();
        }
    }
}
Aggregations