use of alluxio.job.wire.TaskInfo in project alluxio by Alluxio.
the class TaskExecutorManager method cancelTask.
/**
 * Cancels the given task.
 *
 * <p>Nothing happens when no future is registered for the task, or when the task is already
 * marked {@code CANCELED}. Otherwise the task's future is cancelled (interruption allowed). The
 * task is recorded as {@code CANCELED} when that succeeds, or as {@code FAILED} with error type
 * {@code FailedCancel} when it does not. In both cases the task is then passed to
 * {@code finishTask}.
 *
 * @param jobId the job id
 * @param taskId the task id
 */
public synchronized void cancelTask(long jobId, long taskId) {
  Pair<Long, Long> taskKey = new Pair<>(jobId, taskId);
  TaskInfo info = mUnfinishedTasks.get(taskKey);
  if (!mTaskFutures.containsKey(taskKey)) {
    // job has finished, or failed, or canceled: there is no future left to cancel
    return;
  }
  if (info.getStatus().equals(Status.CANCELED)) {
    // already canceled, nothing more to do
    return;
  }
  boolean canceled = mTaskFutures.get(taskKey).cancel(true);
  if (canceled) {
    info.setStatus(Status.CANCELED);
    LOG.info("Task {} for job {} canceled", taskId, jobId);
  } else {
    // the future refused the cancellation, so surface that as a task failure
    info.setStatus(Status.FAILED);
    info.setErrorType("FailedCancel");
    info.setErrorMessage("Failed to cancel the task");
    LOG.info("Failed to cancel task {} for job {}", taskId, jobId);
  }
  finishTask(taskKey);
}
use of alluxio.job.wire.TaskInfo in project alluxio by Alluxio.
the class JobMaster method workerHeartbeat.
/**
 * Updates the tasks' status when a worker periodically heartbeats with the master, and sends the
 * commands for the worker to execute.
 *
 * <p>If no worker is registered under the id carried by {@code jobWorkerHealth}, a single
 * register command is returned and no other state is touched. Otherwise the worker's
 * last-updated time and health record are refreshed, every reported task is stamped with the
 * worker's host name, and the tasks are forwarded, grouped by job id, to the coordinator of the
 * corresponding plan (jobs without a coordinator are skipped).
 *
 * @param jobWorkerHealth the job worker health info
 * @param taskInfoList the list of the task information
 * @return the list of {@link JobCommand} to the worker
 * @throws ResourceExhaustedException declared by the signature; NOTE(review): nothing in this
 *         method throws it directly, so it presumably originates in a callee - confirm which one
 */
public List<JobCommand> workerHeartbeat(JobWorkerHealth jobWorkerHealth, List<TaskInfo> taskInfoList) throws ResourceExhaustedException {
  long workerId = jobWorkerHealth.getWorkerId();
  String hostname;
  // Run under shared lock for mWorkers
  try (LockResource workersLockShared = new LockResource(mWorkerRWLock.readLock())) {
    MasterWorkerInfo worker = mWorkers.getFirstByField(mIdIndex, workerId);
    if (worker == null) {
      // Unknown worker: ask it to register instead of processing the heartbeat
      return Collections.singletonList(JobCommand.newBuilder().setRegisterCommand(RegisterCommand.getDefaultInstance()).build());
    }
    hostname = worker.getWorkerAddress().getHost();
    // Update last-update-time of this particular worker under lock
    // to prevent lost worker detector clearing it under race
    worker.updateLastUpdatedTimeMs();
  }
  mWorkerHealth.put(workerId, jobWorkerHealth);
  // Group the reported task infos by the job they belong to
  Map<Long, List<TaskInfo>> taskInfosPerJob = new HashMap<>();
  for (TaskInfo taskInfo : taskInfoList) {
    taskInfo.setWorkerHost(hostname);
    Long jobId = taskInfo.getJobId();
    List<TaskInfo> jobTaskInfos = taskInfosPerJob.get(jobId);
    if (jobTaskInfos == null) {
      jobTaskInfos = new ArrayList<>();
      taskInfosPerJob.put(jobId, jobTaskInfos);
    }
    jobTaskInfos.add(taskInfo);
  }
  // Hand each group to its plan coordinator, when one exists
  for (Map.Entry<Long, List<TaskInfo>> taskInfosPair : taskInfosPerJob.entrySet()) {
    PlanCoordinator planCoordinator = mPlanTracker.getCoordinator(taskInfosPair.getKey());
    if (planCoordinator != null) {
      planCoordinator.updateTasks(taskInfosPair.getValue());
    }
  }
  return mCommandManager.pollAllPendingCommands(workerId);
}
use of alluxio.job.wire.TaskInfo in project alluxio by Alluxio.
the class PlanCoordinator method failTasksForWorker.
/**
 * Fails the plan's work on the specified worker after that worker is lost.
 *
 * <p>Does nothing if the plan has already finished or if no task ids are recorded for the worker.
 * Otherwise the first task of that worker that exists and has not finished is marked
 * {@code FAILED} with error type {@code JobWorkerLost}, after which the plan status is
 * recomputed. Later tasks in the list are not visited.
 *
 * @param workerId the id of the worker to fail tasks for
 */
public void failTasksForWorker(long workerId) {
  synchronized (mPlanInfo) {
    if (mPlanInfo.getStatus().isFinished()) {
      return;
    }
    List<Long> workerTaskIds = mWorkerIdToTaskIds.get(workerId);
    if (workerTaskIds == null) {
      return;
    }
    for (Long taskId : workerTaskIds) {
      TaskInfo task = mPlanInfo.getTaskInfo(taskId);
      boolean stillPending = task != null && !task.getStatus().isFinished();
      if (!stillPending) {
        continue;
      }
      String lostMessage = String.format("Job worker(%s) was lost before " + "the task(%d) could complete", task.getWorkerHost(), taskId);
      task.setStatus(Status.FAILED);
      task.setErrorType("JobWorkerLost");
      task.setErrorMessage(lostMessage);
      // one failed task is enough: refresh the plan status and stop scanning
      updateStatus();
      return;
    }
  }
}
use of alluxio.job.wire.TaskInfo in project alluxio by Alluxio.
the class PlanCoordinator method updateStatus.
/**
 * Recomputes the status of the job from the statuses of its tasks.
 *
 * <p>Tasks are scanned in list order. The first {@code FAILED} task fails the job and the first
 * {@code CANCELED} task cancels it (unless the job is already failed); either one ends the scan.
 * A {@code RUNNING} task moves the job to {@code RUNNING} unless it is already failed or
 * canceled. When every task is {@code COMPLETED} and the job is not yet completed, the join
 * method of the definition is run and the job is marked {@code COMPLETED}; any exception raised
 * during that step fails the job instead.
 */
private synchronized void updateStatus() {
  int finishedCount = 0;
  List<TaskInfo> tasks = mPlanInfo.getTaskInfoList();
  JobConfig jobConfig = mPlanInfo.getJobConfig();
  Preconditions.checkNotNull(jobConfig);
  FileSystem fs = mJobServerContext.getFileSystem();
  for (TaskInfo task : tasks) {
    Status taskStatus = task.getStatus();
    switch (taskStatus) {
      case FAILED:
        setJobAsFailed(task.getErrorType(), "Task execution failed: " + task.getErrorMessage());
        return;
      case CANCELED:
        if (mPlanInfo.getStatus() != Status.FAILED) {
          mPlanInfo.setStatus(Status.CANCELED);
          DistributedCmdMetrics.incrementForAllConfigsCancelStatus(jobConfig);
        }
        return;
      case RUNNING:
        // never move a failed or canceled job back to running
        if (!(mPlanInfo.getStatus() == Status.FAILED || mPlanInfo.getStatus() == Status.CANCELED)) {
          mPlanInfo.setStatus(Status.RUNNING);
        }
        break;
      case COMPLETED:
        finishedCount++;
        break;
      case CREATED:
        // do nothing
        break;
      default:
        throw new IllegalArgumentException("Unsupported status " + task.getStatus());
    }
  }
  if (finishedCount != tasks.size()) {
    // at least one task has not completed yet
    return;
  }
  if (mPlanInfo.getStatus() == Status.COMPLETED) {
    // the job was already completed by an earlier update
    return;
  }
  // all the tasks completed, run join
  try {
    // Try to join first, so that in case of failure we don't move to a completed state yet
    mPlanInfo.setResult(join(tasks));
    mPlanInfo.setStatus(Status.COMPLETED);
    // Increment the counter for Complete status when all the tasks in a job are completed.
    DistributedCmdMetrics.incrementForAllConfigsCompleteStatus(jobConfig, fs, new CountingRetry(5));
  } catch (Exception e) {
    LOG.warn("Job error when joining tasks Job Id={} Config={}", mPlanInfo.getId(), mPlanInfo.getJobConfig(), e);
    setJobAsFailed(ErrorUtils.getErrorType(e), e.getMessage());
  }
}
use of alluxio.job.wire.TaskInfo in project alluxio by Alluxio.
the class StatCommand method formatOutput.
/**
 * Renders a job's information as human-readable, newline-separated text.
 *
 * <p>The id, name, description and status are always printed; the error message and result are
 * printed only when present and non-empty. Without the {@code v} option the description is
 * abbreviated to 200 characters. With the {@code v} option the full description is printed and
 * one tab-indented section is added per child, including the worker host when the child is a
 * {@link TaskInfo}.
 *
 * @param cl the parsed command line, consulted for the {@code v} option
 * @param info the job information to format
 * @return the formatted text
 */
private String formatOutput(CommandLine cl, JobInfo info) {
  // The verbose flag drives both the description length and the per-child listing
  boolean verbose = cl.hasOption("v");
  StringBuilder output = new StringBuilder();
  output.append("ID: ").append(info.getId()).append("\n");
  output.append("Name: ").append(info.getName()).append("\n");
  output.append("Description: ");
  if (verbose) {
    output.append(info.getDescription());
  } else {
    output.append(StringUtils.abbreviate(info.getDescription(), 200));
  }
  output.append("\n");
  output.append("Status: ").append(info.getStatus()).append("\n");
  if (info.getErrorMessage() != null && !info.getErrorMessage().isEmpty()) {
    output.append("Error: ").append(info.getErrorMessage()).append("\n");
  }
  if (info.getResult() != null && !info.getResult().toString().isEmpty()) {
    output.append("Result: ").append(info.getResult().toString()).append("\n");
  }
  if (verbose) {
    for (JobInfo childInfo : info.getChildren()) {
      output.append("Task ").append(childInfo.getId()).append("\n");
      if (childInfo instanceof TaskInfo) {
        TaskInfo taskInfo = (TaskInfo) childInfo;
        if (taskInfo.getWorkerHost() != null) {
          output.append("\t").append("Worker: ").append(taskInfo.getWorkerHost()).append("\n");
        }
      }
      // Guard against a missing description, as is already done for the error message and result
      String childDescription = childInfo.getDescription();
      if (childDescription != null && !childDescription.isEmpty()) {
        output.append("\t").append("Description: ").append(StringUtils.abbreviate(childDescription, 200)).append("\n");
      }
      output.append("\t").append("Status: ").append(childInfo.getStatus()).append("\n");
      if (childInfo.getErrorMessage() != null && !childInfo.getErrorMessage().isEmpty()) {
        output.append("\t").append("Error: ").append(childInfo.getErrorMessage()).append("\n");
      }
      if (childInfo.getResult() != null) {
        output.append("\t").append("Result: ").append(childInfo.getResult()).append("\n");
      }
    }
  }
  return output.toString();
}
Aggregations