Search in sources :

Example 11 with TaskInfo

use of alluxio.job.wire.TaskInfo in project alluxio by Alluxio.

the class TaskExecutorManager method cancelTask.

/**
 * Attempts to cancel a task and then finishes it, whatever the outcome of the attempt.
 *
 * <p>The call is a no-op when no future is registered for the task or when the task is already
 * marked as canceled. Otherwise the task's future is cancelled with interruption allowed: on
 * success the task is marked {@code CANCELED}; on failure it is marked {@code FAILED} with the
 * error type {@code FailedCancel}.
 *
 * @param jobId the job id
 * @param taskId the task id
 */
public synchronized void cancelTask(long jobId, long taskId) {
    Pair<Long, Long> taskKey = new Pair<>(jobId, taskId);
    TaskInfo info = mUnfinishedTasks.get(taskKey);
    if (!mTaskFutures.containsKey(taskKey)) {
        // no future is tracked for this task, so there is nothing left to cancel
        return;
    }
    if (info.getStatus().equals(Status.CANCELED)) {
        // the task has already been canceled
        return;
    }
    Future<?> pending = mTaskFutures.get(taskKey);
    boolean cancelSucceeded = pending.cancel(true);
    if (cancelSucceeded) {
        info.setStatus(Status.CANCELED);
        LOG.info("Task {} for job {} canceled", taskId, jobId);
    } else {
        info.setStatus(Status.FAILED);
        info.setErrorType("FailedCancel");
        info.setErrorMessage("Failed to cancel the task");
        LOG.info("Failed to cancel task {} for job {}", taskId, jobId);
    }
    // The task is finished in both outcomes.
    finishTask(taskKey);
}
Also used : TaskInfo(alluxio.job.wire.TaskInfo) Pair(alluxio.collections.Pair)

Example 12 with TaskInfo

use of alluxio.job.wire.TaskInfo in project alluxio by Alluxio.

the class JobMaster method workerHeartbeat.

/**
 * Updates the tasks' status when a worker periodically heartbeats with the master, and sends the
 * commands for the worker to execute.
 *
 * <p>If the worker id is not known to the master, a single register command is returned and no
 * task information is processed.
 *
 * @param jobWorkerHealth the job worker health info
 * @param taskInfoList the list of the task information
 * @return the list of {@link JobCommand} to the worker
 * @throws ResourceExhaustedException declared by the signature; it is not thrown directly in this
 *         body (NOTE(review): presumably propagated from a callee -- confirm)
 */
public List<JobCommand> workerHeartbeat(JobWorkerHealth jobWorkerHealth, List<TaskInfo> taskInfoList) throws ResourceExhaustedException {
    long workerId = jobWorkerHealth.getWorkerId();
    String hostname;
    // Run under shared lock for mWorkers
    try (LockResource workersLockShared = new LockResource(mWorkerRWLock.readLock())) {
        MasterWorkerInfo worker = mWorkers.getFirstByField(mIdIndex, workerId);
        if (worker == null) {
            // Unknown worker: ask it to register instead of processing its task updates.
            return Collections.singletonList(JobCommand.newBuilder().setRegisterCommand(RegisterCommand.getDefaultInstance()).build());
        }
        hostname = worker.getWorkerAddress().getHost();
        // Update last-update-time of this particular worker under lock
        // to prevent lost worker detector clearing it under race
        worker.updateLastUpdatedTimeMs();
    }
    mWorkerHealth.put(workerId, jobWorkerHealth);
    // Group the reported task infos by job id, stamping each with the worker's host name.
    Map<Long, List<TaskInfo>> taskInfosPerJob = new HashMap<>();
    for (TaskInfo taskInfo : taskInfoList) {
        taskInfo.setWorkerHost(hostname);
        // computeIfAbsent creates a properly typed list on first sight of a job id, replacing the
        // raw-typed ArrayList and the repeated containsKey/put/get lookups.
        taskInfosPerJob.computeIfAbsent(taskInfo.getJobId(), jobId -> new ArrayList<>()).add(taskInfo);
    }
    // Forward each job's updates to its coordinator; jobs without a coordinator are skipped.
    for (Map.Entry<Long, List<TaskInfo>> taskInfosPair : taskInfosPerJob.entrySet()) {
        PlanCoordinator planCoordinator = mPlanTracker.getCoordinator(taskInfosPair.getKey());
        if (planCoordinator != null) {
            planCoordinator.updateTasks(taskInfosPair.getValue());
        }
    }
    return mCommandManager.pollAllPendingCommands(workerId);
}
Also used : ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) TaskInfo(alluxio.job.wire.TaskInfo) LockResource(alluxio.resource.LockResource) MasterWorkerInfo(alluxio.job.MasterWorkerInfo) AtomicLong(java.util.concurrent.atomic.AtomicLong) List(java.util.List) ArrayList(java.util.ArrayList) Map(java.util.Map) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) HashMap(java.util.HashMap) PlanCoordinator(alluxio.master.job.plan.PlanCoordinator)

Example 13 with TaskInfo

use of alluxio.job.wire.TaskInfo in project alluxio by Alluxio.

the class PlanCoordinator method failTasksForWorker.

/**
 * Marks the first unfinished task assigned to the given worker as failed and refreshes the plan
 * status.
 *
 * <p>Nothing happens when the plan has already finished, when no task ids are recorded for the
 * worker, or when every recorded task is unknown or already finished. Only the first unfinished
 * task is touched; the remaining tasks of the worker are left as they are.
 *
 * @param workerId the id of the worker to fail tasks for
 */
public void failTasksForWorker(long workerId) {
    synchronized (mPlanInfo) {
        if (mPlanInfo.getStatus().isFinished()) {
            return;
        }
        List<Long> workerTaskIds = mWorkerIdToTaskIds.get(workerId);
        if (workerTaskIds == null) {
            return;
        }
        for (Long currentId : workerTaskIds) {
            TaskInfo candidate = mPlanInfo.getTaskInfo(currentId);
            boolean unfinished = candidate != null && !candidate.getStatus().isFinished();
            if (!unfinished) {
                continue;
            }
            candidate.setStatus(Status.FAILED);
            candidate.setErrorType("JobWorkerLost");
            String lostMessage = String.format("Job worker(%s) was lost before " + "the task(%d) could complete", candidate.getWorkerHost(), currentId);
            candidate.setErrorMessage(lostMessage);
            // One failed task is enough: recompute the plan status and stop scanning.
            updateStatus();
            return;
        }
    }
}
Also used : TaskInfo(alluxio.job.wire.TaskInfo)

Example 14 with TaskInfo

use of alluxio.job.wire.TaskInfo in project alluxio by Alluxio.

the class PlanCoordinator method updateStatus.

/**
 * Updates the status of the job. When all the tasks are completed, run the join method in the
 * definition.
 *
 * <p>Tasks are scanned in list order and the first FAILED or CANCELED task ends the scan:
 * a FAILED task marks the job as failed, a CANCELED task marks the plan as canceled unless the
 * plan is already FAILED. A RUNNING task moves the plan to RUNNING unless the plan is already
 * FAILED or CANCELED. If every task is COMPLETED (which also holds for an empty task list) and
 * the plan is not yet COMPLETED, the task results are joined and the plan is marked COMPLETED;
 * an exception during that step marks the job as failed instead.
 *
 * @throws IllegalArgumentException if a task reports a status not handled by the switch below
 */
private synchronized void updateStatus() {
    // Number of tasks seen in the COMPLETED state during this scan.
    int completed = 0;
    List<TaskInfo> taskInfoList = mPlanInfo.getTaskInfoList();
    JobConfig config = mPlanInfo.getJobConfig();
    Preconditions.checkNotNull(config);
    FileSystem fileSystem = mJobServerContext.getFileSystem();
    for (TaskInfo info : taskInfoList) {
        Status status = info.getStatus();
        switch(status) {
            case FAILED:
                // A single failed task fails the whole job; stop scanning.
                setJobAsFailed(info.getErrorType(), "Task execution failed: " + info.getErrorMessage());
                return;
            case CANCELED:
                // Do not overwrite an existing FAILED plan status with CANCELED.
                if (mPlanInfo.getStatus() != Status.FAILED) {
                    mPlanInfo.setStatus(Status.CANCELED);
                    DistributedCmdMetrics.incrementForAllConfigsCancelStatus(config);
                }
                return;
            case RUNNING:
                // FAILED and CANCELED plan statuses are kept; anything else becomes RUNNING.
                if (mPlanInfo.getStatus() != Status.FAILED && mPlanInfo.getStatus() != Status.CANCELED) {
                    mPlanInfo.setStatus(Status.RUNNING);
                }
                break;
            case COMPLETED:
                completed++;
                break;
            case CREATED:
                // do nothing
                break;
            default:
                throw new IllegalArgumentException("Unsupported status " + info.getStatus());
        }
    }
    if (completed == taskInfoList.size()) {
        // Already marked COMPLETED: skip so the join and the metric update are not repeated.
        if (mPlanInfo.getStatus() == Status.COMPLETED) {
            return;
        }
        // all the tasks completed, run join
        try {
            // Try to join first, so that in case of failure we don't move to a completed state yet
            mPlanInfo.setResult(join(taskInfoList));
            mPlanInfo.setStatus(Status.COMPLETED);
            // Increment the counter for Complete status when all the tasks in a job are completed.
            DistributedCmdMetrics.incrementForAllConfigsCompleteStatus(config, fileSystem, new CountingRetry(5));
        } catch (Exception e) {
            // Any failure in the join, status update or metric update fails the job.
            LOG.warn("Job error when joining tasks Job Id={} Config={}", mPlanInfo.getId(), mPlanInfo.getJobConfig(), e);
            setJobAsFailed(ErrorUtils.getErrorType(e), e.getMessage());
        }
    }
}
Also used : TaskInfo(alluxio.job.wire.TaskInfo) Status(alluxio.job.wire.Status) CountingRetry(alluxio.retry.CountingRetry) FileSystem(alluxio.client.file.FileSystem) BatchedJobConfig(alluxio.job.plan.BatchedJobConfig) JobConfig(alluxio.job.JobConfig) JobDoesNotExistException(alluxio.exception.JobDoesNotExistException)

Example 15 with TaskInfo

use of alluxio.job.wire.TaskInfo in project alluxio by Alluxio.

the class StatCommand method formatOutput.

/**
 * Renders a job's information as newline-separated, human-readable text.
 *
 * <p>The id, name, description and status are always printed; the error message and result are
 * printed only when present and non-empty. Without the {@code v} option the job description is
 * abbreviated to 200 characters and child entries are omitted. With the {@code v} option the
 * full description is printed, followed by one tab-indented section per child.
 *
 * @param cl the parsed command line, consulted for the {@code v} option
 * @param info the job information to render
 * @return the formatted text
 */
private String formatOutput(CommandLine cl, JobInfo info) {
    boolean verbose = cl.hasOption("v");
    StringBuilder output = new StringBuilder();
    output.append("ID: ").append(info.getId()).append("\n");
    output.append("Name: ").append(info.getName()).append("\n");
    output.append("Description: ");
    if (verbose) {
        output.append(info.getDescription());
    } else {
        output.append(StringUtils.abbreviate(info.getDescription(), 200));
    }
    output.append("\n");
    output.append("Status: ").append(info.getStatus()).append("\n");
    if (info.getErrorMessage() != null && !info.getErrorMessage().isEmpty()) {
        output.append("Error: ").append(info.getErrorMessage()).append("\n");
    }
    if (info.getResult() != null && !info.getResult().toString().isEmpty()) {
        output.append("Result: ").append(info.getResult().toString()).append("\n");
    }
    if (verbose) {
        for (JobInfo childInfo : info.getChildren()) {
            output.append("Task ").append(childInfo.getId()).append("\n");
            if (childInfo instanceof TaskInfo) {
                TaskInfo taskInfo = (TaskInfo) childInfo;
                if (taskInfo.getWorkerHost() != null) {
                    output.append("\t").append("Worker: ").append(taskInfo.getWorkerHost()).append("\n");
                }
            }
            // Null-guard the description like the other optional fields, so a child without a
            // description is skipped instead of aborting the whole listing with an NPE.
            if (childInfo.getDescription() != null && !childInfo.getDescription().isEmpty()) {
                output.append("\t").append("Description: ").append(StringUtils.abbreviate(childInfo.getDescription(), 200)).append("\n");
            }
            output.append("\t").append("Status: ").append(childInfo.getStatus()).append("\n");
            if (childInfo.getErrorMessage() != null && !childInfo.getErrorMessage().isEmpty()) {
                output.append("\t").append("Error: ").append(childInfo.getErrorMessage()).append("\n");
            }
            if (childInfo.getResult() != null) {
                output.append("\t").append("Result: ").append(childInfo.getResult()).append("\n");
            }
        }
    }
    return output.toString();
}
Also used : TaskInfo(alluxio.job.wire.TaskInfo) JobInfo(alluxio.job.wire.JobInfo)

Aggregations

TaskInfo (alluxio.job.wire.TaskInfo)15 Pair (alluxio.collections.Pair)5 JobConfig (alluxio.job.JobConfig)2 BatchedJobConfig (alluxio.job.plan.BatchedJobConfig)2 Status (alluxio.job.wire.Status)2 ArrayList (java.util.ArrayList)2 FileSystem (alluxio.client.file.FileSystem)1 AlluxioException (alluxio.exception.AlluxioException)1 JobDoesNotExistException (alluxio.exception.JobDoesNotExistException)1 JobCommand (alluxio.grpc.JobCommand)1 JobInfo (alluxio.grpc.JobInfo)1 MasterWorkerInfo (alluxio.job.MasterWorkerInfo)1 JobInfo (alluxio.job.wire.JobInfo)1 JobWorkerHealth (alluxio.job.wire.JobWorkerHealth)1 PlanCoordinator (alluxio.master.job.plan.PlanCoordinator)1 LockResource (alluxio.resource.LockResource)1 CountingRetry (alluxio.retry.CountingRetry)1 WorkerInfo (alluxio.wire.WorkerInfo)1 IOException (java.io.IOException)1 Serializable (java.io.Serializable)1