Example 21 with Task

use of org.apache.hadoop.mapreduce.v2.app.job.Task in project hadoop by apache.

the class TaskImpl method recover.

/**
   * Recover a completed task from a previous application attempt
   * @param taskInfo recovered info about the task
   * @param committer the output committer used when recovering task output
   * @param recoverTaskOutput whether to recover task outputs
   * @return state of the task after recovery
   */
private TaskStateInternal recover(TaskInfo taskInfo, OutputCommitter committer, boolean recoverTaskOutput) {
    LOG.info("Recovering task " + taskId + " from prior app attempt, status was " + taskInfo.getTaskStatus());
    scheduledTime = taskInfo.getStartTime();
    sendTaskStartedEvent();
    Collection<TaskAttemptInfo> attemptInfos = taskInfo.getAllTaskAttempts().values();
    if (attemptInfos.size() > 0) {
        metrics.launchedTask(this);
    }
    // recover the attempts for this task in the order they finished
    // so task attempt completion events are ordered properly
    int savedNextAttemptNumber = nextAttemptNumber;
    ArrayList<TaskAttemptInfo> taInfos = new ArrayList<TaskAttemptInfo>(taskInfo.getAllTaskAttempts().values());
    Collections.sort(taInfos, TA_INFO_COMPARATOR);
    for (TaskAttemptInfo taInfo : taInfos) {
        nextAttemptNumber = taInfo.getAttemptId().getId();
        TaskAttemptImpl attempt = addAttempt(Avataar.VIRGIN);
        // handle the recovery inline so attempts complete before task does
        attempt.handle(new TaskAttemptRecoverEvent(attempt.getID(), taInfo, committer, recoverTaskOutput));
        finishedAttempts.add(attempt.getID());
        TaskAttemptCompletionEventStatus taces = null;
        TaskAttemptState attemptState = attempt.getState();
        switch(attemptState) {
            case FAILED:
                taces = TaskAttemptCompletionEventStatus.FAILED;
                break;
            case KILLED:
                taces = TaskAttemptCompletionEventStatus.KILLED;
                break;
            case SUCCEEDED:
                taces = TaskAttemptCompletionEventStatus.SUCCEEDED;
                break;
            default:
                throw new IllegalStateException("Unexpected attempt state during recovery: " + attemptState);
        }
        if (attemptState == TaskAttemptState.FAILED) {
            failedAttempts.add(attempt.getID());
            if (failedAttempts.size() >= maxAttempts) {
                taces = TaskAttemptCompletionEventStatus.TIPFAILED;
            }
        }
        // TODO: this shouldn't be necessary after MAPREDUCE-4330
        if (successfulAttempt == null) {
            handleTaskAttemptCompletion(attempt.getID(), taces);
            if (attemptState == TaskAttemptState.SUCCEEDED) {
                successfulAttempt = attempt.getID();
            }
        }
    }
    nextAttemptNumber = savedNextAttemptNumber;
    TaskStateInternal taskState = TaskStateInternal.valueOf(taskInfo.getTaskStatus());
    switch(taskState) {
        case SUCCEEDED:
            if (successfulAttempt != null) {
                sendTaskSucceededEvents();
            } else {
                LOG.info("Missing successful attempt for task " + taskId + ", recovering as RUNNING");
                // there must have been a fetch failure and the retry wasn't complete
                taskState = TaskStateInternal.RUNNING;
                metrics.runningTask(this);
                addAndScheduleAttempt(Avataar.VIRGIN);
            }
            break;
        case FAILED:
        case KILLED:
            {
                if (taskState == TaskStateInternal.KILLED && attemptInfos.size() == 0) {
                    metrics.endWaitingTask(this);
                }
                TaskFailedEvent tfe = new TaskFailedEvent(taskInfo.getTaskId(), taskInfo.getFinishTime(), taskInfo.getTaskType(), taskInfo.getError(), taskInfo.getTaskStatus(), taskInfo.getFailedDueToAttemptId(), taskInfo.getCounters());
                eventHandler.handle(new JobHistoryEvent(taskId.getJobId(), tfe));
                eventHandler.handle(new JobTaskEvent(taskId, getExternalState(taskState)));
                break;
            }
        default:
            throw new java.lang.AssertionError("Unexpected recovered task state: " + taskState);
    }
    return taskState;
}
Also used : TaskStateInternal(org.apache.hadoop.mapreduce.v2.app.job.TaskStateInternal) ArrayList(java.util.ArrayList) TaskAttemptCompletionEventStatus(org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptCompletionEventStatus) JobHistoryEvent(org.apache.hadoop.mapreduce.jobhistory.JobHistoryEvent) TaskAttemptState(org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptState) JobTaskEvent(org.apache.hadoop.mapreduce.v2.app.job.event.JobTaskEvent) TaskAttemptInfo(org.apache.hadoop.mapreduce.jobhistory.JobHistoryParser.TaskAttemptInfo) TaskFailedEvent(org.apache.hadoop.mapreduce.jobhistory.TaskFailedEvent) TaskAttemptRecoverEvent(org.apache.hadoop.mapreduce.v2.app.job.event.TaskAttemptRecoverEvent)
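
The recovery loop above sorts recovered attempts with TA_INFO_COMPARATOR so completion events replay in finish order, but the comparator itself is not part of this snippet. A minimal sketch of what such a finish-time ordering might look like; the real field lives in TaskImpl and may differ in detail:

import java.util.Comparator;
import org.apache.hadoop.mapreduce.jobhistory.JobHistoryParser.TaskAttemptInfo;

// Hypothetical stand-in for TA_INFO_COMPARATOR: order recovered attempts
// by the time they finished, so completion events are emitted in order.
final class FinishTimeOrder {
    static final Comparator<TaskAttemptInfo> BY_FINISH_TIME =
        (a, b) -> Long.compare(a.getFinishTime(), b.getFinishTime());
}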

Example 22 with Task

use of org.apache.hadoop.mapreduce.v2.app.job.Task in project hadoop by apache.

the class TaskAttemptImpl method recover.

@SuppressWarnings("unchecked")
public TaskAttemptStateInternal recover(TaskAttemptInfo taInfo, OutputCommitter committer, boolean recoverOutput) {
    ContainerId containerId = taInfo.getContainerId();
    NodeId containerNodeId = NodeId.fromString(taInfo.getHostname() + ":" + taInfo.getPort());
    String nodeHttpAddress = StringInterner.weakIntern(taInfo.getHostname() + ":" + taInfo.getHttpPort());
    // Resource/Priority/Tokens are only needed while launching the container
    // on an NM; these are already completed tasks, so set them to null
    container = Container.newInstance(containerId, containerNodeId, nodeHttpAddress, null, null, null);
    computeRackAndLocality();
    launchTime = taInfo.getStartTime();
    finishTime = (taInfo.getFinishTime() != -1) ? taInfo.getFinishTime() : clock.getTime();
    shufflePort = taInfo.getShufflePort();
    trackerName = taInfo.getHostname();
    httpPort = taInfo.getHttpPort();
    sendLaunchedEvents();
    reportedStatus.id = attemptId;
    reportedStatus.progress = 1.0f;
    reportedStatus.counters = taInfo.getCounters();
    reportedStatus.stateString = taInfo.getState();
    reportedStatus.phase = Phase.CLEANUP;
    reportedStatus.mapFinishTime = taInfo.getMapFinishTime();
    reportedStatus.shuffleFinishTime = taInfo.getShuffleFinishTime();
    reportedStatus.sortFinishTime = taInfo.getSortFinishTime();
    addDiagnosticInfo(taInfo.getError());
    boolean needToClean = false;
    String recoveredState = taInfo.getTaskStatus();
    if (recoverOutput && TaskAttemptState.SUCCEEDED.toString().equals(recoveredState)) {
        TaskAttemptContext tac = new TaskAttemptContextImpl(conf, TypeConverter.fromYarn(attemptId));
        try {
            committer.recoverTask(tac);
            LOG.info("Recovered output from task attempt " + attemptId);
        } catch (Exception e) {
            LOG.error("Unable to recover task attempt " + attemptId, e);
            LOG.info("Task attempt " + attemptId + " will be recovered as KILLED");
            recoveredState = TaskAttemptState.KILLED.toString();
            needToClean = true;
        }
    }
    TaskAttemptStateInternal attemptState;
    if (TaskAttemptState.SUCCEEDED.toString().equals(recoveredState)) {
        attemptState = TaskAttemptStateInternal.SUCCEEDED;
        reportedStatus.taskState = TaskAttemptState.SUCCEEDED;
        eventHandler.handle(createJobCounterUpdateEventTASucceeded(this));
        logAttemptFinishedEvent(attemptState);
    } else if (TaskAttemptState.FAILED.toString().equals(recoveredState)) {
        attemptState = TaskAttemptStateInternal.FAILED;
        reportedStatus.taskState = TaskAttemptState.FAILED;
        eventHandler.handle(createJobCounterUpdateEventTAFailed(this, false));
        TaskAttemptUnsuccessfulCompletionEvent tauce = createTaskAttemptUnsuccessfulCompletionEvent(this, TaskAttemptStateInternal.FAILED);
        eventHandler.handle(new JobHistoryEvent(attemptId.getTaskId().getJobId(), tauce));
    } else {
        if (!TaskAttemptState.KILLED.toString().equals(recoveredState)) {
            if (String.valueOf(recoveredState).isEmpty()) {
                LOG.info("TaskAttempt" + attemptId + " had not completed, recovering as KILLED");
            } else {
                LOG.warn("TaskAttempt " + attemptId + " found in unexpected state " + recoveredState + ", recovering as KILLED");
            }
            addDiagnosticInfo("Killed during application recovery");
            needToClean = true;
        }
        attemptState = TaskAttemptStateInternal.KILLED;
        reportedStatus.taskState = TaskAttemptState.KILLED;
        eventHandler.handle(createJobCounterUpdateEventTAKilled(this, false));
        TaskAttemptUnsuccessfulCompletionEvent tauce = createTaskAttemptUnsuccessfulCompletionEvent(this, TaskAttemptStateInternal.KILLED);
        eventHandler.handle(new JobHistoryEvent(attemptId.getTaskId().getJobId(), tauce));
    }
    if (needToClean) {
        TaskAttemptContext tac = new TaskAttemptContextImpl(conf, TypeConverter.fromYarn(attemptId));
        try {
            committer.abortTask(tac);
        } catch (Exception e) {
            LOG.warn("Task cleanup failed for attempt " + attemptId, e);
        }
    }
    return attemptState;
}
Also used : TaskAttemptStateInternal(org.apache.hadoop.mapreduce.v2.app.job.TaskAttemptStateInternal) ContainerId(org.apache.hadoop.yarn.api.records.ContainerId) TaskAttemptContextImpl(org.apache.hadoop.mapred.TaskAttemptContextImpl) NodeId(org.apache.hadoop.yarn.api.records.NodeId) JobHistoryEvent(org.apache.hadoop.mapreduce.jobhistory.JobHistoryEvent) TaskAttemptContext(org.apache.hadoop.mapreduce.TaskAttemptContext) IOException(java.io.IOException) UnknownHostException(java.net.UnknownHostException) YarnRuntimeException(org.apache.hadoop.yarn.exceptions.YarnRuntimeException) InvalidStateTransitionException(org.apache.hadoop.yarn.state.InvalidStateTransitionException) TaskAttemptUnsuccessfulCompletionEvent(org.apache.hadoop.mapreduce.jobhistory.TaskAttemptUnsuccessfulCompletionEvent)
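
The core of this method is the try/recover/abort dance around the OutputCommitter: a SUCCEEDED attempt's output is recovered if possible, and anything that cannot be recovered is demoted to KILLED and cleaned up via abortTask. A condensed sketch of that pattern in isolation (the class and method names here are illustrative, not from the source):

import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

final class RecoverOrAbort {
    // Returns true if the prior attempt's output was recovered; otherwise
    // aborts the task so no partial output survives (mirrors needToClean).
    static boolean recoverOrAbort(OutputCommitter committer, TaskAttemptContext tac) {
        try {
            committer.recoverTask(tac); // reattach the prior attempt's output
            return true;
        } catch (Exception e) {
            try {
                committer.abortTask(tac); // best-effort cleanup
            } catch (Exception ignored) {
                // the original logs and ignores cleanup failures
            }
            return false;
        }
    }
}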

Example 23 with Task

use of org.apache.hadoop.mapreduce.v2.app.job.Task in project hadoop by apache.

the class CheckpointAMPreemptionPolicy method preempt.

@Override
public void preempt(Context ctxt, PreemptionMessage preemptionRequests) {
    if (preemptionRequests != null) {
        // handling non-negotiable preemption
        StrictPreemptionContract cStrict = preemptionRequests.getStrictContract();
        if (cStrict != null && cStrict.getContainers() != null && cStrict.getContainers().size() > 0) {
            LOG.info("strict preemption :" + preemptionRequests.getStrictContract().getContainers().size() + " containers to kill");
            // handle strict preemptions. These containers are non-negotiable
            for (PreemptionContainer c : preemptionRequests.getStrictContract().getContainers()) {
                ContainerId reqCont = c.getId();
                TaskAttemptId reqTask = ctxt.getTaskAttempt(reqCont);
                if (reqTask != null) {
                    // ignore requests for preempting containers running maps
                    if (org.apache.hadoop.mapreduce.v2.api.records.TaskType.REDUCE.equals(reqTask.getTaskId().getTaskType())) {
                        toBePreempted.add(reqTask);
                        LOG.info("preempting " + reqCont + " running task:" + reqTask);
                    } else {
                        LOG.info("NOT preempting " + reqCont + " running task:" + reqTask);
                    }
                }
            }
        }
        // handling negotiable preemption
        PreemptionContract cNegot = preemptionRequests.getContract();
        if (cNegot != null && cNegot.getResourceRequest() != null && cNegot.getResourceRequest().size() > 0 && cNegot.getContainers() != null && cNegot.getContainers().size() > 0) {
            LOG.info("negotiable preemption :" + preemptionRequests.getContract().getResourceRequest().size() + " resourceReq, " + preemptionRequests.getContract().getContainers().size() + " containers");
            // handle fungible preemption. Here we only look at the total amount of
            // resources to be preempted and pick enough of our containers to
            // satisfy that. We only support checkpointing for reducers for now.
            List<PreemptionResourceRequest> reqResources = preemptionRequests.getContract().getResourceRequest();
            // compute the total amount of pending preemptions (to be discounted
            // from current request)
            int pendingPreemptionRam = 0;
            int pendingPreemptionCores = 0;
            for (Resource r : pendingFlexiblePreemptions.values()) {
                pendingPreemptionRam += r.getMemorySize();
                pendingPreemptionCores += r.getVirtualCores();
            }
            // discount preemption request based on currently pending preemption
            for (PreemptionResourceRequest rr : reqResources) {
                ResourceRequest reqRsrc = rr.getResourceRequest();
                if (!ResourceRequest.ANY.equals(reqRsrc.getResourceName())) {
                    // For now, only respond to aggregate requests and ignore locality
                    continue;
                }
                LOG.info("ResourceRequest:" + reqRsrc);
                int reqCont = reqRsrc.getNumContainers();
                long reqMem = reqRsrc.getCapability().getMemorySize();
                long totalMemoryToRelease = reqCont * reqMem;
                int reqCores = reqRsrc.getCapability().getVirtualCores();
                int totalCoresToRelease = reqCont * reqCores;
                // discount this request by preemptions already pending
                if (pendingPreemptionRam > 0) {
                    // if this goes negative we simply exit the container loop below
                    totalMemoryToRelease -= pendingPreemptionRam;
                    // decrement pending resources; if zero or negative we will
                    // ignore it while processing the next PreemptionResourceRequest
                    pendingPreemptionRam -= totalMemoryToRelease;
                }
                if (pendingPreemptionCores > 0) {
                    totalCoresToRelease -= pendingPreemptionCores;
                    pendingPreemptionCores -= totalCoresToRelease;
                }
                // reverse order of allocation (for now)
                List<Container> listOfCont = ctxt.getContainers(TaskType.REDUCE);
                Collections.sort(listOfCont, new Comparator<Container>() {

                    @Override
                    public int compare(final Container o1, final Container o2) {
                        return o2.getId().compareTo(o1.getId());
                    }
                });
                // preempt reducers first
                for (Container cont : listOfCont) {
                    if (totalMemoryToRelease <= 0 && totalCoresToRelease <= 0) {
                        break;
                    }
                    TaskAttemptId reduceId = ctxt.getTaskAttempt(cont.getId());
                    int cMem = (int) cont.getResource().getMemorySize();
                    int cCores = cont.getResource().getVirtualCores();
                    if (!toBePreempted.contains(reduceId)) {
                        totalMemoryToRelease -= cMem;
                        totalCoresToRelease -= cCores;
                        toBePreempted.add(reduceId);
                        pendingFlexiblePreemptions.put(reduceId, cont.getResource());
                    }
                    LOG.info("ResourceRequest:" + reqRsrc + " satisfied preempting " + reduceId);
                }
            // if maps were preemptable we would add them to toBePreempted here
            }
        }
    }
}
Also used : TaskAttemptId(org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptId) Resource(org.apache.hadoop.yarn.api.records.Resource) PreemptionContainer(org.apache.hadoop.yarn.api.records.PreemptionContainer) PreemptionResourceRequest(org.apache.hadoop.yarn.api.records.PreemptionResourceRequest) StrictPreemptionContract(org.apache.hadoop.yarn.api.records.StrictPreemptionContract) PreemptionContainer(org.apache.hadoop.yarn.api.records.PreemptionContainer) Container(org.apache.hadoop.yarn.api.records.Container) ContainerId(org.apache.hadoop.yarn.api.records.ContainerId) PreemptionResourceRequest(org.apache.hadoop.yarn.api.records.PreemptionResourceRequest) ResourceRequest(org.apache.hadoop.yarn.api.records.ResourceRequest) PreemptionContract(org.apache.hadoop.yarn.api.records.PreemptionContract) StrictPreemptionContract(org.apache.hadoop.yarn.api.records.StrictPreemptionContract)
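
The discount arithmetic in the middle of this method is easy to misread, so here is the same computation with made-up numbers: a negotiable request for 4 containers of 2048 MB each, with 3072 MB of preemption already pending, leaves 5120 MB still to release from running reducers.

// Standalone sketch of the pending-preemption discount (illustrative values).
public final class PreemptionDiscountDemo {
    public static void main(String[] args) {
        long reqCont = 4, reqMemPerCont = 2048;              // from the ResourceRequest
        long totalMemoryToRelease = reqCont * reqMemPerCont; // 8192 MB requested
        long pendingPreemptionRam = 3072;                    // already being preempted
        if (pendingPreemptionRam > 0) {
            totalMemoryToRelease -= pendingPreemptionRam;    // 5120 MB left to find
        }
        System.out.println(totalMemoryToRelease);            // prints 5120
    }
}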

Example 24 with Task

use of org.apache.hadoop.mapreduce.v2.app.job.Task in project hadoop by apache.

the class DefaultSpeculator method maybeScheduleASpeculation.

private int maybeScheduleASpeculation(TaskType type) {
    int successes = 0;
    long now = clock.getTime();
    ConcurrentMap<JobId, AtomicInteger> containerNeeds = type == TaskType.MAP ? mapContainerNeeds : reduceContainerNeeds;
    for (ConcurrentMap.Entry<JobId, AtomicInteger> jobEntry : containerNeeds.entrySet()) {
        // if this job still needs containers, skip it; speculating now
        // would only consume a container prematurely
        if (jobEntry.getValue().get() > 0) {
            continue;
        }
        int numberSpeculationsAlready = 0;
        int numberRunningTasks = 0;
        // loop through the tasks of this type
        Job job = context.getJob(jobEntry.getKey());
        Map<TaskId, Task> tasks = job.getTasks(type);
        int numberAllowedSpeculativeTasks = (int) Math.max(minimumAllowedSpeculativeTasks, proportionTotalTasksSpeculatable * tasks.size());
        TaskId bestTaskID = null;
        long bestSpeculationValue = -1L;
        // TODO track the tasks that are potentially worth looking at
        for (Map.Entry<TaskId, Task> taskEntry : tasks.entrySet()) {
            long mySpeculationValue = speculationValue(taskEntry.getKey(), now);
            if (mySpeculationValue == ALREADY_SPECULATING) {
                ++numberSpeculationsAlready;
            }
            if (mySpeculationValue != NOT_RUNNING) {
                ++numberRunningTasks;
            }
            if (mySpeculationValue > bestSpeculationValue) {
                bestTaskID = taskEntry.getKey();
                bestSpeculationValue = mySpeculationValue;
            }
        }
        numberAllowedSpeculativeTasks = (int) Math.max(numberAllowedSpeculativeTasks, proportionRunningTasksSpeculatable * numberRunningTasks);
        // If we found a speculation target, fire it off
        if (bestTaskID != null && numberAllowedSpeculativeTasks > numberSpeculationsAlready) {
            addSpeculativeAttempt(bestTaskID);
            ++successes;
        }
    }
    return successes;
}
Also used : Task(org.apache.hadoop.mapreduce.v2.app.job.Task) TaskId(org.apache.hadoop.mapreduce.v2.api.records.TaskId) ConcurrentMap(java.util.concurrent.ConcurrentMap) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) Job(org.apache.hadoop.mapreduce.v2.app.job.Job) ConcurrentMap(java.util.concurrent.ConcurrentMap) Map(java.util.Map) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) JobId(org.apache.hadoop.mapreduce.v2.api.records.JobId)
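
The two Math.max calls implement the cap on concurrent speculations: never fewer than minimumAllowedSpeculativeTasks, scaled up first by a proportion of all tasks of this type and then by a proportion of the tasks currently running. A sketch of that arithmetic with constants of 10, 0.01, and 0.1 (plausible defaults, but an assumption rather than something shown in the snippet):

// Sketch of the speculation cap; the three constants are assumptions.
public final class SpeculationCapDemo {
    public static void main(String[] args) {
        int minimumAllowedSpeculativeTasks = 10;         // assumed default
        double proportionTotalTasksSpeculatable = 0.01;  // assumed default
        double proportionRunningTasksSpeculatable = 0.1; // assumed default
        int totalTasks = 2000, runningTasks = 500;

        int allowed = (int) Math.max(minimumAllowedSpeculativeTasks,
            proportionTotalTasksSpeculatable * totalTasks);     // max(10, 20) = 20
        allowed = (int) Math.max(allowed,
            proportionRunningTasksSpeculatable * runningTasks); // max(20, 50) = 50
        System.out.println(allowed);                            // prints 50
    }
}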

Example 25 with Task

use of org.apache.hadoop.mapreduce.v2.app.job.Task in project hadoop by apache.

the class DefaultSpeculator method statusUpdate.

/**
   * Absorbs one TaskAttemptStatus
   *
   * @param reportedStatus the status report that we got from a task attempt
   *        that we want to fold into the speculation data for this job
   * @param timestamp the time this status corresponds to.  This matters
   *        because statuses contain progress.
   */
protected void statusUpdate(TaskAttemptStatus reportedStatus, long timestamp) {
    String stateString = reportedStatus.taskState.toString();
    TaskAttemptId attemptID = reportedStatus.id;
    TaskId taskID = attemptID.getTaskId();
    Job job = context.getJob(taskID.getJobId());
    if (job == null) {
        return;
    }
    Task task = job.getTask(taskID);
    if (task == null) {
        return;
    }
    estimator.updateAttempt(reportedStatus, timestamp);
    if (stateString.equals(TaskAttemptState.RUNNING.name())) {
        runningTasks.putIfAbsent(taskID, Boolean.TRUE);
    } else {
        runningTasks.remove(taskID, Boolean.TRUE);
        if (!stateString.equals(TaskAttemptState.STARTING.name())) {
            runningTaskAttemptStatistics.remove(attemptID);
        }
    }
}
Also used : Task(org.apache.hadoop.mapreduce.v2.app.job.Task) TaskId(org.apache.hadoop.mapreduce.v2.api.records.TaskId) TaskAttemptId(org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptId) Job(org.apache.hadoop.mapreduce.v2.app.job.Job)
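
The running-task bookkeeping leans on ConcurrentMap semantics: putIfAbsent marks a task as running exactly once, and the two-argument remove only clears the entry while it is still mapped to TRUE, so concurrent updates cannot clobber each other. A tiny self-contained sketch of that pattern:

import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;

public final class RunningTasksDemo {
    public static void main(String[] args) {
        ConcurrentMap<String, Boolean> runningTasks = new ConcurrentHashMap<>();
        runningTasks.putIfAbsent("task_0001", Boolean.TRUE); // attempt reported RUNNING
        // conditional remove: succeeds only while the mapping is still TRUE
        boolean removed = runningTasks.remove("task_0001", Boolean.TRUE);
        System.out.println(removed); // true: the task left the RUNNING set
    }
}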

Aggregations

Task (org.apache.hadoop.mapreduce.v2.app.job.Task): 157 uses
Test (org.junit.Test): 153 uses
Job (org.apache.hadoop.mapreduce.v2.app.job.Job): 150 uses
JobId (org.apache.hadoop.mapreduce.v2.api.records.JobId): 107 uses
TaskAttempt (org.apache.hadoop.mapreduce.v2.app.job.TaskAttempt): 94 uses
TaskId (org.apache.hadoop.mapreduce.v2.api.records.TaskId): 79 uses
TaskAttemptId (org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptId): 73 uses
Configuration (org.apache.hadoop.conf.Configuration): 68 uses
ClientResponse (com.sun.jersey.api.client.ClientResponse): 56 uses
WebResource (com.sun.jersey.api.client.WebResource): 56 uses
TaskAttemptEvent (org.apache.hadoop.mapreduce.v2.app.job.event.TaskAttemptEvent): 52 uses
JSONObject (org.codehaus.jettison.json.JSONObject): 46 uses
AppContext (org.apache.hadoop.mapreduce.v2.app.AppContext): 25 uses
ApplicationId (org.apache.hadoop.yarn.api.records.ApplicationId): 23 uses
Path (org.apache.hadoop.fs.Path): 22 uses
MapTaskAttemptImpl (org.apache.hadoop.mapred.MapTaskAttemptImpl): 20 uses
ContainerId (org.apache.hadoop.yarn.api.records.ContainerId): 20 uses
HashMap (java.util.HashMap): 19 uses
ApplicationAttemptId (org.apache.hadoop.yarn.api.records.ApplicationAttemptId): 19 uses
JobConf (org.apache.hadoop.mapred.JobConf): 16 uses