
Example 1 with TaskAttemptImpl

Use of org.apache.hadoop.mapreduce.v2.app.job.impl.TaskAttemptImpl in project hadoop by apache.

From class TaskImpl, the recover method:

/**
   * Recover a completed task from a previous application attempt
   * @param taskInfo recovered info about the task
   * @param committer the output committer used when recovering task output
   * @param recoverTaskOutput whether to recover task outputs
   * @return state of the task after recovery
   */
private TaskStateInternal recover(TaskInfo taskInfo, OutputCommitter committer, boolean recoverTaskOutput) {
    LOG.info("Recovering task " + taskId + " from prior app attempt, status was " + taskInfo.getTaskStatus());
    scheduledTime = taskInfo.getStartTime();
    sendTaskStartedEvent();
    Collection<TaskAttemptInfo> attemptInfos = taskInfo.getAllTaskAttempts().values();
    if (attemptInfos.size() > 0) {
        metrics.launchedTask(this);
    }
    // recover the attempts for this task in the order they finished
    // so task attempt completion events are ordered properly
    int savedNextAttemptNumber = nextAttemptNumber;
    ArrayList<TaskAttemptInfo> taInfos = new ArrayList<TaskAttemptInfo>(taskInfo.getAllTaskAttempts().values());
    Collections.sort(taInfos, TA_INFO_COMPARATOR);
    for (TaskAttemptInfo taInfo : taInfos) {
        nextAttemptNumber = taInfo.getAttemptId().getId();
        TaskAttemptImpl attempt = addAttempt(Avataar.VIRGIN);
        // handle the recovery inline so attempts complete before task does
        attempt.handle(new TaskAttemptRecoverEvent(attempt.getID(), taInfo, committer, recoverTaskOutput));
        finishedAttempts.add(attempt.getID());
        TaskAttemptCompletionEventStatus taces = null;
        TaskAttemptState attemptState = attempt.getState();
        switch(attemptState) {
            case FAILED:
                taces = TaskAttemptCompletionEventStatus.FAILED;
                break;
            case KILLED:
                taces = TaskAttemptCompletionEventStatus.KILLED;
                break;
            case SUCCEEDED:
                taces = TaskAttemptCompletionEventStatus.SUCCEEDED;
                break;
            default:
                throw new IllegalStateException("Unexpected attempt state during recovery: " + attemptState);
        }
        if (attemptState == TaskAttemptState.FAILED) {
            failedAttempts.add(attempt.getID());
            if (failedAttempts.size() >= maxAttempts) {
                taces = TaskAttemptCompletionEventStatus.TIPFAILED;
            }
        }
        // TODO: this shouldn't be necessary after MAPREDUCE-4330
        if (successfulAttempt == null) {
            handleTaskAttemptCompletion(attempt.getID(), taces);
            if (attemptState == TaskAttemptState.SUCCEEDED) {
                successfulAttempt = attempt.getID();
            }
        }
    }
    nextAttemptNumber = savedNextAttemptNumber;
    TaskStateInternal taskState = TaskStateInternal.valueOf(taskInfo.getTaskStatus());
    switch(taskState) {
        case SUCCEEDED:
            if (successfulAttempt != null) {
                sendTaskSucceededEvents();
            } else {
                LOG.info("Missing successful attempt for task " + taskId + ", recovering as RUNNING");
                // there must have been a fetch failure and the retry wasn't complete
                taskState = TaskStateInternal.RUNNING;
                metrics.runningTask(this);
                addAndScheduleAttempt(Avataar.VIRGIN);
            }
            break;
        case FAILED:
        case KILLED:
            {
                if (taskState == TaskStateInternal.KILLED && attemptInfos.size() == 0) {
                    metrics.endWaitingTask(this);
                }
                TaskFailedEvent tfe = new TaskFailedEvent(taskInfo.getTaskId(),
                    taskInfo.getFinishTime(), taskInfo.getTaskType(),
                    taskInfo.getError(), taskInfo.getTaskStatus(),
                    taskInfo.getFailedDueToAttemptId(), taskInfo.getCounters());
                eventHandler.handle(new JobHistoryEvent(taskId.getJobId(), tfe));
                eventHandler.handle(new JobTaskEvent(taskId, getExternalState(taskState)));
                break;
            }
        default:
            throw new AssertionError("Unexpected recovered task state: " + taskState);
    }
    return taskState;
}
Also used: TaskStateInternal (org.apache.hadoop.mapreduce.v2.app.job.TaskStateInternal), ArrayList (java.util.ArrayList), TaskAttemptCompletionEventStatus (org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptCompletionEventStatus), JobHistoryEvent (org.apache.hadoop.mapreduce.jobhistory.JobHistoryEvent), TaskAttemptState (org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptState), JobTaskEvent (org.apache.hadoop.mapreduce.v2.app.job.event.JobTaskEvent), TaskAttemptInfo (org.apache.hadoop.mapreduce.jobhistory.JobHistoryParser.TaskAttemptInfo), TaskFailedEvent (org.apache.hadoop.mapreduce.jobhistory.TaskFailedEvent), TaskAttemptRecoverEvent (org.apache.hadoop.mapreduce.v2.app.job.event.TaskAttemptRecoverEvent)
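
The recovery loop above sorts attempts with TA_INFO_COMPARATOR, which is referenced but not shown. A minimal sketch consistent with the "order they finished" comment, assuming java.util.Comparator and JobHistoryParser.TaskAttemptInfo are imported, might look like this (an illustrative reconstruction, not necessarily the exact Hadoop implementation):

// Order recovered attempts by their recorded finish time so that task attempt
// completion events are replayed in the order the attempts actually finished.
private static final Comparator<TaskAttemptInfo> TA_INFO_COMPARATOR =
    new Comparator<TaskAttemptInfo>() {
        @Override
        public int compare(TaskAttemptInfo a, TaskAttemptInfo b) {
            return Long.compare(a.getFinishTime(), b.getFinishTime());
        }
    };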

Example 2 with TaskAttemptImpl

Use of org.apache.hadoop.mapreduce.v2.app.job.impl.TaskAttemptImpl in project hadoop by apache.

From class TaskAttemptImpl, the notifyTaskAttemptFailed method:

@SuppressWarnings("unchecked")
private static void notifyTaskAttemptFailed(TaskAttemptImpl taskAttempt) {
    if (taskAttempt.getLaunchTime() == 0) {
        sendJHStartEventForAssignedFailTask(taskAttempt);
    }
    // set the finish time
    taskAttempt.setFinishTime();
    taskAttempt.eventHandler.handle(createJobCounterUpdateEventTAFailed(taskAttempt, false));
    TaskAttemptUnsuccessfulCompletionEvent tauce = createTaskAttemptUnsuccessfulCompletionEvent(taskAttempt, TaskAttemptStateInternal.FAILED);
    taskAttempt.eventHandler.handle(new JobHistoryEvent(taskAttempt.attemptId.getTaskId().getJobId(), tauce));
    taskAttempt.eventHandler.handle(new TaskTAttemptEvent(taskAttempt.attemptId, TaskEventType.T_ATTEMPT_FAILED));
}
Also used: JobHistoryEvent (org.apache.hadoop.mapreduce.jobhistory.JobHistoryEvent), TaskTAttemptEvent (org.apache.hadoop.mapreduce.v2.app.job.event.TaskTAttemptEvent), TaskAttemptUnsuccessfulCompletionEvent (org.apache.hadoop.mapreduce.jobhistory.TaskAttemptUnsuccessfulCompletionEvent)

Example 3 with TaskAttemptImpl

Use of org.apache.hadoop.mapreduce.v2.app.job.impl.TaskAttemptImpl in project hadoop by apache.

From class TaskAttemptImpl, the createJobCounterUpdateEventTAFailed method:

private static JobCounterUpdateEvent createJobCounterUpdateEventTAFailed(TaskAttemptImpl taskAttempt, boolean taskAlreadyCompleted) {
    TaskType taskType = taskAttempt.getID().getTaskId().getTaskType();
    JobCounterUpdateEvent jce = new JobCounterUpdateEvent(taskAttempt.getID().getTaskId().getJobId());
    if (taskType == TaskType.MAP) {
        jce.addCounterUpdate(JobCounter.NUM_FAILED_MAPS, 1);
    } else {
        jce.addCounterUpdate(JobCounter.NUM_FAILED_REDUCES, 1);
    }
    if (!taskAlreadyCompleted) {
        updateMillisCounters(jce, taskAttempt);
    }
    return jce;
}
Also used: TaskType (org.apache.hadoop.mapreduce.v2.api.records.TaskType), JobCounterUpdateEvent (org.apache.hadoop.mapreduce.v2.app.job.event.JobCounterUpdateEvent)
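
Both counter-update helpers delegate to updateMillisCounters, which is not shown in these snippets. A simplified sketch of its core, charging the attempt's wall-clock runtime to the per-task-type millis counter, could look like the following; the real Hadoop method also charges memory- and vcore-weighted variants (e.g. MB_MILLIS_MAPS, VCORES_MILLIS_MAPS), which are omitted here:

// Simplified sketch: charge (finish - launch) to MILLIS_MAPS or MILLIS_REDUCES.
private static void updateMillisCounters(JobCounterUpdateEvent jce, TaskAttemptImpl taskAttempt) {
    TaskType taskType = taskAttempt.getID().getTaskId().getTaskType();
    long duration = taskAttempt.getFinishTime() - taskAttempt.getLaunchTime();
    if (taskType == TaskType.MAP) {
        jce.addCounterUpdate(JobCounter.MILLIS_MAPS, duration);
    } else {
        jce.addCounterUpdate(JobCounter.MILLIS_REDUCES, duration);
    }
}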

Example 4 with TaskAttemptImpl

Use of org.apache.hadoop.mapreduce.v2.app.job.impl.TaskAttemptImpl in project hadoop by apache.

From class TaskAttemptImpl, the createJobCounterUpdateEventTASucceeded method:

private static JobCounterUpdateEvent createJobCounterUpdateEventTASucceeded(TaskAttemptImpl taskAttempt) {
    TaskId taskId = taskAttempt.attemptId.getTaskId();
    JobCounterUpdateEvent jce = new JobCounterUpdateEvent(taskId.getJobId());
    updateMillisCounters(jce, taskAttempt);
    return jce;
}
Also used: TaskId (org.apache.hadoop.mapreduce.v2.api.records.TaskId), JobCounterUpdateEvent (org.apache.hadoop.mapreduce.v2.app.job.event.JobCounterUpdateEvent)
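
For context, such a helper is dispatched through the attempt's event handler, mirroring the failed-case call shown in Example 2; a hedged sketch of the succeeded-case call site (the exact location inside TaskAttemptImpl may differ):

// Publish the counter update to the AM's dispatcher once the attempt succeeds.
taskAttempt.eventHandler.handle(createJobCounterUpdateEventTASucceeded(taskAttempt));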

Example 5 with TaskAttemptImpl

Use of org.apache.hadoop.mapreduce.v2.app.job.impl.TaskAttemptImpl in project hadoop by apache.

From class TestContainerLauncher, the testSlowNM method:

@Test(timeout = 15000)
public void testSlowNM() throws Exception {
    conf = new Configuration();
    int maxAttempts = 1;
    conf.setInt(MRJobConfig.MAP_MAX_ATTEMPTS, maxAttempts);
    conf.setBoolean(MRJobConfig.JOB_UBERTASK_ENABLE, false);
    // set timeout low for the test
    conf.setInt("yarn.rpc.nm-command-timeout", 3000);
    conf.set(YarnConfiguration.IPC_RPC_IMPL, HadoopYarnProtoRPC.class.getName());
    YarnRPC rpc = YarnRPC.create(conf);
    String bindAddr = "localhost:0";
    InetSocketAddress addr = NetUtils.createSocketAddr(bindAddr);
    NMTokenSecretManagerInNM tokenSecretManager = new NMTokenSecretManagerInNM();
    MasterKey masterKey = Records.newRecord(MasterKey.class);
    masterKey.setBytes(ByteBuffer.wrap("key".getBytes()));
    tokenSecretManager.setMasterKey(masterKey);
    conf.set(CommonConfigurationKeysPublic.HADOOP_SECURITY_AUTHENTICATION, "token");
    server = rpc.getServer(ContainerManagementProtocol.class, new DummyContainerManager(), addr, conf, tokenSecretManager, 1);
    server.start();
    MRApp app = new MRAppWithSlowNM(tokenSecretManager);
    try {
        Job job = app.submit(conf);
        app.waitForState(job, JobState.RUNNING);
        Map<TaskId, Task> tasks = job.getTasks();
        Assert.assertEquals("Num tasks is not correct", 1, tasks.size());
        Task task = tasks.values().iterator().next();
        app.waitForState(task, TaskState.SCHEDULED);
        Map<TaskAttemptId, TaskAttempt> attempts = tasks.values().iterator().next().getAttempts();
        Assert.assertEquals("Num attempts is not correct", maxAttempts, attempts.size());
        TaskAttempt attempt = attempts.values().iterator().next();
        app.waitForInternalState((TaskAttemptImpl) attempt, TaskAttemptStateInternal.ASSIGNED);
        app.waitForState(job, JobState.FAILED);
        String diagnostics = attempt.getDiagnostics().toString();
        LOG.info("attempt.getDiagnostics: " + diagnostics);
        Assert.assertTrue(diagnostics.contains("Container launch failed for " + "container_0_0000_01_000000 : "));
        Assert.assertTrue(diagnostics.contains("java.net.SocketTimeoutException: 3000 millis timeout while waiting for channel"));
    } finally {
        server.stop();
        app.stop();
    }
}
Also used : Task(org.apache.hadoop.mapreduce.v2.app.job.Task) TaskId(org.apache.hadoop.mapreduce.v2.api.records.TaskId) Configuration(org.apache.hadoop.conf.Configuration) YarnConfiguration(org.apache.hadoop.yarn.conf.YarnConfiguration) InetSocketAddress(java.net.InetSocketAddress) TaskAttemptId(org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptId) NMTokenSecretManagerInNM(org.apache.hadoop.yarn.server.nodemanager.security.NMTokenSecretManagerInNM) YarnRPC(org.apache.hadoop.yarn.ipc.YarnRPC) HadoopYarnProtoRPC(org.apache.hadoop.yarn.ipc.HadoopYarnProtoRPC) ContainerManagementProtocol(org.apache.hadoop.yarn.api.ContainerManagementProtocol) MasterKey(org.apache.hadoop.yarn.server.api.records.MasterKey) TaskAttempt(org.apache.hadoop.mapreduce.v2.app.job.TaskAttempt) Job(org.apache.hadoop.mapreduce.v2.app.job.Job) MRApp(org.apache.hadoop.mapreduce.v2.app.MRApp) Test(org.junit.Test)

Aggregations

Test (org.junit.Test): 27
MapTaskAttemptImpl (org.apache.hadoop.mapred.MapTaskAttemptImpl): 25
TaskAttemptEvent (org.apache.hadoop.mapreduce.v2.app.job.event.TaskAttemptEvent): 22
TaskId (org.apache.hadoop.mapreduce.v2.api.records.TaskId): 19
TaskSplitMetaInfo (org.apache.hadoop.mapreduce.split.JobSplit.TaskSplitMetaInfo): 17
Path (org.apache.hadoop.fs.Path): 16
JobConf (org.apache.hadoop.mapred.JobConf): 16
JobId (org.apache.hadoop.mapreduce.v2.api.records.JobId): 16
TaskAttemptListener (org.apache.hadoop.mapreduce.v2.app.TaskAttemptListener): 16
ApplicationId (org.apache.hadoop.yarn.api.records.ApplicationId): 16
InetSocketAddress (java.net.InetSocketAddress): 15
Credentials (org.apache.hadoop.security.Credentials): 15
Token (org.apache.hadoop.security.token.Token): 15
TaskAttemptId (org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptId): 14
AppContext (org.apache.hadoop.mapreduce.v2.app.AppContext): 14
ApplicationAttemptId (org.apache.hadoop.yarn.api.records.ApplicationAttemptId): 14
TaskAttemptContainerAssignedEvent (org.apache.hadoop.mapreduce.v2.app.job.event.TaskAttemptContainerAssignedEvent): 12
Container (org.apache.hadoop.yarn.api.records.Container): 12
ContainerId (org.apache.hadoop.yarn.api.records.ContainerId): 12
NodeId (org.apache.hadoop.yarn.api.records.NodeId): 12