use of org.apache.hadoop.mapreduce.v2.app.job.impl.TaskAttemptImpl in project hadoop by apache.
the class TaskImpl method recover.
/**
* Recover a completed task from a previous application attempt.
* Replays every recorded attempt of the task (in the order given by
* TA_INFO_COMPARATOR), re-emits the matching task attempt completion
* events, and then re-emits the task-level events for the recorded
* task status.
* @param taskInfo recovered info about the task
* @param committer output committer handed to each attempt's
*        TaskAttemptRecoverEvent
* @param recoverTaskOutput whether to recover task outputs
* @return state of the task after recovery
* @throws IllegalStateException if a recovered attempt ends up in a state
*         other than FAILED, KILLED or SUCCEEDED
* @throws AssertionError if the recorded task status is not SUCCEEDED,
*         FAILED or KILLED
*/
private TaskStateInternal recover(TaskInfo taskInfo, OutputCommitter committer, boolean recoverTaskOutput) {
LOG.info("Recovering task " + taskId + " from prior app attempt, status was " + taskInfo.getTaskStatus());
// reuse the start time recorded by the previous application attempt
scheduledTime = taskInfo.getStartTime();
sendTaskStartedEvent();
Collection<TaskAttemptInfo> attemptInfos = taskInfo.getAllTaskAttempts().values();
// only count the task as launched if at least one attempt was recorded
if (attemptInfos.size() > 0) {
metrics.launchedTask(this);
}
// recover the attempts for this task in the order they finished
// so task attempt completion events are ordered properly
// nextAttemptNumber is overwritten inside the loop with each recovered
// attempt's id, so remember the current value and restore it afterwards
// NOTE(review): presumably addAttempt() derives the new attempt's id from
// nextAttemptNumber - confirm against addAttempt()
int savedNextAttemptNumber = nextAttemptNumber;
ArrayList<TaskAttemptInfo> taInfos = new ArrayList<TaskAttemptInfo>(taskInfo.getAllTaskAttempts().values());
Collections.sort(taInfos, TA_INFO_COMPARATOR);
for (TaskAttemptInfo taInfo : taInfos) {
nextAttemptNumber = taInfo.getAttemptId().getId();
TaskAttemptImpl attempt = addAttempt(Avataar.VIRGIN);
// handle the recovery inline so attempts complete before task does
attempt.handle(new TaskAttemptRecoverEvent(attempt.getID(), taInfo, committer, recoverTaskOutput));
finishedAttempts.add(attempt.getID());
// translate the recovered attempt state into the completion event status;
// any state other than FAILED, KILLED or SUCCEEDED is rejected
TaskAttemptCompletionEventStatus taces = null;
TaskAttemptState attemptState = attempt.getState();
switch(attemptState) {
case FAILED:
taces = TaskAttemptCompletionEventStatus.FAILED;
break;
case KILLED:
taces = TaskAttemptCompletionEventStatus.KILLED;
break;
case SUCCEEDED:
taces = TaskAttemptCompletionEventStatus.SUCCEEDED;
break;
default:
throw new IllegalStateException("Unexpected attempt state during recovery: " + attemptState);
}
// track failed attempts; once their count reaches maxAttempts the
// completion status reported for this attempt becomes TIPFAILED
if (attemptState == TaskAttemptState.FAILED) {
failedAttempts.add(attempt.getID());
if (failedAttempts.size() >= maxAttempts) {
taces = TaskAttemptCompletionEventStatus.TIPFAILED;
}
}
// TODO: this shouldn't be necessary after MAPREDUCE-4330
// completion events are only reported until the first successful attempt
// has been seen; attempts sorted after it are recovered but not reported
if (successfulAttempt == null) {
handleTaskAttemptCompletion(attempt.getID(), taces);
if (attemptState == TaskAttemptState.SUCCEEDED) {
successfulAttempt = attempt.getID();
}
}
}
// put back the attempt counter that the loop above overwrote
nextAttemptNumber = savedNextAttemptNumber;
// map the recorded task status onto the internal task state
TaskStateInternal taskState = TaskStateInternal.valueOf(taskInfo.getTaskStatus());
switch(taskState) {
case SUCCEEDED:
if (successfulAttempt != null) {
sendTaskSucceededEvents();
} else {
LOG.info("Missing successful attempt for task " + taskId + ", recovering as RUNNING");
// there must have been a fetch failure and the retry wasn't complete
taskState = TaskStateInternal.RUNNING;
metrics.runningTask(this);
addAndScheduleAttempt(Avataar.VIRGIN);
}
break;
case FAILED:
case KILLED:
{
// a task recorded as KILLED without any attempts gets endWaitingTask
// reported to the metrics
if (taskState == TaskStateInternal.KILLED && attemptInfos.size() == 0) {
metrics.endWaitingTask(this);
}
// re-emit the task failure to job history, then send the job a
// JobTaskEvent carrying the task's external state
TaskFailedEvent tfe = new TaskFailedEvent(taskInfo.getTaskId(), taskInfo.getFinishTime(), taskInfo.getTaskType(), taskInfo.getError(), taskInfo.getTaskStatus(), taskInfo.getFailedDueToAttemptId(), taskInfo.getCounters());
eventHandler.handle(new JobHistoryEvent(taskId.getJobId(), tfe));
eventHandler.handle(new JobTaskEvent(taskId, getExternalState(taskState)));
break;
}
default:
throw new java.lang.AssertionError("Unexpected recovered task state: " + taskState);
}
return taskState;
}
use of org.apache.hadoop.mapreduce.v2.app.job.impl.TaskAttemptImpl in project hadoop by apache.
the class TaskAttemptImpl method notifyTaskAttemptFailed.
/**
 * Emits the events that accompany a failed task attempt.
 *
 * <p>In order: optionally a job-history start event (only when the attempt
 * has no launch time recorded), the failed-attempt job counter update, the
 * unsuccessful-completion job history event, and finally a
 * {@code T_ATTEMPT_FAILED} event for the owning task. The attempt's finish
 * time is set before any of the failure events are built.
 *
 * @param taskAttempt the attempt that failed
 */
@SuppressWarnings("unchecked")
private static void notifyTaskAttemptFailed(TaskAttemptImpl taskAttempt) {
  // No launch time recorded yet: send the start event for the assigned-but-failed attempt first.
  final boolean launchTimeUnset = taskAttempt.getLaunchTime() == 0;
  if (launchTimeUnset) {
    sendJHStartEventForAssignedFailTask(taskAttempt);
  }

  // The finish time must be in place before the events below are created.
  taskAttempt.setFinishTime();

  // 1. job counters
  final JobCounterUpdateEvent counterUpdate =
      createJobCounterUpdateEventTAFailed(taskAttempt, false);
  taskAttempt.eventHandler.handle(counterUpdate);

  // 2. job history
  final TaskAttemptUnsuccessfulCompletionEvent failureRecord =
      createTaskAttemptUnsuccessfulCompletionEvent(taskAttempt, TaskAttemptStateInternal.FAILED);
  final JobHistoryEvent historyEvent =
      new JobHistoryEvent(taskAttempt.attemptId.getTaskId().getJobId(), failureRecord);
  taskAttempt.eventHandler.handle(historyEvent);

  // 3. owning task
  final TaskTAttemptEvent taskNotification =
      new TaskTAttemptEvent(taskAttempt.attemptId, TaskEventType.T_ATTEMPT_FAILED);
  taskAttempt.eventHandler.handle(taskNotification);
}
use of org.apache.hadoop.mapreduce.v2.app.job.impl.TaskAttemptImpl in project hadoop by apache.
the class TaskAttemptImpl method createJobCounterUpdateEventTAFailed.
/**
 * Builds the job counter update for a failed task attempt.
 *
 * <p>Increments {@code NUM_FAILED_MAPS} for a map attempt and
 * {@code NUM_FAILED_REDUCES} for any other task type. Unless the task had
 * already completed, the millis counters are added as well via
 * {@code updateMillisCounters}.
 *
 * @param taskAttempt the failed attempt
 * @param taskAlreadyCompleted when {@code true}, the millis counters are
 *        left out of the update
 * @return the counter update event for the attempt's job
 */
private static JobCounterUpdateEvent createJobCounterUpdateEventTAFailed(TaskAttemptImpl taskAttempt, boolean taskAlreadyCompleted) {
  final TaskId owningTask = taskAttempt.getID().getTaskId();
  final JobCounterUpdateEvent update = new JobCounterUpdateEvent(owningTask.getJobId());

  final JobCounter failureCounter = (owningTask.getTaskType() == TaskType.MAP)
      ? JobCounter.NUM_FAILED_MAPS
      : JobCounter.NUM_FAILED_REDUCES;
  update.addCounterUpdate(failureCounter, 1);

  if (taskAlreadyCompleted) {
    return update;
  }
  updateMillisCounters(update, taskAttempt);
  return update;
}
use of org.apache.hadoop.mapreduce.v2.app.job.impl.TaskAttemptImpl in project hadoop by apache.
the class TaskAttemptImpl method createJobCounterUpdateEventTASucceeded.
/**
 * Builds the job counter update for a successful task attempt.
 *
 * <p>The event is addressed to the job that owns the attempt's task and is
 * filled in by {@code updateMillisCounters}.
 *
 * @param taskAttempt the attempt that succeeded
 * @return the counter update event for the attempt's job
 */
private static JobCounterUpdateEvent createJobCounterUpdateEventTASucceeded(TaskAttemptImpl taskAttempt) {
  final JobCounterUpdateEvent update =
      new JobCounterUpdateEvent(taskAttempt.attemptId.getTaskId().getJobId());
  updateMillisCounters(update, taskAttempt);
  return update;
}
use of org.apache.hadoop.mapreduce.v2.app.job.impl.TaskAttemptImpl in project hadoop by apache.
the class TestContainerLauncher method testSlowNM.
/**
 * Runs a single-attempt job against an {@code MRAppWithSlowNM} with the
 * NM command timeout set to 3000 ms, and checks that the job fails and that
 * the attempt's diagnostics report the container launch failure together
 * with the socket timeout.
 */
@Test(timeout = 15000)
public void testSlowNM() throws Exception {
  conf = new Configuration();
  int maxAttempts = 1;
  conf.setInt(MRJobConfig.MAP_MAX_ATTEMPTS, maxAttempts);
  conf.setBoolean(MRJobConfig.JOB_UBERTASK_ENABLE, false);
  // keep the NM command timeout short so the test finishes quickly
  conf.setInt("yarn.rpc.nm-command-timeout", 3000);
  conf.set(YarnConfiguration.IPC_RPC_IMPL, HadoopYarnProtoRPC.class.getName());
  YarnRPC yarnRpc = YarnRPC.create(conf);
  InetSocketAddress listenAddress = NetUtils.createSocketAddr("localhost:0");

  // token secret manager with a fixed master key
  NMTokenSecretManagerInNM nmTokenSecrets = new NMTokenSecretManagerInNM();
  MasterKey key = Records.newRecord(MasterKey.class);
  key.setBytes(ByteBuffer.wrap("key".getBytes()));
  nmTokenSecrets.setMasterKey(key);
  conf.set(CommonConfigurationKeysPublic.HADOOP_SECURITY_AUTHENTICATION, "token");

  server = yarnRpc.getServer(ContainerManagementProtocol.class, new DummyContainerManager(), listenAddress, conf, nmTokenSecrets, 1);
  server.start();

  MRApp mrApp = new MRAppWithSlowNM(nmTokenSecrets);
  try {
    Job submittedJob = mrApp.submit(conf);
    mrApp.waitForState(submittedJob, JobState.RUNNING);

    Map<TaskId, Task> jobTasks = submittedJob.getTasks();
    Assert.assertEquals("Num tasks is not correct", 1, jobTasks.size());
    Task onlyTask = jobTasks.values().iterator().next();
    mrApp.waitForState(onlyTask, TaskState.SCHEDULED);

    Map<TaskAttemptId, TaskAttempt> taskAttempts = onlyTask.getAttempts();
    Assert.assertEquals("Num attempts is not correct", maxAttempts, taskAttempts.size());
    TaskAttempt onlyAttempt = taskAttempts.values().iterator().next();
    mrApp.waitForInternalState((TaskAttemptImpl) onlyAttempt, TaskAttemptStateInternal.ASSIGNED);

    // with a single allowed attempt, the launch failure fails the whole job
    mrApp.waitForState(submittedJob, JobState.FAILED);

    String attemptDiagnostics = onlyAttempt.getDiagnostics().toString();
    LOG.info("attempt.getDiagnostics: " + attemptDiagnostics);
    String expectedLaunchFailure = "Container launch failed for " + "container_0_0000_01_000000 : ";
    String expectedTimeout = "java.net.SocketTimeoutException: 3000 millis timeout while waiting for channel";
    Assert.assertTrue(attemptDiagnostics.contains(expectedLaunchFailure));
    Assert.assertTrue(attemptDiagnostics.contains(expectedTimeout));
  } finally {
    server.stop();
    mrApp.stop();
  }
}
Aggregations