Use of org.apache.hadoop.mapreduce.v2.app.job.event.JobTaskEvent in project hadoop by apache.
The class TaskImpl, method recover.
/**
 * Recover a completed task from a previous application attempt
 * @param taskInfo recovered info about the task
 * @param committer the output committer used when recovering task output
 * @param recoverTaskOutput whether to recover task outputs
 * @return state of the task after recovery
 */
private TaskStateInternal recover(TaskInfo taskInfo,
    OutputCommitter committer, boolean recoverTaskOutput) {
  LOG.info("Recovering task " + taskId
      + " from prior app attempt, status was " + taskInfo.getTaskStatus());

  scheduledTime = taskInfo.getStartTime();
  sendTaskStartedEvent();
  Collection<TaskAttemptInfo> attemptInfos =
      taskInfo.getAllTaskAttempts().values();

  if (attemptInfos.size() > 0) {
    metrics.launchedTask(this);
  }

  // recover the attempts for this task in the order they finished
  // so task attempt completion events are ordered properly
  int savedNextAttemptNumber = nextAttemptNumber;
  ArrayList<TaskAttemptInfo> taInfos =
      new ArrayList<TaskAttemptInfo>(taskInfo.getAllTaskAttempts().values());
  Collections.sort(taInfos, TA_INFO_COMPARATOR);
  for (TaskAttemptInfo taInfo : taInfos) {
    nextAttemptNumber = taInfo.getAttemptId().getId();
    TaskAttemptImpl attempt = addAttempt(Avataar.VIRGIN);
    // handle the recovery inline so attempts complete before task does
    attempt.handle(new TaskAttemptRecoverEvent(attempt.getID(), taInfo,
        committer, recoverTaskOutput));
    finishedAttempts.add(attempt.getID());
    TaskAttemptCompletionEventStatus taces = null;
    TaskAttemptState attemptState = attempt.getState();
    switch (attemptState) {
    case FAILED:
      taces = TaskAttemptCompletionEventStatus.FAILED;
      break;
    case KILLED:
      taces = TaskAttemptCompletionEventStatus.KILLED;
      break;
    case SUCCEEDED:
      taces = TaskAttemptCompletionEventStatus.SUCCEEDED;
      break;
    default:
      throw new IllegalStateException(
          "Unexpected attempt state during recovery: " + attemptState);
    }
    if (attemptState == TaskAttemptState.FAILED) {
      failedAttempts.add(attempt.getID());
      if (failedAttempts.size() >= maxAttempts) {
        taces = TaskAttemptCompletionEventStatus.TIPFAILED;
      }
    }

    // TODO: this shouldn't be necessary after MAPREDUCE-4330
    if (successfulAttempt == null) {
      handleTaskAttemptCompletion(attempt.getID(), taces);
      if (attemptState == TaskAttemptState.SUCCEEDED) {
        successfulAttempt = attempt.getID();
      }
    }
  }
  nextAttemptNumber = savedNextAttemptNumber;

  TaskStateInternal taskState =
      TaskStateInternal.valueOf(taskInfo.getTaskStatus());
  switch (taskState) {
  case SUCCEEDED:
    if (successfulAttempt != null) {
      sendTaskSucceededEvents();
    } else {
      LOG.info("Missing successful attempt for task " + taskId
          + ", recovering as RUNNING");
      // there must have been a fetch failure and the retry wasn't complete
      taskState = TaskStateInternal.RUNNING;
      metrics.runningTask(this);
      addAndScheduleAttempt(Avataar.VIRGIN);
    }
    break;
  case FAILED:
  case KILLED: {
    if (taskState == TaskStateInternal.KILLED && attemptInfos.size() == 0) {
      metrics.endWaitingTask(this);
    }
    TaskFailedEvent tfe = new TaskFailedEvent(taskInfo.getTaskId(),
        taskInfo.getFinishTime(), taskInfo.getTaskType(),
        taskInfo.getError(), taskInfo.getTaskStatus(),
        taskInfo.getFailedDueToAttemptId(), taskInfo.getCounters());
    eventHandler.handle(new JobHistoryEvent(taskId.getJobId(), tfe));
    eventHandler.handle(
        new JobTaskEvent(taskId, getExternalState(taskState)));
    break;
  }
  default:
    throw new java.lang.AssertionError(
        "Unexpected recovered task state: " + taskState);
  }

  return taskState;
}
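Two details of this method are worth noting: attempts are replayed inline, so the task observes their terminal states synchronously, and they are first sorted with TA_INFO_COMPARATOR. Consistent with the "order they finished" comment, a finish-time comparator along the following lines would produce that ordering (a sketch; the exact comparator body in TaskImpl is an assumption here):

private static final Comparator<TaskAttemptInfo> TA_INFO_COMPARATOR =
    new Comparator<TaskAttemptInfo>() {
      @Override
      public int compare(TaskAttemptInfo a, TaskAttemptInfo b) {
        // order attempts by finish time so completion events are
        // replayed in the same order they originally occurred
        long diff = a.getFinishTime() - b.getFinishTime();
        return diff == 0 ? 0 : (diff < 0 ? -1 : 1);
      }
    };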
Use of org.apache.hadoop.mapreduce.v2.app.job.event.JobTaskEvent in project hadoop by apache.
The class TestJobImpl, method completeJobTasks.
private static void completeJobTasks(JobImpl job) {
  // complete the map tasks and the reduce tasks so we start committing
  int numMaps = job.getTotalMaps();
  for (int i = 0; i < numMaps; ++i) {
    job.handle(new JobTaskEvent(
        MRBuilderUtils.newTaskId(job.getID(), 1, TaskType.MAP),
        TaskState.SUCCEEDED));
    Assert.assertEquals(JobState.RUNNING, job.getState());
  }
  int numReduces = job.getTotalReduces();
  for (int i = 0; i < numReduces; ++i) {
    job.handle(new JobTaskEvent(
        MRBuilderUtils.newTaskId(job.getID(), 1, TaskType.REDUCE),
        TaskState.SUCCEEDED));
    Assert.assertEquals(JobState.RUNNING, job.getState());
  }
}
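As these call sites show, JobTaskEvent is a simple carrier of a TaskId plus the terminal TaskState the task reached. A minimal sketch of such an event class, consistent with the usages in this section (the real class lives in org.apache.hadoop.mapreduce.v2.app.job.event and extends JobEvent; field names here are illustrative):

public class JobTaskEvent extends JobEvent {
  private final TaskId taskID;       // the task that reached a terminal state
  private final TaskState taskState; // SUCCEEDED, FAILED, or KILLED

  public JobTaskEvent(TaskId taskID, TaskState taskState) {
    // routed to the job's state machine as a task-completed event
    super(taskID.getJobId(), JobEventType.JOB_TASK_COMPLETED);
    this.taskID = taskID;
    this.taskState = taskState;
  }

  public TaskId getTaskID() {
    return taskID;
  }

  public TaskState getState() {
    return taskState;
  }
}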
Use of org.apache.hadoop.mapreduce.v2.app.job.event.JobTaskEvent in project hadoop by apache.
The class TestJobImpl, method testUnusableNodeTransition.
@Test(timeout = 20000)
public void testUnusableNodeTransition() throws Exception {
  Configuration conf = new Configuration();
  conf.set(MRJobConfig.MR_AM_STAGING_DIR, stagingDir);
  conf.setInt(MRJobConfig.NUM_REDUCES, 1);
  DrainDispatcher dispatcher = new DrainDispatcher();
  dispatcher.init(conf);
  dispatcher.start();
  CyclicBarrier syncBarrier = new CyclicBarrier(2);
  OutputCommitter committer = new TestingOutputCommitter(syncBarrier, true);
  CommitterEventHandler commitHandler =
      createCommitterEventHandler(dispatcher, committer);
  commitHandler.init(conf);
  commitHandler.start();
  final JobImpl job = createRunningStubbedJob(conf, dispatcher, 2, null);
  // add a special task event handler to put the task back to running in case
  // of task rescheduling/killing
  EventHandler<TaskAttemptEvent> taskAttemptEventHandler =
      new EventHandler<TaskAttemptEvent>() {

        @Override
        public void handle(TaskAttemptEvent event) {
          if (event.getType() == TaskAttemptEventType.TA_KILL) {
            job.decrementSucceededMapperCount();
          }
        }
      };
  dispatcher.register(TaskAttemptEventType.class, taskAttemptEventHandler);
  // replace the tasks with spied versions to return the right attempts
  Map<TaskId, Task> spiedTasks = new HashMap<TaskId, Task>();
  List<NodeReport> nodeReports = new ArrayList<NodeReport>();
  Map<NodeReport, TaskId> nodeReportsToTaskIds =
      new HashMap<NodeReport, TaskId>();
  for (Map.Entry<TaskId, Task> e : job.tasks.entrySet()) {
    TaskId taskId = e.getKey();
    Task task = e.getValue();
    if (taskId.getTaskType() == TaskType.MAP) {
      // add an attempt to the task to simulate nodes
      NodeId nodeId = mock(NodeId.class);
      TaskAttempt attempt = mock(TaskAttempt.class);
      when(attempt.getNodeId()).thenReturn(nodeId);
      TaskAttemptId attemptId = MRBuilderUtils.newTaskAttemptId(taskId, 0);
      when(attempt.getID()).thenReturn(attemptId);
      // create a spied task
      Task spied = spy(task);
      doReturn(attempt).when(spied).getAttempt(any(TaskAttemptId.class));
      spiedTasks.put(taskId, spied);
      // create a NodeReport based on the node id
      NodeReport report = mock(NodeReport.class);
      when(report.getNodeState()).thenReturn(NodeState.UNHEALTHY);
      when(report.getNodeId()).thenReturn(nodeId);
      nodeReports.add(report);
      nodeReportsToTaskIds.put(report, taskId);
    }
  }
  // replace the tasks with the spied tasks
  job.tasks.putAll(spiedTasks);
  // complete all mappers first
  for (TaskId taskId : job.tasks.keySet()) {
    if (taskId.getTaskType() == TaskType.MAP) {
      // generate a task attempt completed event first to populate the
      // nodes-to-succeeded-attempts map
      TaskAttemptCompletionEvent tce =
          Records.newRecord(TaskAttemptCompletionEvent.class);
      TaskAttemptId attemptId = MRBuilderUtils.newTaskAttemptId(taskId, 0);
      tce.setAttemptId(attemptId);
      tce.setStatus(TaskAttemptCompletionEventStatus.SUCCEEDED);
      job.handle(new JobTaskAttemptCompletedEvent(tce));
      // complete the task itself
      job.handle(new JobTaskEvent(taskId, TaskState.SUCCEEDED));
      Assert.assertEquals(JobState.RUNNING, job.getState());
    }
  }
  // add an event for a node transition
  NodeReport firstMapperNodeReport = nodeReports.get(0);
  NodeReport secondMapperNodeReport = nodeReports.get(1);
  job.handle(new JobUpdatedNodesEvent(job.getID(),
      Collections.singletonList(firstMapperNodeReport)));
  dispatcher.await();
  // complete the reducer
  for (TaskId taskId : job.tasks.keySet()) {
    if (taskId.getTaskType() == TaskType.REDUCE) {
      job.handle(new JobTaskEvent(taskId, TaskState.SUCCEEDED));
    }
  }
  // add another event for a node transition for the other mapper
  // this should not trigger rescheduling
  job.handle(new JobUpdatedNodesEvent(job.getID(),
      Collections.singletonList(secondMapperNodeReport)));
  // complete the first mapper that was rescheduled
  TaskId firstMapper = nodeReportsToTaskIds.get(firstMapperNodeReport);
  job.handle(new JobTaskEvent(firstMapper, TaskState.SUCCEEDED));
  // verify the state is moving to committing
  assertJobState(job, JobStateInternal.COMMITTING);
  // let the committer complete and verify the job succeeds
  syncBarrier.await();
  assertJobState(job, JobStateInternal.SUCCEEDED);
  dispatcher.stop();
  commitHandler.stop();
}
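The test leans on DrainDispatcher.await() to make event delivery deterministic before asserting on job state. A condensed sketch of that pattern, assuming the test-scope org.apache.hadoop.yarn.event.DrainDispatcher (someEvent is a placeholder, not a real event from this test):

DrainDispatcher dispatcher = new DrainDispatcher();
dispatcher.init(new Configuration());
dispatcher.start();
dispatcher.getEventHandler().handle(someEvent); // enqueued asynchronously
dispatcher.await(); // blocks until the queue drains, so the effects of
                    // someEvent are visible to any assertion that follows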
Use of org.apache.hadoop.mapreduce.v2.app.job.event.JobTaskEvent in project hadoop by apache.
The class TaskImpl, method sendTaskSucceededEvents.
private void sendTaskSucceededEvents() {
  eventHandler.handle(new JobTaskEvent(taskId, TaskState.SUCCEEDED));
  LOG.info("Task succeeded with attempt " + successfulAttempt);
  if (historyTaskStartGenerated) {
    TaskFinishedEvent tfe =
        createTaskFinishedEvent(this, TaskStateInternal.SUCCEEDED);
    eventHandler.handle(new JobHistoryEvent(taskId.getJobId(), tfe));
  }
}
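On the receiving side, the JobTaskEvent dispatched here is routed to whatever handler is registered for JobEventType; in the real MRAppMaster that handler is the job's own state machine. A hedged sketch of observing such events through a YARN AsyncDispatcher (this is illustrative wiring, not the actual MRAppMaster setup):

AsyncDispatcher dispatcher = new AsyncDispatcher();
dispatcher.register(JobEventType.class, new EventHandler<JobEvent>() {
  @Override
  public void handle(JobEvent event) {
    if (event.getType() == JobEventType.JOB_TASK_COMPLETED) {
      // a task reached a terminal state; inspect which one and how
      JobTaskEvent jte = (JobTaskEvent) event;
      System.out.println("Task " + jte.getTaskID()
          + " completed in state " + jte.getState());
    }
  }
});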
Use of org.apache.hadoop.mapreduce.v2.app.job.event.JobTaskEvent in project hadoop by apache.
The class TestRecovery, method recoveryChecker.
private void recoveryChecker(MapTaskImpl checkTask, TaskState finalState,
    Map<TaskAttemptID, TaskAttemptState> finalAttemptStates,
    ArgumentCaptor<Event> arg, List<EventType> expectedJobHistoryEvents,
    long expectedMapLaunches, long expectedFailedMaps) {
  assertEquals("Final State of Task", finalState, checkTask.getState());
  Map<TaskAttemptId, TaskAttempt> recoveredAttempts =
      checkTask.getAttempts();
  assertEquals("Expected Number of Task Attempts",
      finalAttemptStates.size(), recoveredAttempts.size());
  for (TaskAttemptID taID : finalAttemptStates.keySet()) {
    assertEquals("Expected Task Attempt State",
        finalAttemptStates.get(taID),
        recoveredAttempts.get(TypeConverter.toYarn(taID)).getState());
  }
  Iterator<Event> ie = arg.getAllValues().iterator();
  int eventNum = 0;
  long totalLaunchedMaps = 0;
  long totalFailedMaps = 0;
  boolean jobTaskEventReceived = false;
  while (ie.hasNext()) {
    Object current = ie.next();
    ++eventNum;
    LOG.info(eventNum + " " + current.getClass().getName());
    if (current instanceof JobHistoryEvent) {
      JobHistoryEvent jhe = (JobHistoryEvent) current;
      LOG.info(expectedJobHistoryEvents.get(0).toString() + " "
          + jhe.getHistoryEvent().getEventType().toString() + " "
          + jhe.getJobID());
      assertEquals(expectedJobHistoryEvents.get(0),
          jhe.getHistoryEvent().getEventType());
      expectedJobHistoryEvents.remove(0);
    } else if (current instanceof JobCounterUpdateEvent) {
      JobCounterUpdateEvent jcue = (JobCounterUpdateEvent) current;
      // counter updates may be empty during TaskAttempt recovery,
      // so check that first
      boolean containsUpdates = jcue.getCounterUpdates().size() > 0;
      if (containsUpdates) {
        LOG.info("JobCounterUpdateEvent "
            + jcue.getCounterUpdates().get(0).getCounterKey() + " "
            + jcue.getCounterUpdates().get(0).getIncrementValue());
        if (jcue.getCounterUpdates().get(0).getCounterKey()
            == JobCounter.NUM_FAILED_MAPS) {
          totalFailedMaps +=
              jcue.getCounterUpdates().get(0).getIncrementValue();
        } else if (jcue.getCounterUpdates().get(0).getCounterKey()
            == JobCounter.TOTAL_LAUNCHED_MAPS) {
          totalLaunchedMaps +=
              jcue.getCounterUpdates().get(0).getIncrementValue();
        }
      }
    } else if (current instanceof JobTaskEvent) {
      JobTaskEvent jte = (JobTaskEvent) current;
      assertEquals(finalState, jte.getState());
      jobTaskEventReceived = true;
    }
  }
  assertTrue(jobTaskEventReceived || (finalState == TaskState.RUNNING));
  assertEquals("Did not process all expected JobHistoryEvents",
      0, expectedJobHistoryEvents.size());
  assertEquals("Expected Map Launches",
      expectedMapLaunches, totalLaunchedMaps);
  assertEquals("Expected Failed Maps",
      expectedFailedMaps, totalFailedMaps);
}
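recoveryChecker is handed an ArgumentCaptor<Event> that the rest of the test has already populated. A hypothetical Mockito setup for such a captor (the names mockEventHandler and arg are illustrative, not taken from TestRecovery):

// Capture every event the app master dispatched during recovery.
EventHandler<Event> mockEventHandler = mock(EventHandler.class);
ArgumentCaptor<Event> arg = ArgumentCaptor.forClass(Event.class);
// ... run the recovery under test so events flow through mockEventHandler ...
verify(mockEventHandler, atLeast(1)).handle(arg.capture());
List<Event> events = arg.getAllValues(); // the list recoveryChecker iterates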