use of org.apache.hadoop.mapreduce.v2.app.job.event.TaskAttemptEvent in project hadoop by apache.
the class TaskAttemptImpl method handle.
@SuppressWarnings("unchecked")
@Override
public void handle(TaskAttemptEvent event) {
if (LOG.isDebugEnabled()) {
LOG.debug("Processing " + event.getTaskAttemptID() + " of type " + event.getType());
}
writeLock.lock();
try {
final TaskAttemptStateInternal oldState = getInternalState();
try {
stateMachine.doTransition(event.getType(), event);
} catch (InvalidStateTransitionException e) {
LOG.error("Can't handle this event at current state for " + this.attemptId, e);
eventHandler.handle(new JobDiagnosticsUpdateEvent(this.attemptId.getTaskId().getJobId(), "Invalid event " + event.getType() + " on TaskAttempt " + this.attemptId));
eventHandler.handle(new JobEvent(this.attemptId.getTaskId().getJobId(), JobEventType.INTERNAL_ERROR));
}
if (oldState != getInternalState()) {
if (getInternalState() == TaskAttemptStateInternal.FAILED) {
String nodeId = null == this.container ? "Not-assigned" : this.container.getNodeId().toString();
LOG.info(attemptId + " transitioned from state " + oldState + " to " + getInternalState() + ", event type is " + event.getType() + " and nodeId=" + nodeId);
} else {
LOG.info(attemptId + " TaskAttempt Transitioned from " + oldState + " to " + getInternalState());
}
}
} finally {
writeLock.unlock();
}
}
use of org.apache.hadoop.mapreduce.v2.app.job.event.TaskAttemptEvent in project hadoop by apache.
the class TestLocalContainerLauncher method testKillJob.
@SuppressWarnings("rawtypes")
@Test(timeout = 10000)
public void testKillJob() throws Exception {
JobConf conf = new JobConf();
AppContext context = mock(AppContext.class);
// a simple event handler solely to detect the container cleaned event
final CountDownLatch isDone = new CountDownLatch(1);
EventHandler<Event> handler = new EventHandler<Event>() {
@Override
public void handle(Event event) {
LOG.info("handling event " + event.getClass() + " with type " + event.getType());
if (event instanceof TaskAttemptEvent) {
if (event.getType() == TaskAttemptEventType.TA_CONTAINER_CLEANED) {
isDone.countDown();
}
}
}
};
when(context.getEventHandler()).thenReturn(handler);
// create and start the launcher
LocalContainerLauncher launcher = new LocalContainerLauncher(context, mock(TaskUmbilicalProtocol.class));
launcher.init(conf);
launcher.start();
// create mocked job, task, and task attempt
// a single-mapper job
JobId jobId = MRBuilderUtils.newJobId(System.currentTimeMillis(), 1, 1);
TaskId taskId = MRBuilderUtils.newTaskId(jobId, 1, TaskType.MAP);
TaskAttemptId taId = MRBuilderUtils.newTaskAttemptId(taskId, 0);
Job job = mock(Job.class);
when(job.getTotalMaps()).thenReturn(1);
when(job.getTotalReduces()).thenReturn(0);
Map<JobId, Job> jobs = new HashMap<JobId, Job>();
jobs.put(jobId, job);
// app context returns the one and only job
when(context.getAllJobs()).thenReturn(jobs);
org.apache.hadoop.mapreduce.v2.app.job.Task ytask = mock(org.apache.hadoop.mapreduce.v2.app.job.Task.class);
when(ytask.getType()).thenReturn(TaskType.MAP);
when(job.getTask(taskId)).thenReturn(ytask);
// create a sleeping mapper that runs beyond the test timeout
MapTask mapTask = mock(MapTask.class);
when(mapTask.isMapOrReduce()).thenReturn(true);
when(mapTask.isMapTask()).thenReturn(true);
TaskAttemptID taskID = TypeConverter.fromYarn(taId);
when(mapTask.getTaskID()).thenReturn(taskID);
when(mapTask.getJobID()).thenReturn(taskID.getJobID());
doAnswer(new Answer<Void>() {
@Override
public Void answer(InvocationOnMock invocation) throws Throwable {
// sleep for a long time
LOG.info("sleeping for 5 minutes...");
Thread.sleep(5 * 60 * 1000);
return null;
}
}).when(mapTask).run(isA(JobConf.class), isA(TaskUmbilicalProtocol.class));
// pump in a task attempt launch event
ContainerLauncherEvent launchEvent = new ContainerRemoteLaunchEvent(taId, null, createMockContainer(), mapTask);
launcher.handle(launchEvent);
Thread.sleep(200);
// now pump in a container clean-up event
ContainerLauncherEvent cleanupEvent = new ContainerLauncherEvent(taId, null, null, null, ContainerLauncher.EventType.CONTAINER_REMOTE_CLEANUP);
launcher.handle(cleanupEvent);
// wait for the event to fire: this should be received promptly
isDone.await();
launcher.close();
}
use of org.apache.hadoop.mapreduce.v2.app.job.event.TaskAttemptEvent in project hadoop by apache.
the class TestFetchFailure method testFetchFailureMultipleReduces.
@Test
public void testFetchFailureMultipleReduces() throws Exception {
MRApp app = new MRApp(1, 3, false, this.getClass().getName(), true);
Configuration conf = new Configuration();
// map -> reduce -> fetch-failure -> map retry is incompatible with
// sequential, single-task-attempt approach in uber-AM, so disable:
conf.setBoolean(MRJobConfig.JOB_UBERTASK_ENABLE, false);
Job job = app.submit(conf);
app.waitForState(job, JobState.RUNNING);
//all maps would be running
Assert.assertEquals("Num tasks not correct", 4, job.getTasks().size());
Iterator<Task> it = job.getTasks().values().iterator();
Task mapTask = it.next();
Task reduceTask = it.next();
Task reduceTask2 = it.next();
Task reduceTask3 = it.next();
//wait for Task state move to RUNNING
app.waitForState(mapTask, TaskState.RUNNING);
TaskAttempt mapAttempt1 = mapTask.getAttempts().values().iterator().next();
app.waitForState(mapAttempt1, TaskAttemptState.RUNNING);
//send the done signal to the map attempt
app.getContext().getEventHandler().handle(new TaskAttemptEvent(mapAttempt1.getID(), TaskAttemptEventType.TA_DONE));
// wait for map success
app.waitForState(mapTask, TaskState.SUCCEEDED);
TaskAttemptCompletionEvent[] events = job.getTaskAttemptCompletionEvents(0, 100);
Assert.assertEquals("Num completion events not correct", 1, events.length);
Assert.assertEquals("Event status not correct", TaskAttemptCompletionEventStatus.SUCCEEDED, events[0].getStatus());
// wait for reduce to start running
app.waitForState(reduceTask, TaskState.RUNNING);
app.waitForState(reduceTask2, TaskState.RUNNING);
app.waitForState(reduceTask3, TaskState.RUNNING);
TaskAttempt reduceAttempt = reduceTask.getAttempts().values().iterator().next();
app.waitForState(reduceAttempt, TaskAttemptState.RUNNING);
updateStatus(app, reduceAttempt, Phase.SHUFFLE);
TaskAttempt reduceAttempt2 = reduceTask2.getAttempts().values().iterator().next();
app.waitForState(reduceAttempt2, TaskAttemptState.RUNNING);
updateStatus(app, reduceAttempt2, Phase.SHUFFLE);
TaskAttempt reduceAttempt3 = reduceTask3.getAttempts().values().iterator().next();
app.waitForState(reduceAttempt3, TaskAttemptState.RUNNING);
updateStatus(app, reduceAttempt3, Phase.SHUFFLE);
//send 2 fetch failures from reduce to prepare for map re execution
sendFetchFailure(app, reduceAttempt, mapAttempt1, "host1");
sendFetchFailure(app, reduceAttempt2, mapAttempt1, "host2");
//We should not re-launch the map task yet
assertEquals(TaskState.SUCCEEDED, mapTask.getState());
updateStatus(app, reduceAttempt2, Phase.REDUCE);
updateStatus(app, reduceAttempt3, Phase.REDUCE);
//send 3rd fetch failures from reduce to trigger map re execution
sendFetchFailure(app, reduceAttempt3, mapAttempt1, "host3");
//wait for map Task state move back to RUNNING
app.waitForState(mapTask, TaskState.RUNNING);
//map attempt must have become FAILED
Assert.assertEquals("Map TaskAttempt state not correct", TaskAttemptState.FAILED, mapAttempt1.getState());
Assert.assertEquals(mapAttempt1.getDiagnostics().get(0), "Too many fetch failures. Failing the attempt. " + "Last failure reported by " + reduceAttempt3.getID().toString() + " from host host3");
Assert.assertEquals("Num attempts in Map Task not correct", 2, mapTask.getAttempts().size());
Iterator<TaskAttempt> atIt = mapTask.getAttempts().values().iterator();
atIt.next();
TaskAttempt mapAttempt2 = atIt.next();
app.waitForState(mapAttempt2, TaskAttemptState.RUNNING);
//send the done signal to the second map attempt
app.getContext().getEventHandler().handle(new TaskAttemptEvent(mapAttempt2.getID(), TaskAttemptEventType.TA_DONE));
// wait for map success
app.waitForState(mapTask, TaskState.SUCCEEDED);
//send done to reduce
app.getContext().getEventHandler().handle(new TaskAttemptEvent(reduceAttempt.getID(), TaskAttemptEventType.TA_DONE));
//send done to reduce
app.getContext().getEventHandler().handle(new TaskAttemptEvent(reduceAttempt2.getID(), TaskAttemptEventType.TA_DONE));
//send done to reduce
app.getContext().getEventHandler().handle(new TaskAttemptEvent(reduceAttempt3.getID(), TaskAttemptEventType.TA_DONE));
app.waitForState(job, JobState.SUCCEEDED);
//previous completion event now becomes obsolete
Assert.assertEquals("Event status not correct", TaskAttemptCompletionEventStatus.OBSOLETE, events[0].getStatus());
events = job.getTaskAttemptCompletionEvents(0, 100);
Assert.assertEquals("Num completion events not correct", 6, events.length);
Assert.assertEquals("Event map attempt id not correct", mapAttempt1.getID(), events[0].getAttemptId());
Assert.assertEquals("Event map attempt id not correct", mapAttempt1.getID(), events[1].getAttemptId());
Assert.assertEquals("Event map attempt id not correct", mapAttempt2.getID(), events[2].getAttemptId());
Assert.assertEquals("Event reduce attempt id not correct", reduceAttempt.getID(), events[3].getAttemptId());
Assert.assertEquals("Event status not correct for map attempt1", TaskAttemptCompletionEventStatus.OBSOLETE, events[0].getStatus());
Assert.assertEquals("Event status not correct for map attempt1", TaskAttemptCompletionEventStatus.FAILED, events[1].getStatus());
Assert.assertEquals("Event status not correct for map attempt2", TaskAttemptCompletionEventStatus.SUCCEEDED, events[2].getStatus());
Assert.assertEquals("Event status not correct for reduce attempt1", TaskAttemptCompletionEventStatus.SUCCEEDED, events[3].getStatus());
TaskCompletionEvent[] mapEvents = job.getMapAttemptCompletionEvents(0, 2);
TaskCompletionEvent[] convertedEvents = TypeConverter.fromYarn(events);
Assert.assertEquals("Incorrect number of map events", 2, mapEvents.length);
Assert.assertArrayEquals("Unexpected map events", Arrays.copyOfRange(convertedEvents, 0, 2), mapEvents);
mapEvents = job.getMapAttemptCompletionEvents(2, 200);
Assert.assertEquals("Incorrect number of map events", 1, mapEvents.length);
Assert.assertEquals("Unexpected map event", convertedEvents[2], mapEvents[0]);
}
use of org.apache.hadoop.mapreduce.v2.app.job.event.TaskAttemptEvent in project hadoop by apache.
the class TestMRClientService method test.
@Test
public void test() throws Exception {
MRAppWithClientService app = new MRAppWithClientService(1, 0, false);
Configuration conf = new Configuration();
Job job = app.submit(conf);
app.waitForState(job, JobState.RUNNING);
Assert.assertEquals("Num tasks not correct", 1, job.getTasks().size());
Iterator<Task> it = job.getTasks().values().iterator();
Task task = it.next();
app.waitForState(task, TaskState.RUNNING);
TaskAttempt attempt = task.getAttempts().values().iterator().next();
app.waitForState(attempt, TaskAttemptState.RUNNING);
// send the diagnostic
String diagnostic1 = "Diagnostic1";
String diagnostic2 = "Diagnostic2";
app.getContext().getEventHandler().handle(new TaskAttemptDiagnosticsUpdateEvent(attempt.getID(), diagnostic1));
// send the status update
TaskAttemptStatus taskAttemptStatus = new TaskAttemptStatus();
taskAttemptStatus.id = attempt.getID();
taskAttemptStatus.progress = 0.5f;
taskAttemptStatus.stateString = "RUNNING";
taskAttemptStatus.taskState = TaskAttemptState.RUNNING;
taskAttemptStatus.phase = Phase.MAP;
// send the status update
app.getContext().getEventHandler().handle(new TaskAttemptStatusUpdateEvent(attempt.getID(), taskAttemptStatus));
//verify that all object are fully populated by invoking RPCs.
YarnRPC rpc = YarnRPC.create(conf);
MRClientProtocol proxy = (MRClientProtocol) rpc.getProxy(MRClientProtocol.class, app.clientService.getBindAddress(), conf);
GetCountersRequest gcRequest = recordFactory.newRecordInstance(GetCountersRequest.class);
gcRequest.setJobId(job.getID());
Assert.assertNotNull("Counters is null", proxy.getCounters(gcRequest).getCounters());
GetJobReportRequest gjrRequest = recordFactory.newRecordInstance(GetJobReportRequest.class);
gjrRequest.setJobId(job.getID());
JobReport jr = proxy.getJobReport(gjrRequest).getJobReport();
verifyJobReport(jr);
GetTaskAttemptCompletionEventsRequest gtaceRequest = recordFactory.newRecordInstance(GetTaskAttemptCompletionEventsRequest.class);
gtaceRequest.setJobId(job.getID());
gtaceRequest.setFromEventId(0);
gtaceRequest.setMaxEvents(10);
Assert.assertNotNull("TaskCompletionEvents is null", proxy.getTaskAttemptCompletionEvents(gtaceRequest).getCompletionEventList());
GetDiagnosticsRequest gdRequest = recordFactory.newRecordInstance(GetDiagnosticsRequest.class);
gdRequest.setTaskAttemptId(attempt.getID());
Assert.assertNotNull("Diagnostics is null", proxy.getDiagnostics(gdRequest).getDiagnosticsList());
GetTaskAttemptReportRequest gtarRequest = recordFactory.newRecordInstance(GetTaskAttemptReportRequest.class);
gtarRequest.setTaskAttemptId(attempt.getID());
TaskAttemptReport tar = proxy.getTaskAttemptReport(gtarRequest).getTaskAttemptReport();
verifyTaskAttemptReport(tar);
GetTaskReportRequest gtrRequest = recordFactory.newRecordInstance(GetTaskReportRequest.class);
gtrRequest.setTaskId(task.getID());
Assert.assertNotNull("TaskReport is null", proxy.getTaskReport(gtrRequest).getTaskReport());
GetTaskReportsRequest gtreportsRequest = recordFactory.newRecordInstance(GetTaskReportsRequest.class);
gtreportsRequest.setJobId(job.getID());
gtreportsRequest.setTaskType(TaskType.MAP);
Assert.assertNotNull("TaskReports for map is null", proxy.getTaskReports(gtreportsRequest).getTaskReportList());
gtreportsRequest = recordFactory.newRecordInstance(GetTaskReportsRequest.class);
gtreportsRequest.setJobId(job.getID());
gtreportsRequest.setTaskType(TaskType.REDUCE);
Assert.assertNotNull("TaskReports for reduce is null", proxy.getTaskReports(gtreportsRequest).getTaskReportList());
List<String> diag = proxy.getDiagnostics(gdRequest).getDiagnosticsList();
Assert.assertEquals("Num diagnostics not correct", 1, diag.size());
Assert.assertEquals("Diag 1 not correct", diagnostic1, diag.get(0).toString());
TaskReport taskReport = proxy.getTaskReport(gtrRequest).getTaskReport();
Assert.assertEquals("Num diagnostics not correct", 1, taskReport.getDiagnosticsCount());
//send the done signal to the task
app.getContext().getEventHandler().handle(new TaskAttemptEvent(task.getAttempts().values().iterator().next().getID(), TaskAttemptEventType.TA_DONE));
app.waitForState(job, JobState.SUCCEEDED);
// For invalid jobid, throw IOException
gtreportsRequest = recordFactory.newRecordInstance(GetTaskReportsRequest.class);
gtreportsRequest.setJobId(TypeConverter.toYarn(JobID.forName("job_1415730144495_0001")));
gtreportsRequest.setTaskType(TaskType.REDUCE);
try {
proxy.getTaskReports(gtreportsRequest);
fail("IOException not thrown for invalid job id");
} catch (IOException e) {
// Expected
}
}
use of org.apache.hadoop.mapreduce.v2.app.job.event.TaskAttemptEvent in project hadoop by apache.
the class TestTaskAttempt method testTooManyFetchFailureAfterKill.
@Test
public void testTooManyFetchFailureAfterKill() throws Exception {
ApplicationId appId = ApplicationId.newInstance(1, 2);
ApplicationAttemptId appAttemptId = ApplicationAttemptId.newInstance(appId, 0);
JobId jobId = MRBuilderUtils.newJobId(appId, 1);
TaskId taskId = MRBuilderUtils.newTaskId(jobId, 1, TaskType.MAP);
TaskAttemptId attemptId = MRBuilderUtils.newTaskAttemptId(taskId, 0);
Path jobFile = mock(Path.class);
MockEventHandler eventHandler = new MockEventHandler();
TaskAttemptListener taListener = mock(TaskAttemptListener.class);
when(taListener.getAddress()).thenReturn(new InetSocketAddress("localhost", 0));
JobConf jobConf = new JobConf();
jobConf.setClass("fs.file.impl", StubbedFS.class, FileSystem.class);
jobConf.setBoolean("fs.file.impl.disable.cache", true);
jobConf.set(JobConf.MAPRED_MAP_TASK_ENV, "");
jobConf.set(MRJobConfig.APPLICATION_ATTEMPT_ID, "10");
TaskSplitMetaInfo splits = mock(TaskSplitMetaInfo.class);
when(splits.getLocations()).thenReturn(new String[] { "127.0.0.1" });
AppContext appCtx = mock(AppContext.class);
ClusterInfo clusterInfo = mock(ClusterInfo.class);
Resource resource = mock(Resource.class);
when(appCtx.getClusterInfo()).thenReturn(clusterInfo);
when(resource.getMemorySize()).thenReturn(1024L);
setupTaskAttemptFinishingMonitor(eventHandler, jobConf, appCtx);
TaskAttemptImpl taImpl = new MapTaskAttemptImpl(taskId, 1, eventHandler, jobFile, 1, splits, jobConf, taListener, mock(Token.class), new Credentials(), SystemClock.getInstance(), appCtx);
NodeId nid = NodeId.newInstance("127.0.0.1", 0);
ContainerId contId = ContainerId.newContainerId(appAttemptId, 3);
Container container = mock(Container.class);
when(container.getId()).thenReturn(contId);
when(container.getNodeId()).thenReturn(nid);
when(container.getNodeHttpAddress()).thenReturn("localhost:0");
taImpl.handle(new TaskAttemptEvent(attemptId, TaskAttemptEventType.TA_SCHEDULE));
taImpl.handle(new TaskAttemptContainerAssignedEvent(attemptId, container, mock(Map.class)));
taImpl.handle(new TaskAttemptContainerLaunchedEvent(attemptId, 0));
taImpl.handle(new TaskAttemptEvent(attemptId, TaskAttemptEventType.TA_DONE));
taImpl.handle(new TaskAttemptEvent(attemptId, TaskAttemptEventType.TA_CONTAINER_COMPLETED));
assertEquals("Task attempt is not in succeeded state", taImpl.getState(), TaskAttemptState.SUCCEEDED);
taImpl.handle(new TaskAttemptEvent(attemptId, TaskAttemptEventType.TA_KILL));
assertEquals("Task attempt is not in KILLED state", taImpl.getState(), TaskAttemptState.KILLED);
taImpl.handle(new TaskAttemptEvent(attemptId, TaskAttemptEventType.TA_TOO_MANY_FETCH_FAILURE));
assertEquals("Task attempt is not in KILLED state, still", taImpl.getState(), TaskAttemptState.KILLED);
assertFalse("InternalError occurred trying to handle TA_CONTAINER_CLEANED", eventHandler.internalError);
}
Aggregations