use of org.apache.tez.dag.app.dag.event.TaskAttemptEventAttemptKilled in project tez by apache.
the class TaskCommunicatorManager method heartbeat.
public TaskHeartbeatResponse heartbeat(TaskHeartbeatRequest request) throws IOException, TezException {
ContainerId containerId = ConverterUtils.toContainerId(request.getContainerIdentifier());
if (LOG.isDebugEnabled()) {
LOG.debug("Received heartbeat from container" + ", request=" + request);
}
if (!registeredContainers.containsKey(containerId)) {
LOG.warn("Received task heartbeat from unknown container with id: " + containerId + ", asking it to die");
return RESPONSE_SHOULD_DIE;
}
// A heartbeat can come in anytime. The AM may have made a decision to kill a running task/container
// meanwhile. If the decision is processed through the pipeline before the heartbeat is processed,
// the heartbeat will be dropped. Otherwise the heartbeat will be processed - and the system
// know how to handle this - via FailedInputEvents for example (relevant only if the heartbeat has events).
// So - avoiding synchronization.
pingContainerHeartbeatHandler(containerId);
TaskAttemptEventInfo eventInfo = new TaskAttemptEventInfo(0, null, 0);
TezTaskAttemptID taskAttemptID = request.getTaskAttemptId();
if (taskAttemptID != null) {
ContainerId containerIdFromMap = registeredAttempts.get(taskAttemptID);
if (containerIdFromMap == null || !containerIdFromMap.equals(containerId)) {
// This can happen when a task heartbeats. Meanwhile the container is unregistered.
// The information will eventually make it through to the plugin via a corresponding unregister.
// There's a race in that case between the unregister making it through, and this method returning.
// TODO TEZ-2003 (post) TEZ-2666. An exception back is likely a better approach than sending a shouldDie = true,
// so that the plugin can handle the scenario. Alternately augment the response with error codes.
// Error codes would be better than exceptions.
LOG.info("Attempt: " + taskAttemptID + " is not recognized for heartbeats");
return RESPONSE_SHOULD_DIE;
}
List<TezEvent> inEvents = request.getEvents();
if (LOG.isDebugEnabled()) {
LOG.debug("Ping from " + taskAttemptID.toString() + " events: " + (inEvents != null ? inEvents.size() : -1));
}
long currTime = context.getClock().getTime();
// taFinishedEvents - means the TaskAttemptFinishedEvent
// taGeneratedEvents - for recovery, means the events generated by this task attempt and is needed by its downstream vertices
// eventsForVertex - including all the taGeneratedEvents and other events such as INPUT_READ_ERROR_EVENT/INPUT_FAILED_EVENT
// taGeneratedEvents is routed both to TaskAttempt & Vertex. Route to Vertex is for performance consideration
// taFinishedEvents must be routed before taGeneratedEvents
List<TezEvent> taFinishedEvents = new ArrayList<TezEvent>();
List<TezEvent> taGeneratedEvents = new ArrayList<TezEvent>();
List<TezEvent> eventsForVertex = new ArrayList<TezEvent>();
TaskAttemptEventStatusUpdate taskAttemptEvent = null;
boolean readErrorReported = false;
for (TezEvent tezEvent : ListUtils.emptyIfNull(inEvents)) {
// for now, set the event time on the AM when it is received.
// this avoids any time disparity between machines.
tezEvent.setEventReceivedTime(currTime);
final EventType eventType = tezEvent.getEventType();
if (eventType == EventType.TASK_STATUS_UPDATE_EVENT) {
// send TA_STATUS_UPDATE before TA_DONE/TA_FAILED/TA_KILLED otherwise Status may be missed
taskAttemptEvent = new TaskAttemptEventStatusUpdate(taskAttemptID, (TaskStatusUpdateEvent) tezEvent.getEvent());
} else if (eventType == EventType.TASK_ATTEMPT_COMPLETED_EVENT || eventType == EventType.TASK_ATTEMPT_FAILED_EVENT || eventType == EventType.TASK_ATTEMPT_KILLED_EVENT) {
taFinishedEvents.add(tezEvent);
} else {
if (eventType == EventType.INPUT_READ_ERROR_EVENT) {
readErrorReported = true;
}
if (eventType == EventType.DATA_MOVEMENT_EVENT || eventType == EventType.COMPOSITE_DATA_MOVEMENT_EVENT || eventType == EventType.ROOT_INPUT_INITIALIZER_EVENT || eventType == EventType.VERTEX_MANAGER_EVENT) {
taGeneratedEvents.add(tezEvent);
}
eventsForVertex.add(tezEvent);
}
}
if (taskAttemptEvent != null) {
taskAttemptEvent.setReadErrorReported(readErrorReported);
sendEvent(taskAttemptEvent);
}
// route taGeneratedEvents to TaskAttempt
if (!taGeneratedEvents.isEmpty()) {
sendEvent(new TaskAttemptEventTezEventUpdate(taskAttemptID, taGeneratedEvents));
}
// route events to TaskAttempt
Preconditions.checkArgument(taFinishedEvents.size() <= 1, "Multiple TaskAttemptFinishedEvent");
for (TezEvent e : taFinishedEvents) {
EventMetaData sourceMeta = e.getSourceInfo();
switch(e.getEventType()) {
case TASK_ATTEMPT_FAILED_EVENT:
case TASK_ATTEMPT_KILLED_EVENT:
TaskAttemptTerminationCause errCause = null;
switch(sourceMeta.getEventGenerator()) {
case INPUT:
errCause = TaskAttemptTerminationCause.INPUT_READ_ERROR;
break;
case PROCESSOR:
errCause = TaskAttemptTerminationCause.APPLICATION_ERROR;
break;
case OUTPUT:
errCause = TaskAttemptTerminationCause.OUTPUT_WRITE_ERROR;
break;
case SYSTEM:
errCause = TaskAttemptTerminationCause.FRAMEWORK_ERROR;
break;
default:
throw new TezUncheckedException("Unknown EventProducerConsumerType: " + sourceMeta.getEventGenerator());
}
if (e.getEventType() == EventType.TASK_ATTEMPT_FAILED_EVENT) {
TaskAttemptFailedEvent taskFailedEvent = (TaskAttemptFailedEvent) e.getEvent();
sendEvent(new TaskAttemptEventAttemptFailed(sourceMeta.getTaskAttemptID(), TaskAttemptEventType.TA_FAILED, taskFailedEvent.getTaskFailureType(), "Error: " + taskFailedEvent.getDiagnostics(), errCause));
} else {
// Killed
TaskAttemptKilledEvent taskKilledEvent = (TaskAttemptKilledEvent) e.getEvent();
sendEvent(new TaskAttemptEventAttemptKilled(sourceMeta.getTaskAttemptID(), "Error: " + taskKilledEvent.getDiagnostics(), errCause));
}
break;
case TASK_ATTEMPT_COMPLETED_EVENT:
sendEvent(new TaskAttemptEvent(sourceMeta.getTaskAttemptID(), TaskAttemptEventType.TA_DONE));
break;
default:
throw new TezUncheckedException("Unhandled tez event type: " + e.getEventType());
}
}
if (!eventsForVertex.isEmpty()) {
TezVertexID vertexId = taskAttemptID.getTaskID().getVertexID();
sendEvent(new VertexEventRouteEvent(vertexId, Collections.unmodifiableList(eventsForVertex)));
}
taskHeartbeatHandler.pinged(taskAttemptID);
eventInfo = context.getCurrentDAG().getVertex(taskAttemptID.getTaskID().getVertexID()).getTaskAttemptTezEvents(taskAttemptID, request.getStartIndex(), request.getPreRoutedStartIndex(), request.getMaxEvents());
}
return new TaskHeartbeatResponse(false, eventInfo.getEvents(), eventInfo.getNextFromEventId(), eventInfo.getNextPreRoutedFromEventId());
}
use of org.apache.tez.dag.app.dag.event.TaskAttemptEventAttemptKilled in project tez by apache.
the class TestTaskImpl method testKilledTaskTransitionWithLaunchedAttempt.
@Test(timeout = 30000)
public void testKilledTaskTransitionWithLaunchedAttempt() throws InterruptedException {
TezTaskID taskId = getNewTaskID();
scheduleTaskAttempt(taskId);
MockTaskAttemptImpl firstMockTaskAttempt = mockTask.getLastAttempt();
launchTaskAttempt(firstMockTaskAttempt.getID());
mockTask.handle(createTaskTAAddSpecAttempt(mockTask.getLastAttempt().getID()));
MockTaskAttemptImpl secondMockTaskAttempt = mockTask.getLastAttempt();
launchTaskAttempt(secondMockTaskAttempt.getID());
firstMockTaskAttempt.handle(new TaskAttemptEventSchedule(TezTaskAttemptID.fromString(firstMockTaskAttempt.toString()), 10, 10));
secondMockTaskAttempt.handle(new TaskAttemptEventSchedule(TezTaskAttemptID.fromString(secondMockTaskAttempt.toString()), 10, 10));
firstMockTaskAttempt.handle(new TaskAttemptEventSubmitted(TezTaskAttemptID.fromString(firstMockTaskAttempt.toString()), mockContainer.getId()));
secondMockTaskAttempt.handle(new TaskAttemptEventSubmitted(TezTaskAttemptID.fromString(secondMockTaskAttempt.toString()), mockContainer.getId()));
secondMockTaskAttempt.handle(new TaskAttemptEventStartedRemotely(TezTaskAttemptID.fromString(secondMockTaskAttempt.toString())));
firstMockTaskAttempt.handle(new TaskAttemptEventStartedRemotely(TezTaskAttemptID.fromString(firstMockTaskAttempt.toString())));
mockTask.handle(new TaskEventTermination(mockTask.getTaskId(), TaskAttemptTerminationCause.FRAMEWORK_ERROR, "test"));
secondMockTaskAttempt.handle(new TaskAttemptEventAttemptKilled(TezTaskAttemptID.fromString(secondMockTaskAttempt.toString()), "test", TaskAttemptTerminationCause.FRAMEWORK_ERROR));
mockTask.handle(new TaskEventTAKilled(secondMockTaskAttempt.getID(), new TaskAttemptEvent(secondMockTaskAttempt.getID(), TaskAttemptEventType.TA_KILLED)));
firstMockTaskAttempt.handle(new TaskAttemptEventAttemptKilled(TezTaskAttemptID.fromString(firstMockTaskAttempt.toString()), "test", TaskAttemptTerminationCause.FRAMEWORK_ERROR));
mockTask.handle(new TaskEventTAKilled(firstMockTaskAttempt.getID(), new TaskAttemptEvent(firstMockTaskAttempt.getID(), TaskAttemptEventType.TA_KILLED)));
firstMockTaskAttempt.handle(new TaskAttemptEventAttemptKilled(TezTaskAttemptID.fromString(firstMockTaskAttempt.toString()), "test", TaskAttemptTerminationCause.FRAMEWORK_ERROR));
assertEquals("Task should have been killed!", mockTask.getInternalState(), TaskStateInternal.KILLED);
mockTask.handle(createTaskTAAddSpecAttempt(mockTask.getLastAttempt().getID()));
MockTaskAttemptImpl thirdMockTaskAttempt = mockTask.getLastAttempt();
mockTask.handle(createTaskTALauncherEvent(thirdMockTaskAttempt.getID()));
mockTask.handle(createTaskTAAddSpecAttempt(mockTask.getLastAttempt().getID()));
MockTaskAttemptImpl fourthMockTaskAttempt = mockTask.getLastAttempt();
mockTask.handle(createTaskTASucceededEvent(fourthMockTaskAttempt.getID()));
MockTaskAttemptImpl fifthMockTaskAttempt = mockTask.getLastAttempt();
mockTask.handle(createTaskTAFailedEvent(fifthMockTaskAttempt.getID()));
}
use of org.apache.tez.dag.app.dag.event.TaskAttemptEventAttemptKilled in project tez by apache.
the class TestTaskCommunicatorManager2 method testTaskAttemptFailedKilled.
@SuppressWarnings("unchecked")
@Test(timeout = 5000)
public void testTaskAttemptFailedKilled() throws IOException, TezException {
TaskCommunicatorManagerWrapperForTest wrapper = new TaskCommunicatorManagerWrapperForTest();
TaskSpec taskSpec1 = wrapper.createTaskSpec();
AMContainerTask amContainerTask1 = new AMContainerTask(taskSpec1, null, null, false, 10);
TaskSpec taskSpec2 = wrapper.createTaskSpec();
AMContainerTask amContainerTask2 = new AMContainerTask(taskSpec2, null, null, false, 10);
ContainerId containerId1 = wrapper.createContainerId(1);
wrapper.registerRunningContainer(containerId1);
wrapper.registerTaskAttempt(containerId1, amContainerTask1);
ContainerId containerId2 = wrapper.createContainerId(2);
wrapper.registerRunningContainer(containerId2);
wrapper.registerTaskAttempt(containerId2, amContainerTask2);
wrapper.getTaskCommunicatorManager().taskFailed(amContainerTask1.getTask().getTaskAttemptID(), TaskFailureType.NON_FATAL, TaskAttemptEndReason.COMMUNICATION_ERROR, "Diagnostics1");
wrapper.getTaskCommunicatorManager().taskKilled(amContainerTask2.getTask().getTaskAttemptID(), TaskAttemptEndReason.EXECUTOR_BUSY, "Diagnostics2");
ArgumentCaptor<Event> argumentCaptor = ArgumentCaptor.forClass(Event.class);
verify(wrapper.getEventHandler(), times(2)).handle(argumentCaptor.capture());
assertTrue(argumentCaptor.getAllValues().get(0) instanceof TaskAttemptEventAttemptFailed);
assertTrue(argumentCaptor.getAllValues().get(1) instanceof TaskAttemptEventAttemptKilled);
TaskAttemptEventAttemptFailed failedEvent = (TaskAttemptEventAttemptFailed) argumentCaptor.getAllValues().get(0);
TaskAttemptEventAttemptKilled killedEvent = (TaskAttemptEventAttemptKilled) argumentCaptor.getAllValues().get(1);
assertEquals("Diagnostics1", failedEvent.getDiagnosticInfo());
assertEquals(TaskAttemptTerminationCause.COMMUNICATION_ERROR, failedEvent.getTerminationCause());
assertEquals("Diagnostics2", killedEvent.getDiagnosticInfo());
assertEquals(TaskAttemptTerminationCause.SERVICE_BUSY, killedEvent.getTerminationCause());
// TODO TEZ-2003. Verify unregistration from the registered list
}
Aggregations