use of org.apache.tez.dag.records.TaskAttemptTerminationCause in project tez by apache.
the class TaskCommunicatorManager method heartbeat.
public TaskHeartbeatResponse heartbeat(TaskHeartbeatRequest request) throws IOException, TezException {
ContainerId containerId = ConverterUtils.toContainerId(request.getContainerIdentifier());
if (LOG.isDebugEnabled()) {
LOG.debug("Received heartbeat from container" + ", request=" + request);
}
if (!registeredContainers.containsKey(containerId)) {
LOG.warn("Received task heartbeat from unknown container with id: " + containerId + ", asking it to die");
return RESPONSE_SHOULD_DIE;
}
// A heartbeat can come in anytime. The AM may have made a decision to kill a running task/container
// meanwhile. If the decision is processed through the pipeline before the heartbeat is processed,
// the heartbeat will be dropped. Otherwise the heartbeat will be processed - and the system
// know how to handle this - via FailedInputEvents for example (relevant only if the heartbeat has events).
// So - avoiding synchronization.
pingContainerHeartbeatHandler(containerId);
TaskAttemptEventInfo eventInfo = new TaskAttemptEventInfo(0, null, 0);
TezTaskAttemptID taskAttemptID = request.getTaskAttemptId();
if (taskAttemptID != null) {
ContainerId containerIdFromMap = registeredAttempts.get(taskAttemptID);
if (containerIdFromMap == null || !containerIdFromMap.equals(containerId)) {
// This can happen when a task heartbeats. Meanwhile the container is unregistered.
// The information will eventually make it through to the plugin via a corresponding unregister.
// There's a race in that case between the unregister making it through, and this method returning.
// TODO TEZ-2003 (post) TEZ-2666. An exception back is likely a better approach than sending a shouldDie = true,
// so that the plugin can handle the scenario. Alternately augment the response with error codes.
// Error codes would be better than exceptions.
LOG.info("Attempt: " + taskAttemptID + " is not recognized for heartbeats");
return RESPONSE_SHOULD_DIE;
}
List<TezEvent> inEvents = request.getEvents();
if (LOG.isDebugEnabled()) {
LOG.debug("Ping from " + taskAttemptID.toString() + " events: " + (inEvents != null ? inEvents.size() : -1));
}
long currTime = context.getClock().getTime();
// taFinishedEvents - means the TaskAttemptFinishedEvent
// taGeneratedEvents - for recovery, means the events generated by this task attempt and is needed by its downstream vertices
// eventsForVertex - including all the taGeneratedEvents and other events such as INPUT_READ_ERROR_EVENT/INPUT_FAILED_EVENT
// taGeneratedEvents is routed both to TaskAttempt & Vertex. Route to Vertex is for performance consideration
// taFinishedEvents must be routed before taGeneratedEvents
List<TezEvent> taFinishedEvents = new ArrayList<TezEvent>();
List<TezEvent> taGeneratedEvents = new ArrayList<TezEvent>();
List<TezEvent> eventsForVertex = new ArrayList<TezEvent>();
TaskAttemptEventStatusUpdate taskAttemptEvent = null;
boolean readErrorReported = false;
for (TezEvent tezEvent : ListUtils.emptyIfNull(inEvents)) {
// for now, set the event time on the AM when it is received.
// this avoids any time disparity between machines.
tezEvent.setEventReceivedTime(currTime);
final EventType eventType = tezEvent.getEventType();
if (eventType == EventType.TASK_STATUS_UPDATE_EVENT) {
// send TA_STATUS_UPDATE before TA_DONE/TA_FAILED/TA_KILLED otherwise Status may be missed
taskAttemptEvent = new TaskAttemptEventStatusUpdate(taskAttemptID, (TaskStatusUpdateEvent) tezEvent.getEvent());
} else if (eventType == EventType.TASK_ATTEMPT_COMPLETED_EVENT || eventType == EventType.TASK_ATTEMPT_FAILED_EVENT || eventType == EventType.TASK_ATTEMPT_KILLED_EVENT) {
taFinishedEvents.add(tezEvent);
} else {
if (eventType == EventType.INPUT_READ_ERROR_EVENT) {
readErrorReported = true;
}
if (eventType == EventType.DATA_MOVEMENT_EVENT || eventType == EventType.COMPOSITE_DATA_MOVEMENT_EVENT || eventType == EventType.ROOT_INPUT_INITIALIZER_EVENT || eventType == EventType.VERTEX_MANAGER_EVENT) {
taGeneratedEvents.add(tezEvent);
}
eventsForVertex.add(tezEvent);
}
}
if (taskAttemptEvent != null) {
taskAttemptEvent.setReadErrorReported(readErrorReported);
sendEvent(taskAttemptEvent);
}
// route taGeneratedEvents to TaskAttempt
if (!taGeneratedEvents.isEmpty()) {
sendEvent(new TaskAttemptEventTezEventUpdate(taskAttemptID, taGeneratedEvents));
}
// route events to TaskAttempt
Preconditions.checkArgument(taFinishedEvents.size() <= 1, "Multiple TaskAttemptFinishedEvent");
for (TezEvent e : taFinishedEvents) {
EventMetaData sourceMeta = e.getSourceInfo();
switch(e.getEventType()) {
case TASK_ATTEMPT_FAILED_EVENT:
case TASK_ATTEMPT_KILLED_EVENT:
TaskAttemptTerminationCause errCause = null;
switch(sourceMeta.getEventGenerator()) {
case INPUT:
errCause = TaskAttemptTerminationCause.INPUT_READ_ERROR;
break;
case PROCESSOR:
errCause = TaskAttemptTerminationCause.APPLICATION_ERROR;
break;
case OUTPUT:
errCause = TaskAttemptTerminationCause.OUTPUT_WRITE_ERROR;
break;
case SYSTEM:
errCause = TaskAttemptTerminationCause.FRAMEWORK_ERROR;
break;
default:
throw new TezUncheckedException("Unknown EventProducerConsumerType: " + sourceMeta.getEventGenerator());
}
if (e.getEventType() == EventType.TASK_ATTEMPT_FAILED_EVENT) {
TaskAttemptFailedEvent taskFailedEvent = (TaskAttemptFailedEvent) e.getEvent();
sendEvent(new TaskAttemptEventAttemptFailed(sourceMeta.getTaskAttemptID(), TaskAttemptEventType.TA_FAILED, taskFailedEvent.getTaskFailureType(), "Error: " + taskFailedEvent.getDiagnostics(), errCause));
} else {
// Killed
TaskAttemptKilledEvent taskKilledEvent = (TaskAttemptKilledEvent) e.getEvent();
sendEvent(new TaskAttemptEventAttemptKilled(sourceMeta.getTaskAttemptID(), "Error: " + taskKilledEvent.getDiagnostics(), errCause));
}
break;
case TASK_ATTEMPT_COMPLETED_EVENT:
sendEvent(new TaskAttemptEvent(sourceMeta.getTaskAttemptID(), TaskAttemptEventType.TA_DONE));
break;
default:
throw new TezUncheckedException("Unhandled tez event type: " + e.getEventType());
}
}
if (!eventsForVertex.isEmpty()) {
TezVertexID vertexId = taskAttemptID.getTaskID().getVertexID();
sendEvent(new VertexEventRouteEvent(vertexId, Collections.unmodifiableList(eventsForVertex)));
}
taskHeartbeatHandler.pinged(taskAttemptID);
eventInfo = context.getCurrentDAG().getVertex(taskAttemptID.getTaskID().getVertexID()).getTaskAttemptTezEvents(taskAttemptID, request.getStartIndex(), request.getPreRoutedStartIndex(), request.getMaxEvents());
}
return new TaskHeartbeatResponse(false, eventInfo.getEvents(), eventInfo.getNextFromEventId(), eventInfo.getNextPreRoutedFromEventId());
}
use of org.apache.tez.dag.records.TaskAttemptTerminationCause in project tez by apache.
the class TaskSchedulerManager method containerCompleted.
public synchronized void containerCompleted(int schedulerId, Object task, ContainerStatus containerStatus) {
// SchedulerId isn't used here since no node updates are sent out
// Inform the Containers about completion.
AMContainer amContainer = appContext.getAllContainers().get(containerStatus.getContainerId());
if (amContainer != null) {
String message = "Container completed. ";
TaskAttemptTerminationCause errCause = TaskAttemptTerminationCause.CONTAINER_EXITED;
int exitStatus = containerStatus.getExitStatus();
if (exitStatus == ContainerExitStatus.PREEMPTED) {
message = "Container preempted externally. ";
errCause = TaskAttemptTerminationCause.EXTERNAL_PREEMPTION;
} else if (exitStatus == ContainerExitStatus.DISKS_FAILED) {
message = "Container disk failed. ";
errCause = TaskAttemptTerminationCause.NODE_DISK_ERROR;
} else if (exitStatus != ContainerExitStatus.SUCCESS) {
message = "Container failed, exitCode=" + exitStatus + ". ";
}
if (containerStatus.getDiagnostics() != null) {
message += containerStatus.getDiagnostics();
}
sendEvent(new AMContainerEventCompleted(amContainer.getContainerId(), exitStatus, message, errCause));
}
}
use of org.apache.tez.dag.records.TaskAttemptTerminationCause in project tez by apache.
the class HistoryEventHandler method shouldLogTaskAttemptEvents.
// If the log level is set to TASK_ATTEMPT and filters are configured, then we should suppress
// the start event and publish it only when TaskAttemptFinishedEvent is received after
// matching against the filter.
// Note: if the AM is killed before we get the TaskAttemptFinishedEvent, we'll lose this event.
private boolean shouldLogTaskAttemptEvents(DAGHistoryEvent event, HistoryLogLevel dagLogLevel) {
HistoryEvent historyEvent = event.getHistoryEvent();
HistoryEventType eventType = historyEvent.getEventType();
if (dagLogLevel == HistoryLogLevel.TASK_ATTEMPT && (eventType == HistoryEventType.TASK_ATTEMPT_STARTED || eventType == HistoryEventType.TASK_ATTEMPT_FINISHED)) {
TezDAGID dagId = event.getDagID();
Set<TaskAttemptTerminationCause> filters = null;
if (dagId != null) {
filters = dagIdToTaskAttemptFilters.get(dagId);
}
if (filters == null) {
filters = amTaskAttemptFilters;
}
if (filters == null) {
return true;
}
if (eventType == HistoryEventType.TASK_ATTEMPT_STARTED) {
suppressedEvents.put(((TaskAttemptStartedEvent) historyEvent).getTaskAttemptID(), event);
return false;
} else {
// TaskAttemptFinishedEvent
TaskAttemptFinishedEvent finishedEvent = (TaskAttemptFinishedEvent) historyEvent;
if (filters.contains(finishedEvent.getTaskAttemptError())) {
suppressedEvents.remove(finishedEvent.getTaskAttemptID());
return false;
}
}
}
return true;
}
use of org.apache.tez.dag.records.TaskAttemptTerminationCause in project tez by apache.
the class VertexImpl method tryEnactKill.
/**
* Set the terminationCause and send a kill-message to all tasks.
* The task-kill messages are only sent once.
*/
void tryEnactKill(VertexTerminationCause trigger, TaskTerminationCause taskterminationCause) {
// In most cases the dag is shutting down due to some error
TaskAttemptTerminationCause errCause = TaskAttemptTerminationCause.TERMINATED_AT_SHUTDOWN;
if (taskterminationCause == TaskTerminationCause.DAG_KILL) {
errCause = TaskAttemptTerminationCause.TERMINATED_BY_CLIENT;
}
if (trySetTerminationCause(trigger)) {
String msg = "Killing tasks in vertex: " + logIdentifier + " due to trigger: " + trigger;
LOG.info(msg);
for (Task task : tasks.values()) {
// attempt was terminated because the vertex is shutting down
eventHandler.handle(new TaskEventTermination(task.getTaskId(), errCause, msg));
}
}
}
use of org.apache.tez.dag.records.TaskAttemptTerminationCause in project tez by apache.
the class TestHistoryEventTimelineConversion method testConvertTaskAttemptFinishedEvent.
@SuppressWarnings("unchecked")
@Test(timeout = 5000)
public void testConvertTaskAttemptFinishedEvent() {
String vertexName = "testVertex";
long creationTime = random.nextLong();
long startTime = creationTime + 1000;
long allocationTime = creationTime + 1001;
long finishTime = startTime + 1002;
TaskAttemptState state = TaskAttemptState.values()[random.nextInt(TaskAttemptState.values().length)];
TaskAttemptTerminationCause error = TaskAttemptTerminationCause.values()[random.nextInt(TaskAttemptTerminationCause.values().length)];
String diagnostics = "random diagnostics message";
TezCounters counters = new TezCounters();
long lastDataEventTime = finishTime - 1;
List<DataEventDependencyInfo> events = Lists.newArrayList();
events.add(new DataEventDependencyInfo(lastDataEventTime, tezTaskAttemptID));
events.add(new DataEventDependencyInfo(lastDataEventTime, tezTaskAttemptID));
TaskAttemptFinishedEvent event = new TaskAttemptFinishedEvent(tezTaskAttemptID, vertexName, startTime, finishTime, state, TaskFailureType.FATAL, error, diagnostics, counters, events, null, creationTime, tezTaskAttemptID, allocationTime, containerId, nodeId, "inProgressURL", "logsURL", "nodeHttpAddress");
List<TimelineEntity> entities = HistoryEventTimelineConversion.convertToTimelineEntities(event);
Assert.assertEquals(1, entities.size());
TimelineEntity timelineEntity = entities.get(0);
Assert.assertEquals(tezTaskAttemptID.toString(), timelineEntity.getEntityId());
Assert.assertEquals(EntityTypes.TEZ_TASK_ATTEMPT_ID.name(), timelineEntity.getEntityType());
final Map<String, Set<Object>> primaryFilters = timelineEntity.getPrimaryFilters();
Assert.assertEquals(5, primaryFilters.size());
Assert.assertTrue(primaryFilters.get(ATSConstants.APPLICATION_ID).contains(applicationId.toString()));
Assert.assertTrue(primaryFilters.get(EntityTypes.TEZ_DAG_ID.name()).contains(tezDAGID.toString()));
Assert.assertTrue(primaryFilters.get(EntityTypes.TEZ_VERTEX_ID.name()).contains(tezVertexID.toString()));
Assert.assertTrue(primaryFilters.get(EntityTypes.TEZ_TASK_ID.name()).contains(tezTaskID.toString()));
Assert.assertTrue(primaryFilters.get(ATSConstants.STATUS).contains(state.toString()));
Assert.assertEquals(1, timelineEntity.getEvents().size());
TimelineEvent evt = timelineEntity.getEvents().get(0);
Assert.assertEquals(HistoryEventType.TASK_ATTEMPT_FINISHED.name(), evt.getEventType());
Assert.assertEquals(finishTime, evt.getTimestamp());
final Map<String, Object> otherInfo = timelineEntity.getOtherInfo();
Assert.assertEquals(17, otherInfo.size());
Assert.assertEquals(tezTaskAttemptID.toString(), timelineEntity.getOtherInfo().get(ATSConstants.CREATION_CAUSAL_ATTEMPT));
Assert.assertEquals(creationTime, timelineEntity.getOtherInfo().get(ATSConstants.CREATION_TIME));
Assert.assertEquals(allocationTime, timelineEntity.getOtherInfo().get(ATSConstants.ALLOCATION_TIME));
Assert.assertEquals(startTime, timelineEntity.getOtherInfo().get(ATSConstants.START_TIME));
Assert.assertEquals(finishTime, otherInfo.get(ATSConstants.FINISH_TIME));
Assert.assertEquals(finishTime - startTime, otherInfo.get(ATSConstants.TIME_TAKEN));
Assert.assertEquals(state.name(), otherInfo.get(ATSConstants.STATUS));
Assert.assertEquals(TaskFailureType.FATAL.name(), otherInfo.get(ATSConstants.TASK_FAILURE_TYPE));
Assert.assertEquals(error.name(), otherInfo.get(ATSConstants.TASK_ATTEMPT_ERROR_ENUM));
Assert.assertEquals(diagnostics, otherInfo.get(ATSConstants.DIAGNOSTICS));
Map<String, Object> obj1 = (Map<String, Object>) otherInfo.get(ATSConstants.LAST_DATA_EVENTS);
List<Object> obj2 = (List<Object>) obj1.get(ATSConstants.LAST_DATA_EVENTS);
Assert.assertEquals(2, obj2.size());
Map<String, Object> obj3 = (Map<String, Object>) obj2.get(0);
Assert.assertEquals(events.get(0).getTimestamp(), obj3.get(ATSConstants.TIMESTAMP));
Assert.assertTrue(otherInfo.containsKey(ATSConstants.COUNTERS));
Assert.assertEquals("inProgressURL", otherInfo.get(ATSConstants.IN_PROGRESS_LOGS_URL));
Assert.assertEquals("logsURL", otherInfo.get(ATSConstants.COMPLETED_LOGS_URL));
Assert.assertEquals(nodeId.toString(), otherInfo.get(ATSConstants.NODE_ID));
Assert.assertEquals(containerId.toString(), otherInfo.get(ATSConstants.CONTAINER_ID));
Assert.assertEquals("nodeHttpAddress", otherInfo.get(ATSConstants.NODE_HTTP_ADDRESS));
TaskAttemptFinishedEvent eventWithNullFailureType = new TaskAttemptFinishedEvent(tezTaskAttemptID, vertexName, startTime, finishTime, state, null, error, diagnostics, counters, events, null, creationTime, tezTaskAttemptID, allocationTime, containerId, nodeId, "inProgressURL", "logsURL", "nodeHttpAddress");
List<TimelineEntity> evtEntities = HistoryEventTimelineConversion.convertToTimelineEntities(eventWithNullFailureType);
Assert.assertEquals(1, evtEntities.size());
TimelineEntity timelineEntityWithNullFailureType = evtEntities.get(0);
Assert.assertNull(timelineEntityWithNullFailureType.getOtherInfo().get(ATSConstants.TASK_FAILURE_TYPE));
}
Aggregations