Search in sources :

Example 1 with TaskAttemptTerminationCause

Use of org.apache.tez.dag.records.TaskAttemptTerminationCause in the Apache Tez project.

In class TaskCommunicatorManager, method heartbeat.

/**
 * Processes a heartbeat received from a container and, when the request names a task attempt,
 * routes the events carried by the heartbeat and collects the events destined for that attempt.
 *
 * <p>Flow, as implemented below:
 * <ol>
 *   <li>If the container is not in {@code registeredContainers}, or the attempt is not mapped to
 *       this container in {@code registeredAttempts}, {@code RESPONSE_SHOULD_DIE} is returned.</li>
 *   <li>Incoming events are stamped with the AM clock time and split into: the status update,
 *       attempt-finished events, attempt-generated events, and events routed to the vertex.</li>
 *   <li>Events are dispatched via {@code sendEvent} in this order: status update, generated events
 *       (to the task attempt), finished event, then the vertex-bound events.</li>
 *   <li>The attempt is pinged on {@code taskHeartbeatHandler} and the events for the attempt are
 *       fetched from its vertex.</li>
 * </ol>
 *
 * @param request the heartbeat request; its task attempt id may be {@code null}, in which case
 *                only the container ping is performed and an empty event info is returned
 * @return {@code RESPONSE_SHOULD_DIE} for unknown containers/attempts; otherwise a new response
 *         built with {@code false} as its first argument (presumably the should-die flag —
 *         confirm against TaskHeartbeatResponse) and the event info for the attempt
 * @throws IOException declared by the signature; the throwing call is not identifiable from this method alone
 * @throws TezException declared by the signature; the throwing call is not identifiable from this method alone
 */
public TaskHeartbeatResponse heartbeat(TaskHeartbeatRequest request) throws IOException, TezException {
    ContainerId containerId = ConverterUtils.toContainerId(request.getContainerIdentifier());
    if (LOG.isDebugEnabled()) {
        LOG.debug("Received heartbeat from container" + ", request=" + request);
    }
    // Heartbeats from containers this manager does not know about are rejected outright.
    if (!registeredContainers.containsKey(containerId)) {
        LOG.warn("Received task heartbeat from unknown container with id: " + containerId + ", asking it to die");
        return RESPONSE_SHOULD_DIE;
    }
    // A heartbeat can come in anytime. The AM may have made a decision to kill a running task/container
    // meanwhile. If the decision is processed through the pipeline before the heartbeat is processed,
    // the heartbeat will be dropped. Otherwise the heartbeat will be processed - and the system
    // knows how to handle this - via FailedInputEvents for example (relevant only if the heartbeat has events).
    // So - avoiding synchronization.
    pingContainerHeartbeatHandler(containerId);
    // Default (empty) event info, used when the request carries no task attempt id.
    TaskAttemptEventInfo eventInfo = new TaskAttemptEventInfo(0, null, 0);
    TezTaskAttemptID taskAttemptID = request.getTaskAttemptId();
    if (taskAttemptID != null) {
        ContainerId containerIdFromMap = registeredAttempts.get(taskAttemptID);
        if (containerIdFromMap == null || !containerIdFromMap.equals(containerId)) {
            // This can happen when a task heartbeats. Meanwhile the container is unregistered.
            // The information will eventually make it through to the plugin via a corresponding unregister.
            // There's a race in that case between the unregister making it through, and this method returning.
            // TODO TEZ-2003 (post) TEZ-2666. An exception back is likely a better approach than sending a shouldDie = true,
            // so that the plugin can handle the scenario. Alternately augment the response with error codes.
            // Error codes would be better than exceptions.
            LOG.info("Attempt: " + taskAttemptID + " is not recognized for heartbeats");
            return RESPONSE_SHOULD_DIE;
        }
        List<TezEvent> inEvents = request.getEvents();
        if (LOG.isDebugEnabled()) {
            // -1 is logged when the request carries no event list at all (null), as opposed to an empty one.
            LOG.debug("Ping from " + taskAttemptID.toString() + " events: " + (inEvents != null ? inEvents.size() : -1));
        }
        long currTime = context.getClock().getTime();
        // taFinishedEvents - means the TaskAttemptFinishedEvent
        // taGeneratedEvents - for recovery, means the events generated by this task attempt and is needed by its downstream vertices
        // eventsForVertex - including all the taGeneratedEvents and other events such as INPUT_READ_ERROR_EVENT/INPUT_FAILED_EVENT
        // taGeneratedEvents is routed both to TaskAttempt & Vertex. Route to Vertex is for performance consideration
        // taFinishedEvents must be routed before taGeneratedEvents
        // NOTE(review): the code below sends taGeneratedEvents BEFORE taFinishedEvents, which appears to
        // contradict the previous line — confirm which ordering is intended.
        List<TezEvent> taFinishedEvents = new ArrayList<TezEvent>();
        List<TezEvent> taGeneratedEvents = new ArrayList<TezEvent>();
        List<TezEvent> eventsForVertex = new ArrayList<TezEvent>();
        TaskAttemptEventStatusUpdate taskAttemptEvent = null;
        boolean readErrorReported = false;
        for (TezEvent tezEvent : ListUtils.emptyIfNull(inEvents)) {
            // for now, set the event time on the AM when it is received.
            // this avoids any time disparity between machines.
            tezEvent.setEventReceivedTime(currTime);
            final EventType eventType = tezEvent.getEventType();
            if (eventType == EventType.TASK_STATUS_UPDATE_EVENT) {
                // send TA_STATUS_UPDATE before TA_DONE/TA_FAILED/TA_KILLED otherwise Status may be missed
                // If several status updates arrive in one heartbeat, only the last one is kept.
                taskAttemptEvent = new TaskAttemptEventStatusUpdate(taskAttemptID, (TaskStatusUpdateEvent) tezEvent.getEvent());
            } else if (eventType == EventType.TASK_ATTEMPT_COMPLETED_EVENT || eventType == EventType.TASK_ATTEMPT_FAILED_EVENT || eventType == EventType.TASK_ATTEMPT_KILLED_EVENT) {
                taFinishedEvents.add(tezEvent);
            } else {
                if (eventType == EventType.INPUT_READ_ERROR_EVENT) {
                    readErrorReported = true;
                }
                // Generated events are collected separately AND still added to eventsForVertex below.
                if (eventType == EventType.DATA_MOVEMENT_EVENT || eventType == EventType.COMPOSITE_DATA_MOVEMENT_EVENT || eventType == EventType.ROOT_INPUT_INITIALIZER_EVENT || eventType == EventType.VERTEX_MANAGER_EVENT) {
                    taGeneratedEvents.add(tezEvent);
                }
                eventsForVertex.add(tezEvent);
            }
        }
        if (taskAttemptEvent != null) {
            // The read-error flag is attached only when a status update is present in this heartbeat.
            taskAttemptEvent.setReadErrorReported(readErrorReported);
            sendEvent(taskAttemptEvent);
        }
        // route taGeneratedEvents to TaskAttempt
        if (!taGeneratedEvents.isEmpty()) {
            sendEvent(new TaskAttemptEventTezEventUpdate(taskAttemptID, taGeneratedEvents));
        }
        // route events to TaskAttempt
        // At most one finished event is accepted per heartbeat; more than one fails the precondition.
        Preconditions.checkArgument(taFinishedEvents.size() <= 1, "Multiple TaskAttemptFinishedEvent");
        for (TezEvent e : taFinishedEvents) {
            EventMetaData sourceMeta = e.getSourceInfo();
            switch(e.getEventType()) {
                case TASK_ATTEMPT_FAILED_EVENT:
                case TASK_ATTEMPT_KILLED_EVENT:
                    // Derive the termination cause from the component that generated the event.
                    TaskAttemptTerminationCause errCause = null;
                    switch(sourceMeta.getEventGenerator()) {
                        case INPUT:
                            errCause = TaskAttemptTerminationCause.INPUT_READ_ERROR;
                            break;
                        case PROCESSOR:
                            errCause = TaskAttemptTerminationCause.APPLICATION_ERROR;
                            break;
                        case OUTPUT:
                            errCause = TaskAttemptTerminationCause.OUTPUT_WRITE_ERROR;
                            break;
                        case SYSTEM:
                            errCause = TaskAttemptTerminationCause.FRAMEWORK_ERROR;
                            break;
                        default:
                            throw new TezUncheckedException("Unknown EventProducerConsumerType: " + sourceMeta.getEventGenerator());
                    }
                    if (e.getEventType() == EventType.TASK_ATTEMPT_FAILED_EVENT) {
                        TaskAttemptFailedEvent taskFailedEvent = (TaskAttemptFailedEvent) e.getEvent();
                        sendEvent(new TaskAttemptEventAttemptFailed(sourceMeta.getTaskAttemptID(), TaskAttemptEventType.TA_FAILED, taskFailedEvent.getTaskFailureType(), "Error: " + taskFailedEvent.getDiagnostics(), errCause));
                    } else {
                        // Killed
                        TaskAttemptKilledEvent taskKilledEvent = (TaskAttemptKilledEvent) e.getEvent();
                        sendEvent(new TaskAttemptEventAttemptKilled(sourceMeta.getTaskAttemptID(), "Error: " + taskKilledEvent.getDiagnostics(), errCause));
                    }
                    break;
                case TASK_ATTEMPT_COMPLETED_EVENT:
                    sendEvent(new TaskAttemptEvent(sourceMeta.getTaskAttemptID(), TaskAttemptEventType.TA_DONE));
                    break;
                default:
                    throw new TezUncheckedException("Unhandled tez event type: " + e.getEventType());
            }
        }
        if (!eventsForVertex.isEmpty()) {
            TezVertexID vertexId = taskAttemptID.getTaskID().getVertexID();
            // The list is wrapped as unmodifiable before being handed to the vertex.
            sendEvent(new VertexEventRouteEvent(vertexId, Collections.unmodifiableList(eventsForVertex)));
        }
        taskHeartbeatHandler.pinged(taskAttemptID);
        // Fetch the events for this attempt from its vertex, bounded by the indices/max count in the request.
        eventInfo = context.getCurrentDAG().getVertex(taskAttemptID.getTaskID().getVertexID()).getTaskAttemptTezEvents(taskAttemptID, request.getStartIndex(), request.getPreRoutedStartIndex(), request.getMaxEvents());
    }
    return new TaskHeartbeatResponse(false, eventInfo.getEvents(), eventInfo.getNextFromEventId(), eventInfo.getNextPreRoutedFromEventId());
}
Also used : TezUncheckedException(org.apache.tez.dag.api.TezUncheckedException) TaskAttemptEventStatusUpdate(org.apache.tez.dag.app.dag.event.TaskAttemptEventStatusUpdate) DAGAppMasterEventType(org.apache.tez.dag.app.dag.event.DAGAppMasterEventType) EventType(org.apache.tez.runtime.api.impl.EventType) TaskAttemptEventType(org.apache.tez.dag.app.dag.event.TaskAttemptEventType) ArrayList(java.util.ArrayList) TaskAttemptEvent(org.apache.tez.dag.app.dag.event.TaskAttemptEvent) TaskStatusUpdateEvent(org.apache.tez.runtime.api.events.TaskStatusUpdateEvent) VertexEventRouteEvent(org.apache.tez.dag.app.dag.event.VertexEventRouteEvent) TaskAttemptFailedEvent(org.apache.tez.runtime.api.events.TaskAttemptFailedEvent) TaskAttemptEventTezEventUpdate(org.apache.tez.dag.app.dag.event.TaskAttemptEventTezEventUpdate) ContainerId(org.apache.hadoop.yarn.api.records.ContainerId) TaskAttemptEventAttemptKilled(org.apache.tez.dag.app.dag.event.TaskAttemptEventAttemptKilled) TaskHeartbeatResponse(org.apache.tez.serviceplugins.api.TaskHeartbeatResponse) TezEvent(org.apache.tez.runtime.api.impl.TezEvent) TaskAttemptTerminationCause(org.apache.tez.dag.records.TaskAttemptTerminationCause) TaskAttemptKilledEvent(org.apache.tez.runtime.api.events.TaskAttemptKilledEvent) EventMetaData(org.apache.tez.runtime.api.impl.EventMetaData) TezVertexID(org.apache.tez.dag.records.TezVertexID) TezTaskAttemptID(org.apache.tez.dag.records.TezTaskAttemptID) TaskAttemptEventAttemptFailed(org.apache.tez.dag.app.dag.event.TaskAttemptEventAttemptFailed)

Example 2 with TaskAttemptTerminationCause

Use of org.apache.tez.dag.records.TaskAttemptTerminationCause in the Apache Tez project.

In class TaskSchedulerManager, method containerCompleted.

/**
 * Translates a completed container's exit status into a human-readable message and a
 * {@link TaskAttemptTerminationCause}, then emits an {@code AMContainerEventCompleted}.
 * Nothing is sent when the container is not found in the application context.
 *
 * @param schedulerId     not used by this method (no node updates are sent out)
 * @param task            not used by this method
 * @param containerStatus status of the completed container; supplies the container id,
 *                        exit status and optional diagnostics
 */
public synchronized void containerCompleted(int schedulerId, Object task, ContainerStatus containerStatus) {
    final AMContainer completedContainer = appContext.getAllContainers().get(containerStatus.getContainerId());
    if (completedContainer == null) {
        // Unknown container: nothing to notify.
        return;
    }

    final int exitCode = containerStatus.getExitStatus();
    final String summary;
    final TaskAttemptTerminationCause cause;
    if (exitCode == ContainerExitStatus.PREEMPTED) {
        summary = "Container preempted externally. ";
        cause = TaskAttemptTerminationCause.EXTERNAL_PREEMPTION;
    } else if (exitCode == ContainerExitStatus.DISKS_FAILED) {
        summary = "Container disk failed. ";
        cause = TaskAttemptTerminationCause.NODE_DISK_ERROR;
    } else {
        // Both a clean exit and any other failure share the generic CONTAINER_EXITED cause;
        // only the message differs.
        cause = TaskAttemptTerminationCause.CONTAINER_EXITED;
        summary = (exitCode == ContainerExitStatus.SUCCESS)
            ? "Container completed. "
            : "Container failed, exitCode=" + exitCode + ". ";
    }

    // Diagnostics are optional; append them only when present.
    final String diagnostics = containerStatus.getDiagnostics();
    final String fullMessage = (diagnostics == null) ? summary : summary + diagnostics;

    sendEvent(new AMContainerEventCompleted(completedContainer.getContainerId(), exitCode, fullMessage, cause));
}
Also used : AMContainerEventCompleted(org.apache.tez.dag.app.rm.container.AMContainerEventCompleted) TaskAttemptTerminationCause(org.apache.tez.dag.records.TaskAttemptTerminationCause) AMContainer(org.apache.tez.dag.app.rm.container.AMContainer) TaskLocationHint(org.apache.tez.dag.api.TaskLocationHint)

Example 3 with TaskAttemptTerminationCause

Use of org.apache.tez.dag.records.TaskAttemptTerminationCause in the Apache Tez project.

In class HistoryEventHandler, method shouldLogTaskAttemptEvents.

// Decides whether a task-attempt history event should be logged right now.
// When the DAG log level is TASK_ATTEMPT and termination-cause filters are configured, the
// start event is held back (stored in suppressedEvents) and the decision is deferred until the
// matching TaskAttemptFinishedEvent arrives and is checked against the filters.
// Note: if the AM is killed before the TaskAttemptFinishedEvent is received, the held-back
// start event is lost.
private boolean shouldLogTaskAttemptEvents(DAGHistoryEvent event, HistoryLogLevel dagLogLevel) {
    final HistoryEvent payload = event.getHistoryEvent();
    final HistoryEventType type = payload.getEventType();
    final boolean isStart = type == HistoryEventType.TASK_ATTEMPT_STARTED;
    final boolean isFinish = type == HistoryEventType.TASK_ATTEMPT_FINISHED;

    // Filtering only applies to attempt start/finish events at TASK_ATTEMPT log level.
    if (dagLogLevel != HistoryLogLevel.TASK_ATTEMPT || !(isStart || isFinish)) {
        return true;
    }

    // Prefer DAG-specific filters; fall back to the AM-wide ones.
    final TezDAGID dagId = event.getDagID();
    Set<TaskAttemptTerminationCause> activeFilters = null;
    if (dagId != null) {
        activeFilters = dagIdToTaskAttemptFilters.get(dagId);
    }
    if (activeFilters == null) {
        activeFilters = amTaskAttemptFilters;
    }
    if (activeFilters == null) {
        // No filters configured anywhere: log everything.
        return true;
    }

    if (isStart) {
        // Hold the start event back until the attempt finishes.
        suppressedEvents.put(((TaskAttemptStartedEvent) payload).getTaskAttemptID(), event);
        return false;
    }

    // Finished event: drop it (and its held-back start event) when its cause is filtered.
    final TaskAttemptFinishedEvent finished = (TaskAttemptFinishedEvent) payload;
    if (!activeFilters.contains(finished.getTaskAttemptError())) {
        return true;
    }
    suppressedEvents.remove(finished.getTaskAttemptID());
    return false;
}
Also used : TezDAGID(org.apache.tez.dag.records.TezDAGID) TaskAttemptTerminationCause(org.apache.tez.dag.records.TaskAttemptTerminationCause) TaskAttemptFinishedEvent(org.apache.tez.dag.history.events.TaskAttemptFinishedEvent)

Example 4 with TaskAttemptTerminationCause

Use of org.apache.tez.dag.records.TaskAttemptTerminationCause in the Apache Tez project.

In class VertexImpl, method tryEnactKill.

/**
 * Attempts to record {@code trigger} as the vertex termination cause and, when that succeeds,
 * sends a termination event to every task of this vertex.
 * Termination events are sent only when {@code trySetTerminationCause} accepts the trigger,
 * so the task-kill messages are sent once.
 *
 * @param trigger              the reason the vertex is being terminated
 * @param taskterminationCause task-level cause; {@code DAG_KILL} maps the attempt cause to
 *                             {@code TERMINATED_BY_CLIENT}, anything else to
 *                             {@code TERMINATED_AT_SHUTDOWN}
 */
void tryEnactKill(VertexTerminationCause trigger, TaskTerminationCause taskterminationCause) {
    if (!trySetTerminationCause(trigger)) {
        // The trigger was not accepted; do not send kill messages.
        return;
    }

    // In most cases the dag is shutting down due to some error; a DAG kill is attributed to the client.
    final TaskAttemptTerminationCause attemptCause =
        (taskterminationCause == TaskTerminationCause.DAG_KILL)
            ? TaskAttemptTerminationCause.TERMINATED_BY_CLIENT
            : TaskAttemptTerminationCause.TERMINATED_AT_SHUTDOWN;

    final String killMessage = "Killing tasks in vertex: " + logIdentifier + " due to trigger: " + trigger;
    LOG.info(killMessage);

    // Each task is told its attempts are being terminated because the vertex is shutting down.
    for (Task victim : tasks.values()) {
        eventHandler.handle(new TaskEventTermination(victim.getTaskId(), attemptCause, killMessage));
    }
}
Also used : TaskEventScheduleTask(org.apache.tez.dag.app.dag.event.TaskEventScheduleTask) Task(org.apache.tez.dag.app.dag.Task) TaskAttemptTerminationCause(org.apache.tez.dag.records.TaskAttemptTerminationCause) TaskEventTermination(org.apache.tez.dag.app.dag.event.TaskEventTermination)

Example 5 with TaskAttemptTerminationCause

Use of org.apache.tez.dag.records.TaskAttemptTerminationCause in the Apache Tez project.

In class TestHistoryEventTimelineConversion, method testConvertTaskAttemptFinishedEvent.

/**
 * Converts a {@code TaskAttemptFinishedEvent} (built with a randomly chosen state and termination
 * cause) to timeline entities and checks the entity id/type, the five primary filters, the single
 * TASK_ATTEMPT_FINISHED timeline event and the seventeen other-info entries. A second event with a
 * {@code null} failure type must produce no TASK_FAILURE_TYPE entry.
 */
@SuppressWarnings("unchecked")
@Test(timeout = 5000)
public void testConvertTaskAttemptFinishedEvent() {
    // --- fixture -------------------------------------------------------------------------
    final String vertex = "testVertex";
    final long createdAt = random.nextLong();
    final long startedAt = createdAt + 1000;
    final long allocatedAt = createdAt + 1001;
    final long finishedAt = startedAt + 1002;
    final TaskAttemptState[] allStates = TaskAttemptState.values();
    final TaskAttemptState attemptState = allStates[random.nextInt(allStates.length)];
    final TaskAttemptTerminationCause[] allCauses = TaskAttemptTerminationCause.values();
    final TaskAttemptTerminationCause cause = allCauses[random.nextInt(allCauses.length)];
    final String diag = "random diagnostics message";
    final TezCounters tezCounters = new TezCounters();
    final long lastDataEventAt = finishedAt - 1;
    final List<DataEventDependencyInfo> dataEvents = Lists.newArrayList();
    for (int i = 0; i < 2; i++) {
        dataEvents.add(new DataEventDependencyInfo(lastDataEventAt, tezTaskAttemptID));
    }

    // --- conversion with a FATAL failure type ----------------------------------------------
    final TaskAttemptFinishedEvent finishedEvent = new TaskAttemptFinishedEvent(tezTaskAttemptID, vertex, startedAt, finishedAt, attemptState, TaskFailureType.FATAL, cause, diag, tezCounters, dataEvents, null, createdAt, tezTaskAttemptID, allocatedAt, containerId, nodeId, "inProgressURL", "logsURL", "nodeHttpAddress");
    final List<TimelineEntity> converted = HistoryEventTimelineConversion.convertToTimelineEntities(finishedEvent);
    Assert.assertEquals(1, converted.size());
    final TimelineEntity entity = converted.get(0);
    Assert.assertEquals(tezTaskAttemptID.toString(), entity.getEntityId());
    Assert.assertEquals(EntityTypes.TEZ_TASK_ATTEMPT_ID.name(), entity.getEntityType());

    // Primary filters: application, DAG, vertex, task and status.
    final Map<String, Set<Object>> filters = entity.getPrimaryFilters();
    Assert.assertEquals(5, filters.size());
    Assert.assertTrue(filters.get(ATSConstants.APPLICATION_ID).contains(applicationId.toString()));
    Assert.assertTrue(filters.get(EntityTypes.TEZ_DAG_ID.name()).contains(tezDAGID.toString()));
    Assert.assertTrue(filters.get(EntityTypes.TEZ_VERTEX_ID.name()).contains(tezVertexID.toString()));
    Assert.assertTrue(filters.get(EntityTypes.TEZ_TASK_ID.name()).contains(tezTaskID.toString()));
    Assert.assertTrue(filters.get(ATSConstants.STATUS).contains(attemptState.toString()));

    // Exactly one timeline event, stamped with the finish time.
    Assert.assertEquals(1, entity.getEvents().size());
    final TimelineEvent timelineEvent = entity.getEvents().get(0);
    Assert.assertEquals(HistoryEventType.TASK_ATTEMPT_FINISHED.name(), timelineEvent.getEventType());
    Assert.assertEquals(finishedAt, timelineEvent.getTimestamp());

    // Other info.
    final Map<String, Object> info = entity.getOtherInfo();
    Assert.assertEquals(17, info.size());
    Assert.assertEquals(tezTaskAttemptID.toString(), info.get(ATSConstants.CREATION_CAUSAL_ATTEMPT));
    Assert.assertEquals(createdAt, info.get(ATSConstants.CREATION_TIME));
    Assert.assertEquals(allocatedAt, info.get(ATSConstants.ALLOCATION_TIME));
    Assert.assertEquals(startedAt, info.get(ATSConstants.START_TIME));
    Assert.assertEquals(finishedAt, info.get(ATSConstants.FINISH_TIME));
    Assert.assertEquals(finishedAt - startedAt, info.get(ATSConstants.TIME_TAKEN));
    Assert.assertEquals(attemptState.name(), info.get(ATSConstants.STATUS));
    Assert.assertEquals(TaskFailureType.FATAL.name(), info.get(ATSConstants.TASK_FAILURE_TYPE));
    Assert.assertEquals(cause.name(), info.get(ATSConstants.TASK_ATTEMPT_ERROR_ENUM));
    Assert.assertEquals(diag, info.get(ATSConstants.DIAGNOSTICS));

    // Last data events are nested: a map holding a list of per-event maps.
    final Map<String, Object> lastDataEventsHolder = (Map<String, Object>) info.get(ATSConstants.LAST_DATA_EVENTS);
    final List<Object> lastDataEvents = (List<Object>) lastDataEventsHolder.get(ATSConstants.LAST_DATA_EVENTS);
    Assert.assertEquals(2, lastDataEvents.size());
    final Map<String, Object> firstDataEvent = (Map<String, Object>) lastDataEvents.get(0);
    Assert.assertEquals(dataEvents.get(0).getTimestamp(), firstDataEvent.get(ATSConstants.TIMESTAMP));

    Assert.assertTrue(info.containsKey(ATSConstants.COUNTERS));
    Assert.assertEquals("inProgressURL", info.get(ATSConstants.IN_PROGRESS_LOGS_URL));
    Assert.assertEquals("logsURL", info.get(ATSConstants.COMPLETED_LOGS_URL));
    Assert.assertEquals(nodeId.toString(), info.get(ATSConstants.NODE_ID));
    Assert.assertEquals(containerId.toString(), info.get(ATSConstants.CONTAINER_ID));
    Assert.assertEquals("nodeHttpAddress", info.get(ATSConstants.NODE_HTTP_ADDRESS));

    // --- conversion with a null failure type -----------------------------------------------
    final TaskAttemptFinishedEvent noFailureTypeEvent = new TaskAttemptFinishedEvent(tezTaskAttemptID, vertex, startedAt, finishedAt, attemptState, null, cause, diag, tezCounters, dataEvents, null, createdAt, tezTaskAttemptID, allocatedAt, containerId, nodeId, "inProgressURL", "logsURL", "nodeHttpAddress");
    final List<TimelineEntity> convertedNoFailureType = HistoryEventTimelineConversion.convertToTimelineEntities(noFailureTypeEvent);
    Assert.assertEquals(1, convertedNoFailureType.size());
    final TimelineEntity entityNoFailureType = convertedNoFailureType.get(0);
    Assert.assertNull(entityNoFailureType.getOtherInfo().get(ATSConstants.TASK_FAILURE_TYPE));
}
Also used : TimelineEvent(org.apache.hadoop.yarn.api.records.timeline.TimelineEvent) Set(java.util.Set) TimelineEntity(org.apache.hadoop.yarn.api.records.timeline.TimelineEntity) TezCounters(org.apache.tez.common.counters.TezCounters) DataEventDependencyInfo(org.apache.tez.dag.app.dag.impl.TaskAttemptImpl.DataEventDependencyInfo) TaskAttemptState(org.apache.tez.dag.api.oldrecords.TaskAttemptState) List(java.util.List) TaskAttemptTerminationCause(org.apache.tez.dag.records.TaskAttemptTerminationCause) TaskAttemptFinishedEvent(org.apache.tez.dag.history.events.TaskAttemptFinishedEvent) Map(java.util.Map) HashMap(java.util.HashMap) Test(org.junit.Test)

Aggregations

TaskAttemptTerminationCause (org.apache.tez.dag.records.TaskAttemptTerminationCause)5 TaskAttemptFinishedEvent (org.apache.tez.dag.history.events.TaskAttemptFinishedEvent)2 ArrayList (java.util.ArrayList)1 HashMap (java.util.HashMap)1 List (java.util.List)1 Map (java.util.Map)1 Set (java.util.Set)1 ContainerId (org.apache.hadoop.yarn.api.records.ContainerId)1 TimelineEntity (org.apache.hadoop.yarn.api.records.timeline.TimelineEntity)1 TimelineEvent (org.apache.hadoop.yarn.api.records.timeline.TimelineEvent)1 TezCounters (org.apache.tez.common.counters.TezCounters)1 TaskLocationHint (org.apache.tez.dag.api.TaskLocationHint)1 TezUncheckedException (org.apache.tez.dag.api.TezUncheckedException)1 TaskAttemptState (org.apache.tez.dag.api.oldrecords.TaskAttemptState)1 Task (org.apache.tez.dag.app.dag.Task)1 DAGAppMasterEventType (org.apache.tez.dag.app.dag.event.DAGAppMasterEventType)1 TaskAttemptEvent (org.apache.tez.dag.app.dag.event.TaskAttemptEvent)1 TaskAttemptEventAttemptFailed (org.apache.tez.dag.app.dag.event.TaskAttemptEventAttemptFailed)1 TaskAttemptEventAttemptKilled (org.apache.tez.dag.app.dag.event.TaskAttemptEventAttemptKilled)1 TaskAttemptEventStatusUpdate (org.apache.tez.dag.app.dag.event.TaskAttemptEventStatusUpdate)1