Search in sources :

Example 1 with TezVertexID

use of org.apache.tez.dag.records.TezVertexID in project tez by apache.

the class TaskCommunicatorManager method heartbeat.

public TaskHeartbeatResponse heartbeat(TaskHeartbeatRequest request) throws IOException, TezException {
    ContainerId containerId = ConverterUtils.toContainerId(request.getContainerIdentifier());
    if (LOG.isDebugEnabled()) {
        LOG.debug("Received heartbeat from container" + ", request=" + request);
    }
    if (!registeredContainers.containsKey(containerId)) {
        LOG.warn("Received task heartbeat from unknown container with id: " + containerId + ", asking it to die");
        return RESPONSE_SHOULD_DIE;
    }
    // A heartbeat can come in anytime. The AM may have made a decision to kill a running task/container
    // meanwhile. If the decision is processed through the pipeline before the heartbeat is processed,
    // the heartbeat will be dropped. Otherwise the heartbeat will be processed - and the system
    // know how to handle this - via FailedInputEvents for example (relevant only if the heartbeat has events).
    // So - avoiding synchronization.
    pingContainerHeartbeatHandler(containerId);
    TaskAttemptEventInfo eventInfo = new TaskAttemptEventInfo(0, null, 0);
    TezTaskAttemptID taskAttemptID = request.getTaskAttemptId();
    if (taskAttemptID != null) {
        ContainerId containerIdFromMap = registeredAttempts.get(taskAttemptID);
        if (containerIdFromMap == null || !containerIdFromMap.equals(containerId)) {
            // This can happen when a task heartbeats. Meanwhile the container is unregistered.
            // The information will eventually make it through to the plugin via a corresponding unregister.
            // There's a race in that case between the unregister making it through, and this method returning.
            // TODO TEZ-2003 (post) TEZ-2666. An exception back is likely a better approach than sending a shouldDie = true,
            // so that the plugin can handle the scenario. Alternately augment the response with error codes.
            // Error codes would be better than exceptions.
            LOG.info("Attempt: " + taskAttemptID + " is not recognized for heartbeats");
            return RESPONSE_SHOULD_DIE;
        }
        List<TezEvent> inEvents = request.getEvents();
        if (LOG.isDebugEnabled()) {
            LOG.debug("Ping from " + taskAttemptID.toString() + " events: " + (inEvents != null ? inEvents.size() : -1));
        }
        long currTime = context.getClock().getTime();
        // taFinishedEvents - means the TaskAttemptFinishedEvent
        // taGeneratedEvents - for recovery, means the events generated by this task attempt and is needed by its downstream vertices
        // eventsForVertex - including all the taGeneratedEvents and other events such as INPUT_READ_ERROR_EVENT/INPUT_FAILED_EVENT
        // taGeneratedEvents is routed both to TaskAttempt & Vertex. Route to Vertex is for performance consideration
        // taFinishedEvents must be routed before taGeneratedEvents
        List<TezEvent> taFinishedEvents = new ArrayList<TezEvent>();
        List<TezEvent> taGeneratedEvents = new ArrayList<TezEvent>();
        List<TezEvent> eventsForVertex = new ArrayList<TezEvent>();
        TaskAttemptEventStatusUpdate taskAttemptEvent = null;
        boolean readErrorReported = false;
        for (TezEvent tezEvent : ListUtils.emptyIfNull(inEvents)) {
            // for now, set the event time on the AM when it is received.
            // this avoids any time disparity between machines.
            tezEvent.setEventReceivedTime(currTime);
            final EventType eventType = tezEvent.getEventType();
            if (eventType == EventType.TASK_STATUS_UPDATE_EVENT) {
                // send TA_STATUS_UPDATE before TA_DONE/TA_FAILED/TA_KILLED otherwise Status may be missed
                taskAttemptEvent = new TaskAttemptEventStatusUpdate(taskAttemptID, (TaskStatusUpdateEvent) tezEvent.getEvent());
            } else if (eventType == EventType.TASK_ATTEMPT_COMPLETED_EVENT || eventType == EventType.TASK_ATTEMPT_FAILED_EVENT || eventType == EventType.TASK_ATTEMPT_KILLED_EVENT) {
                taFinishedEvents.add(tezEvent);
            } else {
                if (eventType == EventType.INPUT_READ_ERROR_EVENT) {
                    readErrorReported = true;
                }
                if (eventType == EventType.DATA_MOVEMENT_EVENT || eventType == EventType.COMPOSITE_DATA_MOVEMENT_EVENT || eventType == EventType.ROOT_INPUT_INITIALIZER_EVENT || eventType == EventType.VERTEX_MANAGER_EVENT) {
                    taGeneratedEvents.add(tezEvent);
                }
                eventsForVertex.add(tezEvent);
            }
        }
        if (taskAttemptEvent != null) {
            taskAttemptEvent.setReadErrorReported(readErrorReported);
            sendEvent(taskAttemptEvent);
        }
        // route taGeneratedEvents to TaskAttempt
        if (!taGeneratedEvents.isEmpty()) {
            sendEvent(new TaskAttemptEventTezEventUpdate(taskAttemptID, taGeneratedEvents));
        }
        // route events to TaskAttempt
        Preconditions.checkArgument(taFinishedEvents.size() <= 1, "Multiple TaskAttemptFinishedEvent");
        for (TezEvent e : taFinishedEvents) {
            EventMetaData sourceMeta = e.getSourceInfo();
            switch(e.getEventType()) {
                case TASK_ATTEMPT_FAILED_EVENT:
                case TASK_ATTEMPT_KILLED_EVENT:
                    TaskAttemptTerminationCause errCause = null;
                    switch(sourceMeta.getEventGenerator()) {
                        case INPUT:
                            errCause = TaskAttemptTerminationCause.INPUT_READ_ERROR;
                            break;
                        case PROCESSOR:
                            errCause = TaskAttemptTerminationCause.APPLICATION_ERROR;
                            break;
                        case OUTPUT:
                            errCause = TaskAttemptTerminationCause.OUTPUT_WRITE_ERROR;
                            break;
                        case SYSTEM:
                            errCause = TaskAttemptTerminationCause.FRAMEWORK_ERROR;
                            break;
                        default:
                            throw new TezUncheckedException("Unknown EventProducerConsumerType: " + sourceMeta.getEventGenerator());
                    }
                    if (e.getEventType() == EventType.TASK_ATTEMPT_FAILED_EVENT) {
                        TaskAttemptFailedEvent taskFailedEvent = (TaskAttemptFailedEvent) e.getEvent();
                        sendEvent(new TaskAttemptEventAttemptFailed(sourceMeta.getTaskAttemptID(), TaskAttemptEventType.TA_FAILED, taskFailedEvent.getTaskFailureType(), "Error: " + taskFailedEvent.getDiagnostics(), errCause));
                    } else {
                        // Killed
                        TaskAttemptKilledEvent taskKilledEvent = (TaskAttemptKilledEvent) e.getEvent();
                        sendEvent(new TaskAttemptEventAttemptKilled(sourceMeta.getTaskAttemptID(), "Error: " + taskKilledEvent.getDiagnostics(), errCause));
                    }
                    break;
                case TASK_ATTEMPT_COMPLETED_EVENT:
                    sendEvent(new TaskAttemptEvent(sourceMeta.getTaskAttemptID(), TaskAttemptEventType.TA_DONE));
                    break;
                default:
                    throw new TezUncheckedException("Unhandled tez event type: " + e.getEventType());
            }
        }
        if (!eventsForVertex.isEmpty()) {
            TezVertexID vertexId = taskAttemptID.getTaskID().getVertexID();
            sendEvent(new VertexEventRouteEvent(vertexId, Collections.unmodifiableList(eventsForVertex)));
        }
        taskHeartbeatHandler.pinged(taskAttemptID);
        eventInfo = context.getCurrentDAG().getVertex(taskAttemptID.getTaskID().getVertexID()).getTaskAttemptTezEvents(taskAttemptID, request.getStartIndex(), request.getPreRoutedStartIndex(), request.getMaxEvents());
    }
    return new TaskHeartbeatResponse(false, eventInfo.getEvents(), eventInfo.getNextFromEventId(), eventInfo.getNextPreRoutedFromEventId());
}
Also used : TezUncheckedException(org.apache.tez.dag.api.TezUncheckedException) TaskAttemptEventStatusUpdate(org.apache.tez.dag.app.dag.event.TaskAttemptEventStatusUpdate) DAGAppMasterEventType(org.apache.tez.dag.app.dag.event.DAGAppMasterEventType) EventType(org.apache.tez.runtime.api.impl.EventType) TaskAttemptEventType(org.apache.tez.dag.app.dag.event.TaskAttemptEventType) ArrayList(java.util.ArrayList) TaskAttemptEvent(org.apache.tez.dag.app.dag.event.TaskAttemptEvent) TaskStatusUpdateEvent(org.apache.tez.runtime.api.events.TaskStatusUpdateEvent) VertexEventRouteEvent(org.apache.tez.dag.app.dag.event.VertexEventRouteEvent) TaskAttemptFailedEvent(org.apache.tez.runtime.api.events.TaskAttemptFailedEvent) TaskAttemptEventTezEventUpdate(org.apache.tez.dag.app.dag.event.TaskAttemptEventTezEventUpdate) ContainerId(org.apache.hadoop.yarn.api.records.ContainerId) TaskAttemptEventAttemptKilled(org.apache.tez.dag.app.dag.event.TaskAttemptEventAttemptKilled) TaskHeartbeatResponse(org.apache.tez.serviceplugins.api.TaskHeartbeatResponse) TezEvent(org.apache.tez.runtime.api.impl.TezEvent) TaskAttemptTerminationCause(org.apache.tez.dag.records.TaskAttemptTerminationCause) TaskAttemptKilledEvent(org.apache.tez.runtime.api.events.TaskAttemptKilledEvent) EventMetaData(org.apache.tez.runtime.api.impl.EventMetaData) TezVertexID(org.apache.tez.dag.records.TezVertexID) TezTaskAttemptID(org.apache.tez.dag.records.TezTaskAttemptID) TaskAttemptEventAttemptFailed(org.apache.tez.dag.app.dag.event.TaskAttemptEventAttemptFailed)

Example 2 with TezVertexID

use of org.apache.tez.dag.records.TezVertexID in project tez by apache.

the class DAGImpl method vertexSucceeded.

private boolean vertexSucceeded(Vertex vertex) {
    numSuccessfulVertices++;
    boolean recoveryFailed = false;
    if (!commitAllOutputsOnSuccess && isCommittable()) {
        // committing successful outputs immediately. check for shared outputs
        List<VertexGroupInfo> groupsList = vertexGroupInfo.get(vertex.getName());
        if (groupsList != null) {
            List<VertexGroupInfo> commitList = Lists.newArrayListWithCapacity(groupsList.size());
            for (VertexGroupInfo groupInfo : groupsList) {
                groupInfo.successfulMembers++;
                if (groupInfo.groupMembers.size() == groupInfo.successfulMembers && !groupInfo.outputs.isEmpty()) {
                    // group has outputs and all vertex members are done
                    LOG.info("All members of group: " + groupInfo.groupName + " are succeeded. Commiting outputs");
                    commitList.add(groupInfo);
                }
            }
            for (VertexGroupInfo groupInfo : commitList) {
                if (recoveryData != null && recoveryData.isVertexGroupCommitted(groupInfo.groupName)) {
                    LOG.info("VertexGroup was already committed as per recovery" + " data, groupName=" + groupInfo.groupName);
                    for (String vertexName : groupInfo.groupMembers) {
                        VertexRecoveryData vertexRecoveryData = recoveryData.getVertexRecoveryData(getVertex(vertexName).getVertexId());
                        Preconditions.checkArgument(vertexRecoveryData != null, "Vertex Group has been committed" + ", but no VertexRecoveryData found for its vertex " + vertexName);
                        VertexFinishedEvent vertexFinishedEvent = vertexRecoveryData.getVertexFinishedEvent();
                        Preconditions.checkArgument(vertexFinishedEvent != null, "Vertex Group has been committed" + ", but no VertexFinishedEvent found in its vertex " + vertexName);
                        Preconditions.checkArgument(vertexFinishedEvent.getState() == VertexState.SUCCEEDED, "Vertex Group has been committed, but unexpected vertex state of its vertex " + vertexName + ", vertexstate=" + vertexFinishedEvent.getState());
                    }
                    continue;
                }
                groupInfo.commitStarted = true;
                final Vertex v = getVertex(groupInfo.groupMembers.iterator().next());
                try {
                    Collection<TezVertexID> vertexIds = getVertexIds(groupInfo.groupMembers);
                    appContext.getHistoryHandler().handleCriticalEvent(new DAGHistoryEvent(getID(), new VertexGroupCommitStartedEvent(dagId, groupInfo.groupName, vertexIds, clock.getTime())));
                } catch (IOException e) {
                    LOG.error("Failed to send commit recovery event to handler", e);
                    recoveryFailed = true;
                }
                if (!recoveryFailed) {
                    for (final String outputName : groupInfo.outputs) {
                        OutputKey outputKey = new OutputKey(outputName, groupInfo.groupName, true);
                        CommitCallback groupCommitCallback = new CommitCallback(outputKey);
                        CallableEvent groupCommitCallableEvent = new CallableEvent(groupCommitCallback) {

                            public Void call() throws Exception {
                                OutputCommitter committer = v.getOutputCommitters().get(outputName);
                                LOG.info("Committing output: " + outputName);
                                commitOutput(committer);
                                return null;
                            }
                        };
                        ListenableFuture<Void> groupCommitFuture = appContext.getExecService().submit(groupCommitCallableEvent);
                        Futures.addCallback(groupCommitFuture, groupCommitCallableEvent.getCallback());
                        commitFutures.put(outputKey, groupCommitFuture);
                    }
                }
            }
        }
    }
    if (recoveryFailed) {
        LOG.info("Recovery failure occurred during commit");
        enactKill(DAGTerminationCause.RECOVERY_FAILURE, VertexTerminationCause.COMMIT_FAILURE);
    }
    return !recoveryFailed;
}
Also used : VertexEventRecoverVertex(org.apache.tez.dag.app.dag.event.VertexEventRecoverVertex) Vertex(org.apache.tez.dag.app.dag.Vertex) OutputCommitter(org.apache.tez.runtime.api.OutputCommitter) VertexGroupCommitStartedEvent(org.apache.tez.dag.history.events.VertexGroupCommitStartedEvent) DAGHistoryEvent(org.apache.tez.dag.history.DAGHistoryEvent) IOException(java.io.IOException) CallableEvent(org.apache.tez.dag.app.dag.event.CallableEvent) PlanVertexGroupInfo(org.apache.tez.dag.api.records.DAGProtos.PlanVertexGroupInfo) VertexRecoveryData(org.apache.tez.dag.app.RecoveryParser.VertexRecoveryData) VertexFinishedEvent(org.apache.tez.dag.history.events.VertexFinishedEvent) TezVertexID(org.apache.tez.dag.records.TezVertexID)

Example 3 with TezVertexID

use of org.apache.tez.dag.records.TezVertexID in project tez by apache.

the class DAGImpl method commitCompleted.

private boolean commitCompleted(DAGEventCommitCompleted commitCompletedEvent) {
    Preconditions.checkState(commitFutures.remove(commitCompletedEvent.getOutputKey()) != null, "Unknown commit:" + commitCompletedEvent.getOutputKey());
    boolean commitFailed = false;
    boolean recoveryFailed = false;
    if (commitCompletedEvent.isSucceeded()) {
        LOG.info("Commit succeeded for output:" + commitCompletedEvent.getOutputKey());
        OutputKey outputKey = commitCompletedEvent.getOutputKey();
        if (outputKey.isVertexGroupOutput) {
            VertexGroupInfo vertexGroup = vertexGroups.get(outputKey.getEntityName());
            vertexGroup.successfulCommits++;
            if (vertexGroup.isCommitted()) {
                if (!commitAllOutputsOnSuccess) {
                    try {
                        Collection<TezVertexID> vertexIds = getVertexIds(vertexGroup.groupMembers);
                        appContext.getHistoryHandler().handleCriticalEvent(new DAGHistoryEvent(getID(), new VertexGroupCommitFinishedEvent(getID(), commitCompletedEvent.getOutputKey().getEntityName(), vertexIds, clock.getTime())));
                    } catch (IOException e) {
                        String diag = "Failed to send commit recovery event to handler, " + ExceptionUtils.getStackTrace(e);
                        addDiagnostic(diag);
                        LOG.error(diag);
                        recoveryFailed = true;
                    }
                }
            }
        }
    } else {
        String diag = "Commit failed for output: " + commitCompletedEvent.getOutputKey() + ", " + ExceptionUtils.getStackTrace(commitCompletedEvent.getException());
        addDiagnostic(diag);
        LOG.error(diag);
        commitFailed = true;
    }
    if (commitFailed) {
        enactKill(DAGTerminationCause.COMMIT_FAILURE, VertexTerminationCause.OTHER_VERTEX_FAILURE);
        cancelCommits();
    }
    if (recoveryFailed) {
        enactKill(DAGTerminationCause.RECOVERY_FAILURE, VertexTerminationCause.OTHER_VERTEX_FAILURE);
        cancelCommits();
    }
    return !commitFailed && !recoveryFailed;
}
Also used : VertexGroupCommitFinishedEvent(org.apache.tez.dag.history.events.VertexGroupCommitFinishedEvent) PlanVertexGroupInfo(org.apache.tez.dag.api.records.DAGProtos.PlanVertexGroupInfo) DAGHistoryEvent(org.apache.tez.dag.history.DAGHistoryEvent) IOException(java.io.IOException) TezVertexID(org.apache.tez.dag.records.TezVertexID)

Example 4 with TezVertexID

use of org.apache.tez.dag.records.TezVertexID in project tez by apache.

the class StateChangeNotifier method unregisterForTaskSuccessUpdates.

public void unregisterForTaskSuccessUpdates(String vertexName, TaskStateUpdateListener listener) {
    TezVertexID vertexId = validateAndGetVertexId(vertexName);
    Preconditions.checkNotNull(listener, "listener cannot be null");
    taskWriteLock.lock();
    try {
        taskListeners.remove(vertexId, listener);
    } finally {
        taskWriteLock.unlock();
    }
}
Also used : TezVertexID(org.apache.tez.dag.records.TezVertexID)

Example 5 with TezVertexID

use of org.apache.tez.dag.records.TezVertexID in project tez by apache.

the class StateChangeNotifier method registerForVertexUpdates.

// -------------- VERTEX STATE CHANGE SECTION ---------------
public void registerForVertexUpdates(String vertexName, Set<org.apache.tez.dag.api.event.VertexState> stateSet, VertexStateUpdateListener listener) {
    TezVertexID vertexId = validateAndGetVertexId(vertexName);
    writeLock.lock();
    // Read within the lock, to ensure a consistent view is seen.
    try {
        List<VertexStateUpdate> previousUpdates = lastKnowStatesMap.get(vertexId);
        ListenerContainer listenerContainer = new ListenerContainer(listener, stateSet);
        Set<ListenerContainer> listenerContainers = vertexListeners.get(vertexId);
        if (listenerContainers == null || !listenerContainers.contains(listenerContainer)) {
            vertexListeners.put(vertexId, listenerContainer);
            // Sent from within the lock to avoid duplicate events, and out of order events.
            if (previousUpdates != null && !previousUpdates.isEmpty()) {
                for (VertexStateUpdate update : previousUpdates) {
                    listenerContainer.sendStateUpdate(update);
                }
            }
        } else {
            // Disallow multiple register calls.
            throw new TezUncheckedException("Only allowed to register once for a listener. CurrentContext: vertexName=" + vertexName + ", Listener: " + listener);
        }
    } finally {
        writeLock.unlock();
    }
}
Also used : VertexStateUpdate(org.apache.tez.dag.api.event.VertexStateUpdate) TezUncheckedException(org.apache.tez.dag.api.TezUncheckedException) TezVertexID(org.apache.tez.dag.records.TezVertexID)

Aggregations

TezVertexID (org.apache.tez.dag.records.TezVertexID)117 Test (org.junit.Test)64 TezDAGID (org.apache.tez.dag.records.TezDAGID)61 TezTaskAttemptID (org.apache.tez.dag.records.TezTaskAttemptID)54 TezConfiguration (org.apache.tez.dag.api.TezConfiguration)46 Configuration (org.apache.hadoop.conf.Configuration)43 TezTaskID (org.apache.tez.dag.records.TezTaskID)43 ApplicationId (org.apache.hadoop.yarn.api.records.ApplicationId)40 LocalResource (org.apache.hadoop.yarn.api.records.LocalResource)31 Resource (org.apache.hadoop.yarn.api.records.Resource)29 Container (org.apache.hadoop.yarn.api.records.Container)28 ClusterInfo (org.apache.tez.dag.app.ClusterInfo)28 TaskCommunicatorManagerInterface (org.apache.tez.dag.app.TaskCommunicatorManagerInterface)28 ContainerHeartbeatHandler (org.apache.tez.dag.app.ContainerHeartbeatHandler)27 AMContainerMap (org.apache.tez.dag.app.rm.container.AMContainerMap)27 ContainerContextMatcher (org.apache.tez.dag.app.rm.container.ContainerContextMatcher)27 ContainerId (org.apache.hadoop.yarn.api.records.ContainerId)24 TezEvent (org.apache.tez.runtime.api.impl.TezEvent)24 SystemClock (org.apache.hadoop.yarn.util.SystemClock)22 Vertex (org.apache.tez.dag.app.dag.Vertex)22