Search in sources :

Example 21 with DAGAppMasterEventUserServiceFatalError

use of org.apache.tez.dag.app.dag.event.DAGAppMasterEventUserServiceFatalError in project tez by apache.

The handle method of the DAGAppMaster class.

/**
 * Processes a single {@link DAGAppMasterEvent}, dispatching on its type.
 *
 * <ul>
 *   <li>{@code SCHEDULING_SERVICE_ERROR}: moves the AM to {@code ERROR}, records a diagnostic
 *       and triggers shutdown.</li>
 *   <li>{@code TASK_COMMUNICATOR_SERVICE_FATAL_ERROR}, {@code CONTAINER_LAUNCHER_SERVICE_FATAL_ERROR},
 *       {@code TASK_SCHEDULER_SERVICE_FATAL_ERROR}: records a diagnostic containing the stack
 *       trace (if any) and delegates to {@code handleInternalError}.</li>
 *   <li>{@code INTERNAL_ERROR}: delegates to {@code handleInternalError}.</li>
 *   <li>{@code DAG_FINISHED}: see {@code processDagFinishedEvent}.</li>
 *   <li>{@code AM_REBOOT}: moves the AM to {@code KILLED} and calls {@code shutdown(true)}.</li>
 *   <li>{@code DAG_CLEANUP}: notifies the sub-components that the DAG is complete, clears the
 *       Tez ID caches and moves the AM to {@code IDLE}.</li>
 *   <li>{@code NEW_DAG_SUBMITTED}: notifies the sub-components of the new DAG.</li>
 * </ul>
 *
 * @param event the event to process
 * @throws TezUncheckedException if there is no handler for the event's type
 */
@VisibleForTesting
protected synchronized void handle(DAGAppMasterEvent event) {
    String errDiagnostics;
    switch (event.getType()) {
        case SCHEDULING_SERVICE_ERROR:
            // Scheduling error - probably an issue with the communication with the RM
            // In this scenario, the AM should shutdown. Expectation is that the RM
            // will restart a new AM attempt.
            // Should not kill the current running DAG to ensure that on restart, we
            // can recover it and continue.
            DAGAppMasterEventSchedulingServiceError schedulingServiceErrorEvent = (DAGAppMasterEventSchedulingServiceError) event;
            state = DAGAppMasterState.ERROR;
            // Build the message once so that the log carries the same detail as the
            // recorded diagnostic (previously the log line omitted the actual error).
            errDiagnostics = "Error in the TaskScheduler. Shutting down. " + "Error=" + schedulingServiceErrorEvent.getDiagnosticInfo();
            addDiagnostic(errDiagnostics);
            LOG.error(errDiagnostics);
            shutdownHandler.shutdown();
            break;
        case TASK_COMMUNICATOR_SERVICE_FATAL_ERROR:
        case CONTAINER_LAUNCHER_SERVICE_FATAL_ERROR:
        case TASK_SCHEDULER_SERVICE_FATAL_ERROR:
            // A fatal error from the pluggable services. The AM cannot continue operation, and should
            // be shutdown. The AM should not be restarted for recovery.
            DAGAppMasterEventUserServiceFatalError usfe = (DAGAppMasterEventUserServiceFatalError) event;
            Throwable error = usfe.getError();
            errDiagnostics = "Service Error: " + usfe.getDiagnosticInfo() + ", eventType=" + event.getType() + ", exception=" + (error == null ? "None" : ExceptionUtils.getStackTrace(error));
            LOG.error(errDiagnostics, error);
            addDiagnostic(errDiagnostics);
            handleInternalError("Service error: " + event.getType(), errDiagnostics);
            break;
        case INTERNAL_ERROR:
            handleInternalError("DAGAppMaster Internal Error occurred", "DAGAppMaster Internal Error occurred");
            break;
        case DAG_FINISHED:
            processDagFinishedEvent((DAGAppMasterEventDAGFinished) event);
            break;
        case AM_REBOOT:
            LOG.info("Received an AM_REBOOT signal");
            this.state = DAGAppMasterState.KILLED;
            shutdownHandler.shutdown(true);
            break;
        case DAG_CLEANUP:
            DAGAppMasterEventDagCleanup cleanupEvent = (DAGAppMasterEventDagCleanup) event;
            LOG.info("Cleaning up DAG: name=" + cleanupEvent.getDag().getName() + ", with id=" + cleanupEvent.getDag().getID());
            containerLauncherManager.dagComplete(cleanupEvent.getDag().getID(), jobTokenSecretManager);
            taskCommunicatorManager.dagComplete(cleanupEvent.getDag());
            nodes.dagComplete(cleanupEvent.getDag());
            containers.dagComplete(cleanupEvent.getDag());
            TezTaskAttemptID.clearCache();
            TezTaskID.clearCache();
            TezVertexID.clearCache();
            TezDAGID.clearCache();
            LOG.info("Completed cleanup for DAG: name=" + cleanupEvent.getDag().getName() + ", with id=" + cleanupEvent.getDag().getID());
            // Publish the IDLE state under idleStateLock and wake one thread waiting on it.
            synchronized (idleStateLock) {
                state = DAGAppMasterState.IDLE;
                idleStateLock.notify();
            }
            break;
        case NEW_DAG_SUBMITTED:
            // Inform sub-components that a new DAG has been submitted.
            taskSchedulerManager.dagSubmitted();
            containerLauncherManager.dagSubmitted();
            taskCommunicatorManager.dagSubmitted();
            break;
        default:
            throw new TezUncheckedException("AppMaster: No handler for event type: " + event.getType());
    }
}

/**
 * Handles a {@code DAG_FINISHED} event. Must be called while holding this object's monitor
 * (it is invoked only from the synchronized {@code handle} method).
 *
 * <p>In non-session mode the AM flags itself for unregistration, sets its final state via
 * {@code setStateOnDAGCompletion()} and shuts down. In session mode the per-outcome DAG
 * counters are updated (DAGs whose name starts with the prewarm prefix are not counted);
 * the AM then either queues a cleanup event and waits for the next DAG, or shuts down if the
 * session has been stopped or the DAG ended in an unexpected/ERROR state. In both modes the
 * FileSystem instances cached for the DAG's UGI are closed afterwards.
 *
 * @param finishEvt the DAG-finished event
 */
private void processDagFinishedEvent(DAGAppMasterEventDAGFinished finishEvt) {
    String dagId = finishEvt.getDAGId().toString();
    String timeStamp = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(Calendar.getInstance().getTime());
    System.err.println(timeStamp + " Completed Dag: " + dagId);
    System.out.println(timeStamp + " Completed Dag: " + dagId);
    if (!isSession) {
        LOG.info("Not a session, AM will unregister as DAG has completed");
        this.taskSchedulerManager.setShouldUnregisterFlag();
        _updateLoggers(currentDAG, "_post");
        setStateOnDAGCompletion();
        LOG.info("Shutting down on completion of dag:" + dagId);
        shutdownHandler.shutdown();
    } else {
        LOG.info("DAG completed, dagId=" + dagId + ", dagState=" + finishEvt.getDAGState());
        lastDAGCompletionTime = clock.getTime();
        _updateLoggers(currentDAG, "_post");
        if (this.historyEventHandler.hasRecoveryFailed()) {
            String recoveryErrorMsg = "Recovery had a fatal error, shutting down session after" + " DAG completion";
            LOG.warn(recoveryErrorMsg);
            addDiagnostic(recoveryErrorMsg);
            sessionStopped.set(true);
        }
        switch (finishEvt.getDAGState()) {
            case SUCCEEDED:
                if (!isCurrentDagPrewarm()) {
                    successfulDAGs.incrementAndGet();
                }
                break;
            case FAILED:
                if (!isCurrentDagPrewarm()) {
                    failedDAGs.incrementAndGet();
                }
                break;
            case KILLED:
                if (!isCurrentDagPrewarm()) {
                    killedDAGs.incrementAndGet();
                }
                break;
            case ERROR:
                if (!isCurrentDagPrewarm()) {
                    failedDAGs.incrementAndGet();
                }
            // This is a pass-through. Kill the AM if DAG state is ERROR.
            default:
                LOG.error("Received a DAG Finished Event with state=" + finishEvt.getDAGState() + ". Error. Shutting down.");
                addDiagnostic("DAG completed with an ERROR state. Shutting down AM");
                state = DAGAppMasterState.ERROR;
                this.taskSchedulerManager.setShouldUnregisterFlag();
                shutdownHandler.shutdown();
                break;
        }
        if (!state.equals(DAGAppMasterState.ERROR)) {
            if (!sessionStopped.get()) {
                LOG.info("Central Dispatcher queue size after DAG completion, before cleanup: " + dispatcher.getQueueSize());
                LOG.info("Waiting for next DAG to be submitted.");
                // Sending this via the event queue, in case there are pending events which need to be
                // processed. TaskKilled for example, or ContainerCompletions.
                // The DAG needs to be part of the event, since the dag can get reset when the next
                // dag is submitted. The next DAG, however, will not start executing till the cleanup
                // is complete, since execution start is on the same dispatcher.
                sendEvent(new DAGAppMasterEventDagCleanup(context.getCurrentDAG()));
                // Leaving the taskSchedulerEventHandler here for now. Doesn't generate new events.
                // However, eventually it needs to be moved out.
                this.taskSchedulerManager.dagCompleted();
            } else {
                LOG.info("Session shutting down now.");
                this.taskSchedulerManager.setShouldUnregisterFlag();
                if (this.historyEventHandler.hasRecoveryFailed()) {
                    state = DAGAppMasterState.FAILED;
                } else {
                    state = DAGAppMasterState.SUCCEEDED;
                }
                shutdownHandler.shutdown();
            }
        }
    }
    // close all fs related caches
    try {
        FileSystem.closeAllForUGI(context.getCurrentDAG().getDagUGI());
    } catch (IOException e) {
        // Best effort: a failure to close the cached FileSystems is logged and otherwise ignored.
        LOG.warn("Error occurred when trying to close FileSystem for userName " + context.getCurrentDAG().getDagUGI().getUserName(), e);
    }
}

/**
 * Tells whether the current DAG's name starts with the prewarm DAG name prefix; such DAGs
 * are excluded from the succeeded/failed/killed DAG counters.
 *
 * @return {@code true} if the current DAG's name starts with
 *         {@code TezConstants.TEZ_PREWARM_DAG_NAME_PREFIX}
 */
private boolean isCurrentDagPrewarm() {
    return currentDAG.getName().startsWith(TezConstants.TEZ_PREWARM_DAG_NAME_PREFIX);
}
Also used : DAGAppMasterEventUserServiceFatalError(org.apache.tez.dag.app.dag.event.DAGAppMasterEventUserServiceFatalError) TezUncheckedException(org.apache.tez.dag.api.TezUncheckedException) DAGAppMasterEventDAGFinished(org.apache.tez.dag.app.dag.event.DAGAppMasterEventDAGFinished) DAGAppMasterEventDagCleanup(org.apache.tez.dag.app.dag.event.DAGAppMasterEventDagCleanup) IOException(java.io.IOException) DAGAppMasterEventSchedulingServiceError(org.apache.tez.dag.app.dag.event.DAGAppMasterEventSchedulingServiceError) SimpleDateFormat(java.text.SimpleDateFormat) VisibleForTesting(com.google.common.annotations.VisibleForTesting)

Aggregations

DAGAppMasterEventUserServiceFatalError (org.apache.tez.dag.app.dag.event.DAGAppMasterEventUserServiceFatalError)21 TezUncheckedException (org.apache.tez.dag.api.TezUncheckedException)12 TezException (org.apache.tez.dag.api.TezException)11 ContainerId (org.apache.hadoop.yarn.api.records.ContainerId)9 Event (org.apache.hadoop.yarn.event.Event)6 EventHandler (org.apache.hadoop.yarn.event.EventHandler)6 DagInfoImplForTest (org.apache.tez.dag.helpers.DagInfoImplForTest)6 Test (org.junit.Test)6 Configuration (org.apache.hadoop.conf.Configuration)5 IOException (java.io.IOException)4 NodeId (org.apache.hadoop.yarn.api.records.NodeId)4 TaskLocationHint (org.apache.tez.dag.api.TaskLocationHint)4 AppContext (org.apache.tez.dag.app.AppContext)4 DAG (org.apache.tez.dag.app.dag.DAG)4 TaskAttempt (org.apache.tez.dag.app.dag.TaskAttempt)4 InvocationTargetException (java.lang.reflect.InvocationTargetException)3 LinkedList (java.util.LinkedList)3 NamedEntityDescriptor (org.apache.tez.dag.api.NamedEntityDescriptor)3 DAGEventTerminateDag (org.apache.tez.dag.app.dag.event.DAGEventTerminateDag)3 ContainerLauncherLaunchRequestEvent (org.apache.tez.dag.app.rm.ContainerLauncherLaunchRequestEvent)3