Search in sources :

Example 1 with DAGAppMasterEventDAGFinished

use of org.apache.tez.dag.app.dag.event.DAGAppMasterEventDAGFinished in project tez by apache.

The method finished of the class DAGImpl.

/**
 * Finalizes the DAG: records the finish time if it is not yet set, stops the
 * entity update tracker, refreshes CPU counters, logs the matching history
 * event, aborts outputs for any non-successful outcome and notifies the
 * event handler with a {@link DAGAppMasterEventDAGFinished}.
 *
 * <p>If collecting counters exceeds the configured limits, the final state is
 * downgraded to {@code DAGState.FAILED}. If writing the history event throws an
 * {@link IOException}, the finished event is sent with {@code DAGState.ERROR},
 * while the value returned to the caller remains the (possibly downgraded)
 * final state.
 *
 * @param finalState the state the DAG is finishing with
 * @return the final state, or {@code DAGState.FAILED} if the counter limit was exceeded
 */
private DAGState finished(DAGState finalState) {
    if (finishTime == 0) {
        setFinishTime();
    }
    entityUpdateTracker.stop();
    boolean recoveryError = false;
    // update cpu time counters before finishing the dag
    updateCpuCounters();
    TezCounters counters = null;
    try {
        counters = getAllCounters();
    } catch (LimitExceededException e) {
        // Counters stay null in this case; the DAG is reported as failed.
        addDiagnostic("Counters limit exceeded: " + e.getMessage());
        finalState = DAGState.FAILED;
    }
    try {
        if (finalState == DAGState.SUCCEEDED) {
            logJobHistoryFinishedEvent(counters);
        } else {
            logJobHistoryUnsuccesfulEvent(finalState, counters);
        }
    } catch (IOException e) {
        // Pass the exception to the logger so the cause and stack trace of the
        // failed history write are not lost.
        LOG.warn("Failed to persist recovery event for DAG completion" + ", dagId=" + dagId + ", finalState=" + finalState, e);
        recoveryError = true;
    }
    if (finalState != DAGState.SUCCEEDED) {
        abortOutputs();
    }
    // A failure to persist the completion event is reported as ERROR to the
    // event handler, regardless of the DAG's own final state.
    DAGState reportedState = recoveryError ? DAGState.ERROR : finalState;
    eventHandler.handle(new DAGAppMasterEventDAGFinished(getID(), reportedState));
    LOG.info("DAG: " + getID() + " finished with state: " + finalState);
    return finalState;
}
Also used : DAGAppMasterEventDAGFinished(org.apache.tez.dag.app.dag.event.DAGAppMasterEventDAGFinished) LimitExceededException(org.apache.tez.common.counters.LimitExceededException) IOException(java.io.IOException) TezCounters(org.apache.tez.common.counters.TezCounters)

Example 2 with DAGAppMasterEventDAGFinished

use of org.apache.tez.dag.app.dag.event.DAGAppMasterEventDAGFinished in project tez by apache.

The method handle of the class DAGAppMaster.

/**
 * Processes a single {@link DAGAppMasterEvent} by dispatching on its type.
 *
 * <p>The method is {@code synchronized}, so events are handled one at a time
 * per instance. Depending on the event type it records diagnostics, updates
 * the AM {@code state}, triggers shutdown through {@code shutdownHandler},
 * performs per-DAG cleanup, or notifies sub-components of a new DAG.
 *
 * @param event the event to process
 * @throws TezUncheckedException if the event type has no handler in this method
 */
@VisibleForTesting
protected synchronized void handle(DAGAppMasterEvent event) {
    String errDiagnostics;
    switch(event.getType()) {
        case SCHEDULING_SERVICE_ERROR:
            // Scheduling error - probably an issue with the communication with the RM
            // In this scenario, the AM should shutdown. Expectation is that the RM
            // will restart a new AM attempt.
            // Should not kill the current running DAG to ensure that on restart, we
            // can recover it and continue.
            DAGAppMasterEventSchedulingServiceError schedulingServiceErrorEvent = (DAGAppMasterEventSchedulingServiceError) event;
            state = DAGAppMasterState.ERROR;
            errDiagnostics = "Error in the TaskScheduler. Shutting down. ";
            addDiagnostic(errDiagnostics + "Error=" + schedulingServiceErrorEvent.getDiagnosticInfo());
            LOG.error(errDiagnostics);
            shutdownHandler.shutdown();
            break;
        case TASK_COMMUNICATOR_SERVICE_FATAL_ERROR:
        case CONTAINER_LAUNCHER_SERVICE_FATAL_ERROR:
        case TASK_SCHEDULER_SERVICE_FATAL_ERROR:
            // A fatal error from the pluggable services. The AM cannot continue operation, and should
            // be shutdown. The AM should not be restarted for recovery.
            DAGAppMasterEventUserServiceFatalError usfe = (DAGAppMasterEventUserServiceFatalError) event;
            Throwable error = usfe.getError();
            // The diagnostics string embeds the full stack trace (or "None" when no error is attached).
            errDiagnostics = "Service Error: " + usfe.getDiagnosticInfo() + ", eventType=" + event.getType() + ", exception=" + (usfe.getError() == null ? "None" : ExceptionUtils.getStackTrace(usfe.getError()));
            LOG.error(errDiagnostics, error);
            addDiagnostic(errDiagnostics);
            handleInternalError("Service error: " + event.getType(), errDiagnostics);
            break;
        case INTERNAL_ERROR:
            handleInternalError("DAGAppMaster Internal Error occurred", "DAGAppMaster Internal Error occurred");
            break;
        case DAG_FINISHED:
            DAGAppMasterEventDAGFinished finishEvt = (DAGAppMasterEventDAGFinished) event;
            // Announce the completion, with a timestamp, on both stderr and stdout.
            String timeStamp = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(Calendar.getInstance().getTime());
            System.err.println(timeStamp + " Completed Dag: " + finishEvt.getDAGId().toString());
            System.out.println(timeStamp + " Completed Dag: " + finishEvt.getDAGId().toString());
            if (!isSession) {
                // Non-session mode: shut the AM down as soon as the DAG completes.
                LOG.info("Not a session, AM will unregister as DAG has completed");
                this.taskSchedulerManager.setShouldUnregisterFlag();
                _updateLoggers(currentDAG, "_post");
                setStateOnDAGCompletion();
                LOG.info("Shutting down on completion of dag:" + finishEvt.getDAGId().toString());
                shutdownHandler.shutdown();
            } else {
                // Session mode: the AM may stay up for further DAGs unless an error
                // or a stopped session forces a shutdown below.
                LOG.info("DAG completed, dagId=" + finishEvt.getDAGId().toString() + ", dagState=" + finishEvt.getDAGState());
                lastDAGCompletionTime = clock.getTime();
                _updateLoggers(currentDAG, "_post");
                if (this.historyEventHandler.hasRecoveryFailed()) {
                    // Mark the session as stopped so that the session-shutdown branch
                    // further down is taken (unless the ERROR path shuts down first).
                    String recoveryErrorMsg = "Recovery had a fatal error, shutting down session after" + " DAG completion";
                    LOG.warn(recoveryErrorMsg);
                    addDiagnostic(recoveryErrorMsg);
                    sessionStopped.set(true);
                }
                // Update the per-outcome DAG counters. DAGs whose name starts with
                // TEZ_PREWARM_DAG_NAME_PREFIX are not counted.
                switch(finishEvt.getDAGState()) {
                    case SUCCEEDED:
                        if (!currentDAG.getName().startsWith(TezConstants.TEZ_PREWARM_DAG_NAME_PREFIX)) {
                            successfulDAGs.incrementAndGet();
                        }
                        break;
                    case FAILED:
                        if (!currentDAG.getName().startsWith(TezConstants.TEZ_PREWARM_DAG_NAME_PREFIX)) {
                            failedDAGs.incrementAndGet();
                        }
                        break;
                    case KILLED:
                        if (!currentDAG.getName().startsWith(TezConstants.TEZ_PREWARM_DAG_NAME_PREFIX)) {
                            killedDAGs.incrementAndGet();
                        }
                        break;
                    case ERROR:
                        // ERROR is counted as a failed DAG; there is intentionally no break here.
                        if (!currentDAG.getName().startsWith(TezConstants.TEZ_PREWARM_DAG_NAME_PREFIX)) {
                            failedDAGs.incrementAndGet();
                        }
                    // This is a pass-through. Kill the AM if DAG state is ERROR.
                    default:
                        LOG.error("Received a DAG Finished Event with state=" + finishEvt.getDAGState() + ". Error. Shutting down.");
                        addDiagnostic("DAG completed with an ERROR state. Shutting down AM");
                        state = DAGAppMasterState.ERROR;
                        this.taskSchedulerManager.setShouldUnregisterFlag();
                        shutdownHandler.shutdown();
                        break;
                }
                // When the AM state is ERROR (for example set by the default branch above),
                // shutdown has already been requested, so nothing further is done here.
                if (!state.equals(DAGAppMasterState.ERROR)) {
                    if (!sessionStopped.get()) {
                        LOG.info("Central Dispatcher queue size after DAG completion, before cleanup: " + dispatcher.getQueueSize());
                        LOG.info("Waiting for next DAG to be submitted.");
                        // Sending this via the event queue, in case there are pending events which need to be
                        // processed. TaskKilled for example, or ContainerCompletions.
                        // The DAG needs to be part of the event, since the dag can get reset when the next
                        // dag is submitted. The next DAG, however, will not start executing till the cleanup
                        // is complete, since execution start is on the same dispatcher.
                        sendEvent(new DAGAppMasterEventDagCleanup(context.getCurrentDAG()));
                        // Leaving the taskSchedulerEventHandler here for now. Doesn't generate new events.
                        // However, eventually it needs to be moved out.
                        this.taskSchedulerManager.dagCompleted();
                    } else {
                        LOG.info("Session shutting down now.");
                        this.taskSchedulerManager.setShouldUnregisterFlag();
                        // The final AM state reflects whether recovery failed.
                        if (this.historyEventHandler.hasRecoveryFailed()) {
                            state = DAGAppMasterState.FAILED;
                        } else {
                            state = DAGAppMasterState.SUCCEEDED;
                        }
                        shutdownHandler.shutdown();
                    }
                }
            }
            // close all fs related caches
            // This runs for both the session and the non-session path; a failure is only logged.
            try {
                FileSystem.closeAllForUGI(context.getCurrentDAG().getDagUGI());
            } catch (IOException e) {
                LOG.warn("Error occurred when trying to close FileSystem for userName " + context.getCurrentDAG().getDagUGI().getUserName(), e);
            }
            break;
        case AM_REBOOT:
            LOG.info("Received an AM_REBOOT signal");
            this.state = DAGAppMasterState.KILLED;
            shutdownHandler.shutdown(true);
            break;
        case DAG_CLEANUP:
            DAGAppMasterEventDagCleanup cleanupEvent = (DAGAppMasterEventDagCleanup) event;
            LOG.info("Cleaning up DAG: name=" + cleanupEvent.getDag().getName() + ", with id=" + cleanupEvent.getDag().getID());
            // Tell each sub-component that the DAG carried by the event is complete.
            containerLauncherManager.dagComplete(cleanupEvent.getDag().getID(), jobTokenSecretManager);
            taskCommunicatorManager.dagComplete(cleanupEvent.getDag());
            nodes.dagComplete(cleanupEvent.getDag());
            containers.dagComplete(cleanupEvent.getDag());
            // Clear the ID caches.
            TezTaskAttemptID.clearCache();
            TezTaskID.clearCache();
            TezVertexID.clearCache();
            TezDAGID.clearCache();
            LOG.info("Completed cleanup for DAG: name=" + cleanupEvent.getDag().getName() + ", with id=" + cleanupEvent.getDag().getID());
            // Move to IDLE under idleStateLock and wake one thread waiting on that lock.
            synchronized (idleStateLock) {
                state = DAGAppMasterState.IDLE;
                idleStateLock.notify();
            }
            break;
        case NEW_DAG_SUBMITTED:
            // Inform sub-components that a new DAG has been submitted.
            taskSchedulerManager.dagSubmitted();
            containerLauncherManager.dagSubmitted();
            taskCommunicatorManager.dagSubmitted();
            break;
        default:
            throw new TezUncheckedException("AppMaster: No handler for event type: " + event.getType());
    }
}
Also used : DAGAppMasterEventUserServiceFatalError(org.apache.tez.dag.app.dag.event.DAGAppMasterEventUserServiceFatalError) TezUncheckedException(org.apache.tez.dag.api.TezUncheckedException) DAGAppMasterEventDAGFinished(org.apache.tez.dag.app.dag.event.DAGAppMasterEventDAGFinished) DAGAppMasterEventDagCleanup(org.apache.tez.dag.app.dag.event.DAGAppMasterEventDagCleanup) IOException(java.io.IOException) DAGAppMasterEventSchedulingServiceError(org.apache.tez.dag.app.dag.event.DAGAppMasterEventSchedulingServiceError) SimpleDateFormat(java.text.SimpleDateFormat) VisibleForTesting(com.google.common.annotations.VisibleForTesting)

Aggregations

IOException (java.io.IOException)2 DAGAppMasterEventDAGFinished (org.apache.tez.dag.app.dag.event.DAGAppMasterEventDAGFinished)2 VisibleForTesting (com.google.common.annotations.VisibleForTesting)1 SimpleDateFormat (java.text.SimpleDateFormat)1 LimitExceededException (org.apache.tez.common.counters.LimitExceededException)1 TezCounters (org.apache.tez.common.counters.TezCounters)1 TezUncheckedException (org.apache.tez.dag.api.TezUncheckedException)1 DAGAppMasterEventDagCleanup (org.apache.tez.dag.app.dag.event.DAGAppMasterEventDagCleanup)1 DAGAppMasterEventSchedulingServiceError (org.apache.tez.dag.app.dag.event.DAGAppMasterEventSchedulingServiceError)1 DAGAppMasterEventUserServiceFatalError (org.apache.tez.dag.app.dag.event.DAGAppMasterEventUserServiceFatalError)1