Use of org.apache.tez.dag.app.dag.event.DAGAppMasterEventDAGFinished in the Apache Tez project.

Example 1: the finished method of the DAGImpl class, which emits the event to report the DAG's final state to the AppMaster.
private DAGState finished(DAGState finalState) {
  if (finishTime == 0) {
    setFinishTime();
  }
  entityUpdateTracker.stop();
  boolean recoveryError = false;

  // update cpu time counters before finishing the dag
  updateCpuCounters();
  TezCounters counters = null;
  try {
    counters = getAllCounters();
  } catch (LimitExceededException e) {
    addDiagnostic("Counters limit exceeded: " + e.getMessage());
    finalState = DAGState.FAILED;
  }

  try {
    if (finalState == DAGState.SUCCEEDED) {
      logJobHistoryFinishedEvent(counters);
    } else {
      logJobHistoryUnsuccesfulEvent(finalState, counters);
    }
  } catch (IOException e) {
    LOG.warn("Failed to persist recovery event for DAG completion"
        + ", dagId=" + dagId + ", finalState=" + finalState);
    recoveryError = true;
  }

  if (finalState != DAGState.SUCCEEDED) {
    abortOutputs();
  }

  if (recoveryError) {
    eventHandler.handle(new DAGAppMasterEventDAGFinished(getID(), DAGState.ERROR));
  } else {
    eventHandler.handle(new DAGAppMasterEventDAGFinished(getID(), finalState));
  }
  LOG.info("DAG: " + getID() + " finished with state: " + finalState);
  return finalState;
}
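For context, the shape of the event itself can be inferred from how the two examples construct and consume it. The following is a minimal sketch, not the actual Tez source: the superclass constructor, the DAGAppMasterEventType.DAG_FINISHED wiring, and the import paths are assumptions based on the constructor call above and the getters used in the handler below.

import org.apache.tez.dag.app.dag.DAGState;  // assumed import path
import org.apache.tez.dag.records.TezDAGID;  // assumed import path

// Sketch only: carries the finished DAG's id and final state, as used by
// DAGImpl.finished() (constructor) and DAGAppMaster.handle() (getters).
public class DAGAppMasterEventDAGFinished extends DAGAppMasterEvent {

  private final TezDAGID dagId;
  private final DAGState dagState;

  public DAGAppMasterEventDAGFinished(TezDAGID dagId, DAGState dagState) {
    super(DAGAppMasterEventType.DAG_FINISHED); // assumed superclass constructor
    this.dagId = dagId;
    this.dagState = dagState;
  }

  public TezDAGID getDAGId() {
    return dagId;
  }

  public DAGState getDAGState() {
    return dagState;
  }
}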
Use of org.apache.tez.dag.app.dag.event.DAGAppMasterEventDAGFinished in the Apache Tez project.

Example 2: the handle method of the DAGAppMaster class, which consumes the event (case DAG_FINISHED) to update the DAG completion counters and decide whether to wait for the next DAG or shut down.
@VisibleForTesting
protected synchronized void handle(DAGAppMasterEvent event) {
  String errDiagnostics;
  switch (event.getType()) {
    case SCHEDULING_SERVICE_ERROR:
      // Scheduling error - probably an issue with the communication with the RM
      // In this scenario, the AM should shutdown. Expectation is that the RM
      // will restart a new AM attempt.
      // Should not kill the current running DAG to ensure that on restart, we
      // can recover it and continue.
      DAGAppMasterEventSchedulingServiceError schedulingServiceErrorEvent =
          (DAGAppMasterEventSchedulingServiceError) event;
      state = DAGAppMasterState.ERROR;
      errDiagnostics = "Error in the TaskScheduler. Shutting down. ";
      addDiagnostic(errDiagnostics + "Error=" + schedulingServiceErrorEvent.getDiagnosticInfo());
      LOG.error(errDiagnostics);
      shutdownHandler.shutdown();
      break;
    case TASK_COMMUNICATOR_SERVICE_FATAL_ERROR:
    case CONTAINER_LAUNCHER_SERVICE_FATAL_ERROR:
    case TASK_SCHEDULER_SERVICE_FATAL_ERROR:
      // A fatal error from the pluggable services. The AM cannot continue operation, and should
      // be shutdown. The AM should not be restarted for recovery.
      DAGAppMasterEventUserServiceFatalError usfe = (DAGAppMasterEventUserServiceFatalError) event;
      Throwable error = usfe.getError();
      errDiagnostics = "Service Error: " + usfe.getDiagnosticInfo()
          + ", eventType=" + event.getType()
          + ", exception=" + (usfe.getError() == null ? "None"
              : ExceptionUtils.getStackTrace(usfe.getError()));
      LOG.error(errDiagnostics, error);
      addDiagnostic(errDiagnostics);
      handleInternalError("Service error: " + event.getType(), errDiagnostics);
      break;
    case INTERNAL_ERROR:
      handleInternalError("DAGAppMaster Internal Error occurred",
          "DAGAppMaster Internal Error occurred");
      break;
    case DAG_FINISHED:
      DAGAppMasterEventDAGFinished finishEvt = (DAGAppMasterEventDAGFinished) event;
      String timeStamp = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
          .format(Calendar.getInstance().getTime());
      System.err.println(timeStamp + " Completed Dag: " + finishEvt.getDAGId().toString());
      System.out.println(timeStamp + " Completed Dag: " + finishEvt.getDAGId().toString());
      if (!isSession) {
        LOG.info("Not a session, AM will unregister as DAG has completed");
        this.taskSchedulerManager.setShouldUnregisterFlag();
        _updateLoggers(currentDAG, "_post");
        setStateOnDAGCompletion();
        LOG.info("Shutting down on completion of dag:" + finishEvt.getDAGId().toString());
        shutdownHandler.shutdown();
      } else {
        LOG.info("DAG completed, dagId=" + finishEvt.getDAGId().toString()
            + ", dagState=" + finishEvt.getDAGState());
        lastDAGCompletionTime = clock.getTime();
        _updateLoggers(currentDAG, "_post");
        if (this.historyEventHandler.hasRecoveryFailed()) {
          String recoveryErrorMsg = "Recovery had a fatal error, shutting down session after"
              + " DAG completion";
          LOG.warn(recoveryErrorMsg);
          addDiagnostic(recoveryErrorMsg);
          sessionStopped.set(true);
        }
        switch (finishEvt.getDAGState()) {
          case SUCCEEDED:
            if (!currentDAG.getName().startsWith(TezConstants.TEZ_PREWARM_DAG_NAME_PREFIX)) {
              successfulDAGs.incrementAndGet();
            }
            break;
          case FAILED:
            if (!currentDAG.getName().startsWith(TezConstants.TEZ_PREWARM_DAG_NAME_PREFIX)) {
              failedDAGs.incrementAndGet();
            }
            break;
          case KILLED:
            if (!currentDAG.getName().startsWith(TezConstants.TEZ_PREWARM_DAG_NAME_PREFIX)) {
              killedDAGs.incrementAndGet();
            }
            break;
          case ERROR:
            if (!currentDAG.getName().startsWith(TezConstants.TEZ_PREWARM_DAG_NAME_PREFIX)) {
              failedDAGs.incrementAndGet();
            }
            // This is a pass-through. Kill the AM if DAG state is ERROR.
          default:
            LOG.error("Received a DAG Finished Event with state=" + finishEvt.getDAGState()
                + ". Error. Shutting down.");
            addDiagnostic("DAG completed with an ERROR state. Shutting down AM");
            state = DAGAppMasterState.ERROR;
            this.taskSchedulerManager.setShouldUnregisterFlag();
            shutdownHandler.shutdown();
            break;
        }
        if (!state.equals(DAGAppMasterState.ERROR)) {
          if (!sessionStopped.get()) {
            LOG.info("Central Dispatcher queue size after DAG completion, before cleanup: "
                + dispatcher.getQueueSize());
            LOG.info("Waiting for next DAG to be submitted.");
            // Sending this via the event queue, in case there are pending events which need to be
            // processed. TaskKilled for example, or ContainerCompletions.
            // The DAG needs to be part of the event, since the dag can get reset when the next
            // dag is submitted. The next DAG, however, will not start executing till the cleanup
            // is complete, since execution start is on the same dispatcher.
            sendEvent(new DAGAppMasterEventDagCleanup(context.getCurrentDAG()));
            // Leaving the taskSchedulerEventHandler here for now. Doesn't generate new events.
            // However, eventually it needs to be moved out.
            this.taskSchedulerManager.dagCompleted();
          } else {
            LOG.info("Session shutting down now.");
            this.taskSchedulerManager.setShouldUnregisterFlag();
            if (this.historyEventHandler.hasRecoveryFailed()) {
              state = DAGAppMasterState.FAILED;
            } else {
              state = DAGAppMasterState.SUCCEEDED;
            }
            shutdownHandler.shutdown();
          }
        }
      }
      // close all fs related caches
      try {
        FileSystem.closeAllForUGI(context.getCurrentDAG().getDagUGI());
      } catch (IOException e) {
        LOG.warn("Error occurred when trying to close FileSystem for userName "
            + context.getCurrentDAG().getDagUGI().getUserName(), e);
      }
      break;
    case AM_REBOOT:
      LOG.info("Received an AM_REBOOT signal");
      this.state = DAGAppMasterState.KILLED;
      shutdownHandler.shutdown(true);
      break;
    case DAG_CLEANUP:
      DAGAppMasterEventDagCleanup cleanupEvent = (DAGAppMasterEventDagCleanup) event;
      LOG.info("Cleaning up DAG: name=" + cleanupEvent.getDag().getName()
          + ", with id=" + cleanupEvent.getDag().getID());
      containerLauncherManager.dagComplete(cleanupEvent.getDag().getID(), jobTokenSecretManager);
      taskCommunicatorManager.dagComplete(cleanupEvent.getDag());
      nodes.dagComplete(cleanupEvent.getDag());
      containers.dagComplete(cleanupEvent.getDag());
      TezTaskAttemptID.clearCache();
      TezTaskID.clearCache();
      TezVertexID.clearCache();
      TezDAGID.clearCache();
      LOG.info("Completed cleanup for DAG: name=" + cleanupEvent.getDag().getName()
          + ", with id=" + cleanupEvent.getDag().getID());
      synchronized (idleStateLock) {
        state = DAGAppMasterState.IDLE;
        idleStateLock.notify();
      }
      break;
    case NEW_DAG_SUBMITTED:
      // Inform sub-components that a new DAG has been submitted.
      taskSchedulerManager.dagSubmitted();
      containerLauncherManager.dagSubmitted();
      taskCommunicatorManager.dagSubmitted();
      break;
    default:
      throw new TezUncheckedException("AppMaster: No handler for event type: " + event.getType());
  }
}
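Taken together, the two examples form a producer/consumer pair: DAGImpl.finished() emits the event through the AM's event handler, and DAGAppMaster.handle() reacts to the DAG_FINISHED case. The snippet below sketches that hand-off in isolation; the direct handler wiring, the import paths, and the TezDAGID factory call are assumptions for illustration (in Tez the event actually travels through the AM's central dispatcher before reaching the protected handle method).

import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.event.EventHandler;
import org.apache.tez.dag.app.dag.DAGState;
import org.apache.tez.dag.app.dag.event.DAGAppMasterEventDAGFinished;
import org.apache.tez.dag.records.TezDAGID;

public class DagFinishedHandOffSketch {

  public static void main(String[] args) {
    // Producer side, as in DAGImpl.finished(): wrap the DAG id and its final state.
    ApplicationId appId = ApplicationId.newInstance(0L, 1);
    TezDAGID dagId = TezDAGID.getInstance(appId, 1); // assumed TezDAGID factory

    // Stand-in for the AM side: in the real code the event reaches
    // DAGAppMaster.handle() via the central dispatcher, not a direct call.
    EventHandler<DAGAppMasterEventDAGFinished> handler =
        event -> System.out.println("Completed Dag: " + event.getDAGId()
            + ", dagState=" + event.getDAGState());

    handler.handle(new DAGAppMasterEventDAGFinished(dagId, DAGState.SUCCEEDED));
  }
}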