Use of org.apache.tez.dag.app.dag.event.DAGAppMasterEventSchedulingServiceError in the Apache Tez project.
Class TestMockDAGAppMaster, method testSchedulerErrorHandling.
/**
 * Injects a scheduling-service error into a mock AM while a DAG is in flight and checks
 * that the AM's shutdown handler is invoked while the current DAG is still reported as
 * {@code RUNNING}.
 *
 * The polling loop has no exit of its own; the 10 second timeout on the annotation is
 * what bounds the test if shutdown is never invoked.
 */
@Test(timeout = 10000)
public void testSchedulerErrorHandling() throws Exception {
  // Start a mock client/AM pair built from a copy of the default configuration.
  MockTezClient client =
      new MockTezClient("testMockAM", new TezConfiguration(defaultConf), true, null, null, null, null);
  client.start();
  MockDAGAppMaster appMaster = client.getLocalClient().getMockApp();
  MockContainerLauncher launcher = appMaster.getContainerLauncher();
  launcher.startScheduling(false);

  // Submit a DAG with a single vertex "A" (parallelism 5) and wait for its containers.
  DAG singleVertexDag = DAG.create("testSchedulerErrorHandling");
  singleVertexDag.addVertex(Vertex.create("A", ProcessorDescriptor.create("Proc.class"), 5));
  client.submitDAG(singleVertexDag);
  launcher.waitTillContainersLaunched();

  // Hand the AM a scheduling-service error carrying a stringified mock exception.
  String mockErrorTrace =
      org.apache.hadoop.util.StringUtils.stringifyException(new RuntimeException("Mock error"));
  appMaster.handle(new DAGAppMasterEventSchedulingServiceError(mockErrorTrace));

  // Poll every 100 ms until the shutdown handler reports that shutdown was invoked.
  for (;;) {
    if (appMaster.getShutdownHandler().wasShutdownInvoked()) {
      break;
    }
    Thread.sleep(100);
  }

  // The DAG is expected to still be in the RUNNING state at this point.
  DAGState stateAfterShutdown = appMaster.getContext().getCurrentDAG().getState();
  Assert.assertEquals(DAGState.RUNNING, stateAfterShutdown);
}
Use of org.apache.tez.dag.app.dag.event.DAGAppMasterEventSchedulingServiceError in the Apache Tez project.
Class TaskSchedulerManager, method reportError.
/**
 * Handles an error reported by one of the configured task schedulers.
 *
 * <ul>
 *   <li>{@code YarnTaskSchedulerServiceError.RESOURCEMANAGER_ERROR}: logged at INFO; a
 *       {@link DAGAppMasterEventSchedulingServiceError} is sent only when the reporting
 *       scheduler's class name equals {@code yarnSchedulerClassName}.</li>
 *   <li>Any other error whose type is {@code ErrorType.PERMANENT}: logged at ERROR and sent
 *       as a {@code TASK_SCHEDULER_SERVICE_FATAL_ERROR} event (with no attached throwable).</li>
 *   <li>Everything else: delegated to {@code Utils.processNonFatalServiceErrorReport}.</li>
 * </ul>
 *
 * @param taskSchedulerIndex index of the reporting scheduler in {@code taskSchedulerDescriptors}
 * @param servicePluginError the error being reported
 * @param diagnostics free-form diagnostic text; may be {@code null}
 * @param dagInfo DAG information forwarded to the non-fatal error handler
 */
public void reportError(int taskSchedulerIndex, ServicePluginError servicePluginError, String diagnostics, DagInfo dagInfo) {
  // Every branch needs the scheduler's identifier string, so resolve it once.
  String schedulerId = Utils.getTaskSchedulerIdentifierString(taskSchedulerIndex, appContext);

  if (servicePluginError == YarnTaskSchedulerServiceError.RESOURCEMANAGER_ERROR) {
    // Pass the identifier and the diagnostics as two separate arguments so that both
    // "{}" placeholders are filled; a single pre-concatenated argument leaves the second
    // placeholder printed literally.
    LOG.info("Error reported by scheduler {} - {}", schedulerId, diagnostics);
    if (taskSchedulerDescriptors[taskSchedulerIndex].getClassName().equals(yarnSchedulerClassName)) {
      LOG.warn("Reporting a SchedulerServiceError to the DAGAppMaster since the error" + " was reported by the YARN task scheduler");
      sendEvent(new DAGAppMasterEventSchedulingServiceError(diagnostics));
    }
  } else if (servicePluginError.getErrorType() == ServicePluginError.ErrorType.PERMANENT) {
    String msg = "Fatal error reported by TaskScheduler" + ", scheduler=" + schedulerId + ", servicePluginError=" + servicePluginError + ", diagnostics= " + (diagnostics == null ? "" : diagnostics);
    LOG.error(msg);
    sendEvent(new DAGAppMasterEventUserServiceFatalError(DAGAppMasterEventType.TASK_SCHEDULER_SERVICE_FATAL_ERROR, msg, null));
  } else {
    Utils.processNonFatalServiceErrorReport(schedulerId, servicePluginError, diagnostics, dagInfo, appContext, "TaskScheduler");
  }
}
Use of org.apache.tez.dag.app.dag.event.DAGAppMasterEventSchedulingServiceError in the Apache Tez project.
Class DAGAppMaster, method handle.
/**
 * Handles a single DAGAppMaster-level event by switching on {@code event.getType()}.
 *
 * Covers scheduling-service errors, fatal errors from pluggable services, internal
 * errors, DAG completion (session and non-session), AM reboot, per-DAG cleanup and
 * new-DAG submission. The method is {@code synchronized} on this instance.
 *
 * @param event the event to handle; it is cast to a concrete event class per type
 * @throws TezUncheckedException if the event type has no case in the switch
 */
@VisibleForTesting
protected synchronized void handle(DAGAppMasterEvent event) {
String errDiagnostics;
switch(event.getType()) {
case SCHEDULING_SERVICE_ERROR:
// Scheduling error - probably an issue with the communication with the RM
// In this scenario, the AM should shutdown. Expectation is that the RM
// will restart a new AM attempt.
// Should not kill the current running DAG to ensure that on restart, we
// can recover it and continue.
DAGAppMasterEventSchedulingServiceError schedulingServiceErrorEvent = (DAGAppMasterEventSchedulingServiceError) event;
state = DAGAppMasterState.ERROR;
errDiagnostics = "Error in the TaskScheduler. Shutting down. ";
// The event's diagnostic info is recorded through addDiagnostic only.
addDiagnostic(errDiagnostics + "Error=" + schedulingServiceErrorEvent.getDiagnosticInfo());
// NOTE(review): only the fixed prefix is logged here, not the event's diagnostic
// info - confirm whether that omission is intentional.
LOG.error(errDiagnostics);
shutdownHandler.shutdown();
break;
case TASK_COMMUNICATOR_SERVICE_FATAL_ERROR:
case CONTAINER_LAUNCHER_SERVICE_FATAL_ERROR:
case TASK_SCHEDULER_SERVICE_FATAL_ERROR:
// A fatal error from the pluggable services. The AM cannot continue operation, and should
// be shutdown. The AM should not be restarted for recovery.
DAGAppMasterEventUserServiceFatalError usfe = (DAGAppMasterEventUserServiceFatalError) event;
Throwable error = usfe.getError();
// The message embeds the stack trace when an error is attached, "None" otherwise.
errDiagnostics = "Service Error: " + usfe.getDiagnosticInfo() + ", eventType=" + event.getType() + ", exception=" + (usfe.getError() == null ? "None" : ExceptionUtils.getStackTrace(usfe.getError()));
LOG.error(errDiagnostics, error);
addDiagnostic(errDiagnostics);
handleInternalError("Service error: " + event.getType(), errDiagnostics);
break;
case INTERNAL_ERROR:
handleInternalError("DAGAppMaster Internal Error occurred", "DAGAppMaster Internal Error occurred");
break;
case DAG_FINISHED:
DAGAppMasterEventDAGFinished finishEvt = (DAGAppMasterEventDAGFinished) event;
// Announce completion, with a timestamp, on both stderr and stdout.
String timeStamp = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(Calendar.getInstance().getTime());
System.err.println(timeStamp + " Completed Dag: " + finishEvt.getDAGId().toString());
System.out.println(timeStamp + " Completed Dag: " + finishEvt.getDAGId().toString());
if (!isSession) {
// Non-session mode: the AM shuts down once its DAG has completed.
LOG.info("Not a session, AM will unregister as DAG has completed");
this.taskSchedulerManager.setShouldUnregisterFlag();
_updateLoggers(currentDAG, "_post");
setStateOnDAGCompletion();
LOG.info("Shutting down on completion of dag:" + finishEvt.getDAGId().toString());
shutdownHandler.shutdown();
} else {
// Session mode: record the completion, then either wait for the next DAG or shut down.
LOG.info("DAG completed, dagId=" + finishEvt.getDAGId().toString() + ", dagState=" + finishEvt.getDAGState());
lastDAGCompletionTime = clock.getTime();
_updateLoggers(currentDAG, "_post");
// A failed recovery marks the session as stopped, which selects the shutdown path below.
if (this.historyEventHandler.hasRecoveryFailed()) {
String recoveryErrorMsg = "Recovery had a fatal error, shutting down session after" + " DAG completion";
LOG.warn(recoveryErrorMsg);
addDiagnostic(recoveryErrorMsg);
sessionStopped.set(true);
}
// Update the per-outcome counters; DAGs whose name starts with the prewarm prefix
// are not counted.
switch(finishEvt.getDAGState()) {
case SUCCEEDED:
if (!currentDAG.getName().startsWith(TezConstants.TEZ_PREWARM_DAG_NAME_PREFIX)) {
successfulDAGs.incrementAndGet();
}
break;
case FAILED:
if (!currentDAG.getName().startsWith(TezConstants.TEZ_PREWARM_DAG_NAME_PREFIX)) {
failedDAGs.incrementAndGet();
}
break;
case KILLED:
if (!currentDAG.getName().startsWith(TezConstants.TEZ_PREWARM_DAG_NAME_PREFIX)) {
killedDAGs.incrementAndGet();
}
break;
case ERROR:
if (!currentDAG.getName().startsWith(TezConstants.TEZ_PREWARM_DAG_NAME_PREFIX)) {
failedDAGs.incrementAndGet();
}
// This is a pass-through. Kill the AM if DAG state is ERROR.
default:
// Reached for ERROR (via the fall-through above) and for any state not listed above.
LOG.error("Received a DAG Finished Event with state=" + finishEvt.getDAGState() + ". Error. Shutting down.");
addDiagnostic("DAG completed with an ERROR state. Shutting down AM");
state = DAGAppMasterState.ERROR;
this.taskSchedulerManager.setShouldUnregisterFlag();
shutdownHandler.shutdown();
break;
}
// Skipped when the AM state is ERROR; the default branch above sets that state and
// has already requested shutdown.
if (!state.equals(DAGAppMasterState.ERROR)) {
if (!sessionStopped.get()) {
LOG.info("Central Dispatcher queue size after DAG completion, before cleanup: " + dispatcher.getQueueSize());
LOG.info("Waiting for next DAG to be submitted.");
// Sending this via the event queue, in case there are pending events which need to be
// processed. TaskKilled for example, or ContainerCompletions.
// The DAG needs to be part of the event, since the dag can get reset when the next
// dag is submitted. The next DAG, however, will not start executing till the cleanup
// is complete, since execution start is on the same dispatcher.
sendEvent(new DAGAppMasterEventDagCleanup(context.getCurrentDAG()));
// Leaving the taskSchedulerEventHandler here for now. Doesn't generate new events.
// However, eventually it needs to be moved out.
this.taskSchedulerManager.dagCompleted();
} else {
// Session was stopped: the final AM state is FAILED if recovery failed, SUCCEEDED otherwise.
LOG.info("Session shutting down now.");
this.taskSchedulerManager.setShouldUnregisterFlag();
if (this.historyEventHandler.hasRecoveryFailed()) {
state = DAGAppMasterState.FAILED;
} else {
state = DAGAppMasterState.SUCCEEDED;
}
shutdownHandler.shutdown();
}
}
}
// close all fs related caches
// Runs on both the session and non-session paths; a failure is only logged.
try {
FileSystem.closeAllForUGI(context.getCurrentDAG().getDagUGI());
} catch (IOException e) {
LOG.warn("Error occurred when trying to close FileSystem for userName " + context.getCurrentDAG().getDagUGI().getUserName(), e);
}
break;
case AM_REBOOT:
LOG.info("Received an AM_REBOOT signal");
this.state = DAGAppMasterState.KILLED;
// NOTE(review): the meaning of the boolean argument is not visible here - confirm
// against the shutdown handler.
shutdownHandler.shutdown(true);
break;
case DAG_CLEANUP:
DAGAppMasterEventDagCleanup cleanupEvent = (DAGAppMasterEventDagCleanup) event;
LOG.info("Cleaning up DAG: name=" + cleanupEvent.getDag().getName() + ", with id=" + cleanupEvent.getDag().getID());
// Tell each component that the DAG carried by the event is complete.
containerLauncherManager.dagComplete(cleanupEvent.getDag().getID(), jobTokenSecretManager);
taskCommunicatorManager.dagComplete(cleanupEvent.getDag());
nodes.dagComplete(cleanupEvent.getDag());
containers.dagComplete(cleanupEvent.getDag());
// Clear the ID caches.
TezTaskAttemptID.clearCache();
TezTaskID.clearCache();
TezVertexID.clearCache();
TezDAGID.clearCache();
LOG.info("Completed cleanup for DAG: name=" + cleanupEvent.getDag().getName() + ", with id=" + cleanupEvent.getDag().getID());
// Move to IDLE under idleStateLock and wake one thread waiting on that lock.
synchronized (idleStateLock) {
state = DAGAppMasterState.IDLE;
idleStateLock.notify();
}
break;
case NEW_DAG_SUBMITTED:
// Inform sub-components that a new DAG has been submitted.
taskSchedulerManager.dagSubmitted();
containerLauncherManager.dagSubmitted();
taskCommunicatorManager.dagSubmitted();
break;
default:
// Unknown event types are treated as a programming error.
throw new TezUncheckedException("AppMaster: No handler for event type: " + event.getType());
}
}
Aggregations