use of org.apache.tez.dag.app.dag.event.DAGEventRecoverEvent in project tez by apache.
the class TestDAGRecovery method testTaskRecoverFromKilled.
/**
 * Recovery input: a single TaskFinishedEvent carrying TaskState.KILLED for the task.
 * Expected outcome: the task is recovered into the internal KILLED state and the
 * owning vertex reports exactly one completed task.
 */
@Test(timeout = 5000)
public void testTaskRecoverFromKilled() {
  initMockDAGRecoveryDataForTask();

  // The only recovered record for the task is its KILLED finish event.
  TaskFinishedEvent killedFinish =
      new TaskFinishedEvent(t1v1Id, "v1", 0L, 0L, null, TaskState.KILLED, "", null, 4);
  TaskRecoveryData recoveredTaskData = new TaskRecoveryData(null, killedFinish, null);
  doReturn(recoveredTaskData).when(dagRecoveryData).getTaskRecoveryData(t1v1Id);

  // Drive recovery and wait until the dispatcher has drained its events.
  DAGEventRecoverEvent recoverEvent = new DAGEventRecoverEvent(dagId, dagRecoveryData);
  dag.handle(recoverEvent);
  dispatcher.await();

  VertexImpl recoveredVertex = (VertexImpl) dag.getVertex(v1Id);
  TaskImpl recoveredTask = (TaskImpl) recoveredVertex.getTask(t1v1Id);
  assertEquals(TaskStateInternal.KILLED, recoveredTask.getInternalState());
  assertEquals(1, recoveredVertex.getCompletedTasks());
}
use of org.apache.tez.dag.app.dag.event.DAGEventRecoverEvent in project tez by apache.
the class TestDAGRecovery method testTARecoverFromRunning.
/**
 * Recovery input: only a TaskAttemptStartedEvent for the attempt (no finish event).
 * Expected outcome: the attempt is recovered into the internal KILLED state with
 * termination cause TERMINATED_AT_RECOVERY, a started and a finished history event
 * are recorded in that order, and the recovered launch time is preserved.
 */
@Test(timeout = 5000)
public void testTARecoverFromRunning() {
  initMockDAGRecoveryDataForTaskAttempt();

  // Build the started event; container and node are plain mocks.
  ContainerId mockContainerId = mock(ContainerId.class);
  NodeId mockNodeId = mock(NodeId.class);
  TaskAttemptStartedEvent startedEvent = new TaskAttemptStartedEvent(
      ta1t1v1Id, "v1", ta1LaunchTime, mockContainerId, mockNodeId, "", "", "");
  TaskAttemptRecoveryData attemptData = new TaskAttemptRecoveryData(startedEvent, null);
  doReturn(attemptData).when(dagRecoveryData).getTaskAttemptRecoveryData(ta1t1v1Id);

  // Drive recovery and wait until the dispatcher has drained its events.
  DAGEventRecoverEvent recoverEvent = new DAGEventRecoverEvent(dagId, dagRecoveryData);
  dag.handle(recoverEvent);
  dispatcher.await();

  TaskImpl recoveredTask = (TaskImpl) dag.getVertex(v1Id).getTask(t1v1Id);
  TaskAttemptImpl recoveredAttempt = (TaskAttemptImpl) recoveredTask.getAttempt(ta1t1v1Id);
  assertEquals(TaskAttemptStateInternal.KILLED, recoveredAttempt.getInternalState());
  assertEquals(TaskAttemptTerminationCause.TERMINATED_AT_RECOVERY,
      recoveredAttempt.getTerminationCause());
  historyEventHandler.verifyHistoryEvent(0, HistoryEventType.TASK_ATTEMPT_STARTED);
  historyEventHandler.verifyHistoryEvent(1, HistoryEventType.TASK_ATTEMPT_FINISHED);
  assertEquals(ta1LaunchTime, recoveredAttempt.getLaunchTime());
}
use of org.apache.tez.dag.app.dag.event.DAGEventRecoverEvent in project tez by apache.
the class TestDAGRecovery method testVertexRecoverFromNew.
/**
 * Recovery input:
 *   DAG: DAGInitedEvent -> DAGStartedEvent
 *   V1:  no events at all
 *
 * Expected outcome: the DAG is RUNNING and V1 goes through initialization again.
 */
@Test(timeout = 5000)
public void testVertexRecoverFromNew() {
  initMockDAGRecoveryDataForVertex();

  dag.handle(new DAGEventRecoverEvent(dagId, dagRecoveryData));
  dispatcher.await();
  assertEquals(DAGState.RUNNING, dag.getState());

  // vertex1 has nothing to recover from, so it is initialized from scratch
  VertexImpl firstVertex = (VertexImpl) dag.getVertex("vertex1");
  VertexImpl secondVertex = (VertexImpl) dag.getVertex("vertex2");
  VertexImpl thirdVertex = (VertexImpl) dag.getVertex("vertex3");
  assertEquals(VertexState.INITIALIZING, firstVertex.getState());
  assertEquals(VertexState.RUNNING, secondVertex.getState());
  assertEquals(VertexState.INITED, thirdVertex.getState());
}
use of org.apache.tez.dag.app.dag.event.DAGEventRecoverEvent in project tez by apache.
the class DAGAppMaster method serviceStart.
/**
 * Starts the application master's services and decides what to run next.
 *
 * <p>In order, this method:
 * <ol>
 *   <li>starts all components and the parent service;</li>
 *   <li>shuts down immediately on a version mismatch, or when a session AM is on a
 *       later attempt with recovery disabled;</li>
 *   <li>records an {@code AMStartedEvent} and attempts to recover a DAG from the
 *       previous attempt, shutting down if that recovery throws {@code IOException};</li>
 *   <li>dispatches a {@code DAGEventRecoverEvent} for a recovered DAG, or, when nothing
 *       was recovered in non-session mode, starts the original DAG;</li>
 *   <li>in session mode, schedules the DAG-submission timeout check and (outside local
 *       mode) the client heartbeat timeout check.</li>
 * </ol>
 *
 * @throws Exception if starting the services or the parent service fails
 */
@Override
public synchronized void serviceStart() throws Exception {
  // start all the components
  startServices();
  super.serviceStart();

  // A session AM on attempt > 1 without recovery enabled cannot continue.
  boolean invalidSession = false;
  if (isSession && !recoveryEnabled && appAttemptID.getAttemptId() > 1) {
    String err = INVALID_SESSION_ERR_MSG;
    LOG.error(err);
    addDiagnostic(err);
    this.state = DAGAppMasterState.ERROR;
    invalidSession = true;
  }
  if (versionMismatch || invalidSession) {
    // Short-circuit and return as no DAG should be run
    this.taskSchedulerManager.setShouldUnregisterFlag();
    shutdownHandler.shutdown();
    return;
  }

  this.appsStartTime = clock.getTime();
  AMStartedEvent startEvent = new AMStartedEvent(appAttemptID, appsStartTime, appMasterUgi.getShortUserName());
  historyEventHandler.handle(new DAGHistoryEvent(startEvent));
  this.lastDAGCompletionTime = clock.getTime();

  DAGRecoveryData recoveredDAGData;
  try {
    recoveredDAGData = recoverDAG();
  } catch (IOException e) {
    LOG.error("Error occurred when trying to recover data from previous attempt." + " Shutting down AM", e);
    this.state = DAGAppMasterState.ERROR;
    this.taskSchedulerManager.setShouldUnregisterFlag();
    shutdownHandler.shutdown();
    return;
  }

  if (!isSession) {
    LOG.info("In Non-Session mode.");
  } else {
    LOG.info("In Session mode. Waiting for DAG over RPC");
    this.state = DAGAppMasterState.IDLE;
  }

  if (recoveredDAGData != null) {
    // Re-register resources that were added during the previous attempt.
    if (recoveredDAGData.cumulativeAdditionalResources != null) {
      recoveredDAGData.additionalUrlsForClasspath = processAdditionalResources(recoveredDAGData.recoveredDagID, recoveredDAGData.cumulativeAdditionalResources);
      amResources.putAll(recoveredDAGData.cumulativeAdditionalResources);
      cumulativeAdditionalResources.putAll(recoveredDAGData.cumulativeAdditionalResources);
    }
    if (recoveredDAGData.isSessionStopped) {
      LOG.info("AM crashed when shutting down in the previous attempt" + ", continue the shutdown and recover it to SUCCEEDED");
      this.sessionStopped.set(true);
      // Returning here also skips scheduling of the timeout checks below.
      return;
    }
    if (recoveredDAGData.isCompleted || recoveredDAGData.nonRecoverable) {
      LOG.info("Found previous DAG in completed or non-recoverable state" + ", dagId=" + recoveredDAGData.recoveredDagID + ", isCompleted=" + recoveredDAGData.isCompleted + ", isNonRecoverable=" + recoveredDAGData.nonRecoverable + ", state=" + (recoveredDAGData.dagState == null ? "null" : recoveredDAGData.dagState) + ", failureReason=" + recoveredDAGData.reason);
      _updateLoggers(recoveredDAGData.recoveredDAG, "");
      if (recoveredDAGData.nonRecoverable) {
        // A non-recoverable DAG is recovered with desired state FAILED.
        addDiagnostic("DAG " + recoveredDAGData.recoveredDagID + " can not be recovered due to " + recoveredDAGData.reason);
        DAGEventRecoverEvent recoverDAGEvent = new DAGEventRecoverEvent(recoveredDAGData.recoveredDAG.getID(), DAGState.FAILED, recoveredDAGData);
        DAGRecoveredEvent dagRecoveredEvent = new DAGRecoveredEvent(this.appAttemptID, recoveredDAGData.recoveredDAG.getID(), recoveredDAGData.recoveredDAG.getName(), recoveredDAGData.recoveredDAG.getUserName(), this.clock.getTime(), DAGState.FAILED, recoveredDAGData.reason, this.containerLogs);
        dagRecoveredEvent.setHistoryLoggingEnabled(recoveredDAGData.recoveredDAG.getConf().getBoolean(TezConfiguration.TEZ_DAG_HISTORY_LOGGING_ENABLED, TezConfiguration.TEZ_DAG_HISTORY_LOGGING_ENABLED_DEFAULT));
        this.historyEventHandler.handle(new DAGHistoryEvent(recoveredDAGData.recoveredDAG.getID(), dagRecoveredEvent));
        dagEventDispatcher.handle(recoverDAGEvent);
        this.state = DAGAppMasterState.RUNNING;
      } else {
        // A completed DAG is recovered with the state it was recorded in.
        DAGEventRecoverEvent recoverDAGEvent = new DAGEventRecoverEvent(recoveredDAGData.recoveredDAG.getID(), recoveredDAGData.dagState, recoveredDAGData);
        DAGRecoveredEvent dagRecoveredEvent = new DAGRecoveredEvent(this.appAttemptID, recoveredDAGData.recoveredDAG.getID(), recoveredDAGData.recoveredDAG.getName(), recoveredDAGData.recoveredDAG.getUserName(), this.clock.getTime(), recoveredDAGData.dagState, null, this.containerLogs);
        this.historyEventHandler.handle(new DAGHistoryEvent(recoveredDAGData.recoveredDAG.getID(), dagRecoveredEvent));
        dagEventDispatcher.handle(recoverDAGEvent);
        this.state = DAGAppMasterState.RUNNING;
      }
    } else {
      // An unfinished, recoverable DAG: recover it without a desired end state.
      LOG.info("Found DAG to recover, dagId=" + recoveredDAGData.recoveredDAG.getID());
      _updateLoggers(recoveredDAGData.recoveredDAG, "");
      DAGRecoveredEvent dagRecoveredEvent = new DAGRecoveredEvent(this.appAttemptID, recoveredDAGData.recoveredDAG.getID(), recoveredDAGData.recoveredDAG.getName(), recoveredDAGData.recoveredDAG.getUserName(), this.clock.getTime(), this.containerLogs);
      this.historyEventHandler.handle(new DAGHistoryEvent(recoveredDAGData.recoveredDAG.getID(), dagRecoveredEvent));
      DAGEventRecoverEvent recoverDAGEvent = new DAGEventRecoverEvent(recoveredDAGData.recoveredDAG.getID(), recoveredDAGData);
      dagEventDispatcher.handle(recoverDAGEvent);
      this.state = DAGAppMasterState.RUNNING;
    }
  } else {
    if (!isSession) {
      // No dag recovered - in non-session, just restart the original DAG
      dagCounter.set(0);
      startDAG();
    }
  }

  if (isSession && sessionTimeoutInterval >= 0) {
    // java.util.Timer rejects a non-positive period, and integer division yields 0
    // for any interval below 10, so clamp the check period to at least 1.
    long sessionTimeoutCheckPeriod = Math.max(1L, sessionTimeoutInterval / 10);
    this.dagSubmissionTimer = new Timer("DAGSubmissionTimer", true);
    this.dagSubmissionTimer.scheduleAtFixedRate(new TimerTask() {
      @Override
      public void run() {
        try {
          checkAndHandleSessionTimeout();
        } catch (TezException e) {
          LOG.error("Error when checking AM session timeout", e);
        }
      }
    }, sessionTimeoutInterval, sessionTimeoutCheckPeriod);
  }

  // Ignore client heartbeat timeout in local mode or non-session mode
  if (!isLocal && isSession && clientAMHeartbeatTimeoutIntervalMillis > 0) {
    // reset heartbeat time
    clientHandler.updateLastHeartbeatTime();
    this.clientAMHeartBeatTimeoutService = Executors.newSingleThreadScheduledExecutor(new ThreadFactoryBuilder().setDaemon(true).setNameFormat("ClientAMHeartBeatKeepAliveCheck #%d").build());
    this.clientAMHeartBeatTimeoutService.schedule(new Runnable() {
      @Override
      public void run() {
        try {
          // The task re-schedules itself only while a positive next expiry is reported.
          long nextExpiry = checkAndHandleDAGClientTimeout();
          if (nextExpiry > 0) {
            clientAMHeartBeatTimeoutService.schedule(this, nextExpiry, TimeUnit.MILLISECONDS);
          }
        } catch (TezException e) {
          // Cannot be thrown unless the AM is being tried to shutdown so no need to
          // reschedule the timer task
          LOG.error("Error when checking Client AM heartbeat timeout", e);
        }
      }
    }, clientAMHeartbeatTimeoutIntervalMillis, TimeUnit.MILLISECONDS);
  }
}
use of org.apache.tez.dag.app.dag.event.DAGEventRecoverEvent in project tez by apache.
the class TestDAGRecovery method testDAGRecoverFromDesiredKilled.
/**
 * Recovery input: a summary DAGFinishedEvent with state KILLED, expressed here as
 * the desired state passed to the recover event.
 * Expected outcome: the DAG and all three of its vertices end up KILLED.
 */
@Test(timeout = 5000)
public void testDAGRecoverFromDesiredKilled() {
  dag.handle(new DAGEventRecoverEvent(dagId, DAGState.KILLED, dagRecoveryData));
  dispatcher.await();

  assertEquals(DAGState.KILLED, dag.getState());
  assertEquals(3, dag.getVertices().size());
  // Every vertex must have followed the DAG into KILLED.
  for (String vertexName : new String[] {"vertex1", "vertex2", "vertex3"}) {
    assertEquals(VertexState.KILLED, dag.getVertex(vertexName).getState());
  }
  // DAG#initTime, startTime is not guaranteed to be recovered in this case
}
Aggregations