Search in sources :

Example 6 with DAGEventRecoverEvent

use of org.apache.tez.dag.app.dag.event.DAGEventRecoverEvent in project tez by apache.

the class TestDAGRecovery method testTaskRecoverFromKilled.

/**
 * Recovery input: a single TaskFinishedEvent carrying state KILLED.
 * Expected outcome: the task is recovered into the internal KILLED state and
 * its vertex reports exactly one completed task.
 */
@Test(timeout = 5000)
public void testTaskRecoverFromKilled() {
    initMockDAGRecoveryDataForTask();

    // Only the finished event is supplied; the other two recovery slots stay null.
    TaskFinishedEvent killedFinishEvent =
        new TaskFinishedEvent(t1v1Id, "v1", 0L, 0L, null, TaskState.KILLED, "", null, 4);
    doReturn(new TaskRecoveryData(null, killedFinishEvent, null))
        .when(dagRecoveryData)
        .getTaskRecoveryData(t1v1Id);

    DAGEventRecoverEvent recoverEvent = new DAGEventRecoverEvent(dagId, dagRecoveryData);
    dag.handle(recoverEvent);
    dispatcher.await();

    VertexImpl owningVertex = (VertexImpl) dag.getVertex(v1Id);
    TaskImpl recoveredTask = (TaskImpl) owningVertex.getTask(t1v1Id);
    assertEquals(TaskStateInternal.KILLED, recoveredTask.getInternalState());
    assertEquals(1, owningVertex.getCompletedTasks());
}
Also used : DAGEventRecoverEvent(org.apache.tez.dag.app.dag.event.DAGEventRecoverEvent) TaskFinishedEvent(org.apache.tez.dag.history.events.TaskFinishedEvent) TaskRecoveryData(org.apache.tez.dag.app.RecoveryParser.TaskRecoveryData) Test(org.junit.Test) StateChangeNotifierForTest(org.apache.tez.dag.app.dag.TestStateChangeNotifier.StateChangeNotifierForTest)

Example 7 with DAGEventRecoverEvent

use of org.apache.tez.dag.app.dag.event.DAGEventRecoverEvent in project tez by apache.

the class TestDAGRecovery method testTARecoverFromRunning.

/**
 * Recovery input: only a TaskAttemptStartedEvent (no finished event).
 * Expected outcome: the attempt is recovered into KILLED with termination cause
 * TERMINATED_AT_RECOVERY, history events 0 and 1 are TASK_ATTEMPT_STARTED and
 * TASK_ATTEMPT_FINISHED, and the launch time from the started event is kept.
 */
@Test(timeout = 5000)
public void testTARecoverFromRunning() {
    initMockDAGRecoveryDataForTaskAttempt();

    ContainerId mockContainerId = mock(ContainerId.class);
    NodeId mockNodeId = mock(NodeId.class);
    TaskAttemptStartedEvent startedEvent = new TaskAttemptStartedEvent(
        ta1t1v1Id, "v1", ta1LaunchTime, mockContainerId, mockNodeId, "", "", "");
    // The second slot (finished event) is null: the attempt never finished.
    TaskAttemptRecoveryData attemptRecoveryData = new TaskAttemptRecoveryData(startedEvent, null);
    doReturn(attemptRecoveryData).when(dagRecoveryData).getTaskAttemptRecoveryData(ta1t1v1Id);

    DAGEventRecoverEvent recoverEvent = new DAGEventRecoverEvent(dagId, dagRecoveryData);
    dag.handle(recoverEvent);
    dispatcher.await();

    TaskImpl owningTask = (TaskImpl) dag.getVertex(v1Id).getTask(t1v1Id);
    TaskAttemptImpl recoveredAttempt = (TaskAttemptImpl) owningTask.getAttempt(ta1t1v1Id);
    assertEquals(TaskAttemptStateInternal.KILLED, recoveredAttempt.getInternalState());
    assertEquals(TaskAttemptTerminationCause.TERMINATED_AT_RECOVERY,
        recoveredAttempt.getTerminationCause());
    historyEventHandler.verifyHistoryEvent(0, HistoryEventType.TASK_ATTEMPT_STARTED);
    historyEventHandler.verifyHistoryEvent(1, HistoryEventType.TASK_ATTEMPT_FINISHED);
    assertEquals(ta1LaunchTime, recoveredAttempt.getLaunchTime());
}
Also used : DAGEventRecoverEvent(org.apache.tez.dag.app.dag.event.DAGEventRecoverEvent) ContainerId(org.apache.hadoop.yarn.api.records.ContainerId) NodeId(org.apache.hadoop.yarn.api.records.NodeId) TaskAttemptStartedEvent(org.apache.tez.dag.history.events.TaskAttemptStartedEvent) TaskAttemptRecoveryData(org.apache.tez.dag.app.RecoveryParser.TaskAttemptRecoveryData) Test(org.junit.Test) StateChangeNotifierForTest(org.apache.tez.dag.app.dag.TestStateChangeNotifier.StateChangeNotifierForTest)

Example 8 with DAGEventRecoverEvent

use of org.apache.tez.dag.app.dag.event.DAGEventRecoverEvent in project tez by apache.

the class TestDAGRecovery method testVertexRecoverFromNew.

/**
 * Recovery input:
 *  DAG:  DAGInitedEvent -> DAGStartedEvent
 *  V1:   no recovery events at all
 *
 * Expected outcome: the DAG is RUNNING and V1 is initialized again from scratch.
 */
@Test(timeout = 5000)
public void testVertexRecoverFromNew() {
    initMockDAGRecoveryDataForVertex();
    dag.handle(new DAGEventRecoverEvent(dagId, dagRecoveryData));
    dispatcher.await();

    assertEquals(DAGState.RUNNING, dag.getState());

    // vertex1 has to go through initialization again; the test also pins the
    // states reached by vertex2 and vertex3.
    VertexImpl vertex1 = (VertexImpl) dag.getVertex("vertex1");
    VertexImpl vertex2 = (VertexImpl) dag.getVertex("vertex2");
    VertexImpl vertex3 = (VertexImpl) dag.getVertex("vertex3");
    assertEquals(VertexState.INITIALIZING, vertex1.getState());
    assertEquals(VertexState.RUNNING, vertex2.getState());
    assertEquals(VertexState.INITED, vertex3.getState());
}
Also used : DAGEventRecoverEvent(org.apache.tez.dag.app.dag.event.DAGEventRecoverEvent) Test(org.junit.Test) StateChangeNotifierForTest(org.apache.tez.dag.app.dag.TestStateChangeNotifier.StateChangeNotifierForTest)

Example 9 with DAGEventRecoverEvent

use of org.apache.tez.dag.app.dag.event.DAGEventRecoverEvent in project tez by apache.

the class DAGAppMaster method serviceStart.

/**
 * Starts the application master: brings up its services, records the AM start in
 * history, and then either resumes a DAG recovered from a previous attempt, starts
 * the original DAG (non-session mode, nothing recovered), or waits for a DAG to be
 * submitted (session mode).
 *
 * <p>Startup is cut short through {@code shutdownHandler.shutdown()} when a version
 * mismatch was flagged, when a session AM is re-attempted while recovery is disabled,
 * or when reading the recovery data fails with an {@link IOException}.
 *
 * <p>In session mode this method also arms the DAG-submission timeout timer and,
 * outside local mode, the client heartbeat timeout check.
 *
 * @throws Exception if any startup step fails
 */
@Override
public synchronized void serviceStart() throws Exception {
    // start all the components
    startServices();
    super.serviceStart();
    boolean invalidSession = false;
    // A session AM on attempt > 1 with recovery disabled cannot continue.
    if (isSession && !recoveryEnabled && appAttemptID.getAttemptId() > 1) {
        String err = INVALID_SESSION_ERR_MSG;
        LOG.error(err);
        addDiagnostic(err);
        this.state = DAGAppMasterState.ERROR;
        invalidSession = true;
    }
    if (versionMismatch || invalidSession) {
        // Short-circuit and return as no DAG should be run
        this.taskSchedulerManager.setShouldUnregisterFlag();
        shutdownHandler.shutdown();
        return;
    }
    this.appsStartTime = clock.getTime();
    AMStartedEvent startEvent = new AMStartedEvent(appAttemptID, appsStartTime, appMasterUgi.getShortUserName());
    historyEventHandler.handle(new DAGHistoryEvent(startEvent));
    this.lastDAGCompletionTime = clock.getTime();
    DAGRecoveryData recoveredDAGData;
    try {
        recoveredDAGData = recoverDAG();
    } catch (IOException e) {
        // Recovery data could not be read: mark the AM as failed and shut down.
        LOG.error("Error occurred when trying to recover data from previous attempt." + " Shutting down AM", e);
        this.state = DAGAppMasterState.ERROR;
        this.taskSchedulerManager.setShouldUnregisterFlag();
        shutdownHandler.shutdown();
        return;
    }
    if (!isSession) {
        LOG.info("In Non-Session mode.");
    } else {
        LOG.info("In Session mode. Waiting for DAG over RPC");
        this.state = DAGAppMasterState.IDLE;
    }
    if (recoveredDAGData != null) {
        // Re-register additional resources carried by the recovered DAG.
        if (recoveredDAGData.cumulativeAdditionalResources != null) {
            recoveredDAGData.additionalUrlsForClasspath = processAdditionalResources(recoveredDAGData.recoveredDagID, recoveredDAGData.cumulativeAdditionalResources);
            amResources.putAll(recoveredDAGData.cumulativeAdditionalResources);
            cumulativeAdditionalResources.putAll(recoveredDAGData.cumulativeAdditionalResources);
        }
        if (recoveredDAGData.isSessionStopped) {
            // Note: returns before any timeout timers below are scheduled.
            LOG.info("AM crashed when shutting down in the previous attempt" + ", continue the shutdown and recover it to SUCCEEDED");
            this.sessionStopped.set(true);
            return;
        }
        if (recoveredDAGData.isCompleted || recoveredDAGData.nonRecoverable) {
            LOG.info("Found previous DAG in completed or non-recoverable state" + ", dagId=" + recoveredDAGData.recoveredDagID + ", isCompleted=" + recoveredDAGData.isCompleted + ", isNonRecoverable=" + recoveredDAGData.nonRecoverable + ", state=" + (recoveredDAGData.dagState == null ? "null" : recoveredDAGData.dagState) + ", failureReason=" + recoveredDAGData.reason);
            _updateLoggers(recoveredDAGData.recoveredDAG, "");
            if (recoveredDAGData.nonRecoverable) {
                // Non-recoverable DAG: recover it with desired state FAILED and record the reason.
                addDiagnostic("DAG " + recoveredDAGData.recoveredDagID + " can not be recovered due to " + recoveredDAGData.reason);
                DAGEventRecoverEvent recoverDAGEvent = new DAGEventRecoverEvent(recoveredDAGData.recoveredDAG.getID(), DAGState.FAILED, recoveredDAGData);
                DAGRecoveredEvent dagRecoveredEvent = new DAGRecoveredEvent(this.appAttemptID, recoveredDAGData.recoveredDAG.getID(), recoveredDAGData.recoveredDAG.getName(), recoveredDAGData.recoveredDAG.getUserName(), this.clock.getTime(), DAGState.FAILED, recoveredDAGData.reason, this.containerLogs);
                dagRecoveredEvent.setHistoryLoggingEnabled(recoveredDAGData.recoveredDAG.getConf().getBoolean(TezConfiguration.TEZ_DAG_HISTORY_LOGGING_ENABLED, TezConfiguration.TEZ_DAG_HISTORY_LOGGING_ENABLED_DEFAULT));
                this.historyEventHandler.handle(new DAGHistoryEvent(recoveredDAGData.recoveredDAG.getID(), dagRecoveredEvent));
                dagEventDispatcher.handle(recoverDAGEvent);
                this.state = DAGAppMasterState.RUNNING;
            } else {
                // Completed DAG: recover it with the state found in the recovery data.
                DAGEventRecoverEvent recoverDAGEvent = new DAGEventRecoverEvent(recoveredDAGData.recoveredDAG.getID(), recoveredDAGData.dagState, recoveredDAGData);
                DAGRecoveredEvent dagRecoveredEvent = new DAGRecoveredEvent(this.appAttemptID, recoveredDAGData.recoveredDAG.getID(), recoveredDAGData.recoveredDAG.getName(), recoveredDAGData.recoveredDAG.getUserName(), this.clock.getTime(), recoveredDAGData.dagState, null, this.containerLogs);
                this.historyEventHandler.handle(new DAGHistoryEvent(recoveredDAGData.recoveredDAG.getID(), dagRecoveredEvent));
                dagEventDispatcher.handle(recoverDAGEvent);
                this.state = DAGAppMasterState.RUNNING;
            }
        } else {
            // In-flight DAG: dispatch a recover event without a desired final state.
            LOG.info("Found DAG to recover, dagId=" + recoveredDAGData.recoveredDAG.getID());
            _updateLoggers(recoveredDAGData.recoveredDAG, "");
            DAGRecoveredEvent dagRecoveredEvent = new DAGRecoveredEvent(this.appAttemptID, recoveredDAGData.recoveredDAG.getID(), recoveredDAGData.recoveredDAG.getName(), recoveredDAGData.recoveredDAG.getUserName(), this.clock.getTime(), this.containerLogs);
            this.historyEventHandler.handle(new DAGHistoryEvent(recoveredDAGData.recoveredDAG.getID(), dagRecoveredEvent));
            DAGEventRecoverEvent recoverDAGEvent = new DAGEventRecoverEvent(recoveredDAGData.recoveredDAG.getID(), recoveredDAGData);
            dagEventDispatcher.handle(recoverDAGEvent);
            this.state = DAGAppMasterState.RUNNING;
        }
    } else {
        if (!isSession) {
            // No dag recovered - in non-session, just restart the original DAG
            dagCounter.set(0);
            startDAG();
        }
    }
    if (isSession && sessionTimeoutInterval >= 0) {
        // Timer.scheduleAtFixedRate rejects a non-positive period with
        // IllegalArgumentException; an interval below 10 would divide down to 0,
        // so clamp the check period to at least 1 ms.
        long sessionTimeoutCheckPeriod = Math.max(1L, sessionTimeoutInterval / 10);
        this.dagSubmissionTimer = new Timer("DAGSubmissionTimer", true);
        this.dagSubmissionTimer.scheduleAtFixedRate(new TimerTask() {

            @Override
            public void run() {
                try {
                    checkAndHandleSessionTimeout();
                } catch (TezException e) {
                    LOG.error("Error when checking AM session timeout", e);
                }
            }
        }, sessionTimeoutInterval, sessionTimeoutCheckPeriod);
    }
    // Ignore client heartbeat timeout in local mode or non-session mode
    if (!isLocal && isSession && clientAMHeartbeatTimeoutIntervalMillis > 0) {
        // reset heartbeat time
        clientHandler.updateLastHeartbeatTime();
        this.clientAMHeartBeatTimeoutService = Executors.newSingleThreadScheduledExecutor(new ThreadFactoryBuilder().setDaemon(true).setNameFormat("ClientAMHeartBeatKeepAliveCheck #%d").build());
        this.clientAMHeartBeatTimeoutService.schedule(new Runnable() {

            @Override
            public void run() {
                try {
                    // The task re-schedules itself for as long as the check returns a positive delay.
                    long nextExpiry = checkAndHandleDAGClientTimeout();
                    if (nextExpiry > 0) {
                        clientAMHeartBeatTimeoutService.schedule(this, nextExpiry, TimeUnit.MILLISECONDS);
                    }
                } catch (TezException e) {
                    // Cannot be thrown unless the AM is being tried to shutdown so no need to
                    // reschedule the timer task
                    LOG.error("Error when checking Client AM heartbeat timeout", e);
                }
            }
        }, clientAMHeartbeatTimeoutIntervalMillis, TimeUnit.MILLISECONDS);
    }
}
Also used : AMStartedEvent(org.apache.tez.dag.history.events.AMStartedEvent) TezException(org.apache.tez.dag.api.TezException) DAGHistoryEvent(org.apache.tez.dag.history.DAGHistoryEvent) IOException(java.io.IOException) DAGEventRecoverEvent(org.apache.tez.dag.app.dag.event.DAGEventRecoverEvent) Timer(java.util.Timer) TimerTask(java.util.TimerTask) DAGRecoveredEvent(org.apache.tez.dag.history.events.DAGRecoveredEvent) ThreadFactoryBuilder(com.google.common.util.concurrent.ThreadFactoryBuilder) DAGRecoveryData(org.apache.tez.dag.app.RecoveryParser.DAGRecoveryData)

Example 10 with DAGEventRecoverEvent

use of org.apache.tez.dag.app.dag.event.DAGEventRecoverEvent in project tez by apache.

the class TestDAGRecovery method testDAGRecoverFromDesiredKilled.

/**
 * Recovery input: SummaryEvent_DAGFinishedEvent(KILLED), modelled by a recover
 * event whose desired state is KILLED.
 * Expected outcome: the DAG and each of its three vertices end up KILLED.
 */
@Test(timeout = 5000)
public void testDAGRecoverFromDesiredKilled() {
    dag.handle(new DAGEventRecoverEvent(dagId, DAGState.KILLED, dagRecoveryData));
    dispatcher.await();

    assertEquals(DAGState.KILLED, dag.getState());
    assertEquals(3, dag.getVertices().size());
    for (String vertexName : new String[] {"vertex1", "vertex2", "vertex3"}) {
        assertEquals(VertexState.KILLED, dag.getVertex(vertexName).getState());
    }
    // DAG#initTime, startTime is not guaranteed to be recovered in this case
}
Also used : DAGEventRecoverEvent(org.apache.tez.dag.app.dag.event.DAGEventRecoverEvent) Test(org.junit.Test) StateChangeNotifierForTest(org.apache.tez.dag.app.dag.TestStateChangeNotifier.StateChangeNotifierForTest)

Aggregations

DAGEventRecoverEvent (org.apache.tez.dag.app.dag.event.DAGEventRecoverEvent)22 StateChangeNotifierForTest (org.apache.tez.dag.app.dag.TestStateChangeNotifier.StateChangeNotifierForTest)21 Test (org.junit.Test)21 TaskAttemptRecoveryData (org.apache.tez.dag.app.RecoveryParser.TaskAttemptRecoveryData)8 TaskAttemptFinishedEvent (org.apache.tez.dag.history.events.TaskAttemptFinishedEvent)7 ArrayList (java.util.ArrayList)6 ContainerId (org.apache.hadoop.yarn.api.records.ContainerId)6 NodeId (org.apache.hadoop.yarn.api.records.NodeId)6 TaskRecoveryData (org.apache.tez.dag.app.RecoveryParser.TaskRecoveryData)6 TaskAttemptStartedEvent (org.apache.tez.dag.history.events.TaskAttemptStartedEvent)6 TezEvent (org.apache.tez.runtime.api.impl.TezEvent)6 VertexRecoveryData (org.apache.tez.dag.app.RecoveryParser.VertexRecoveryData)4 VertexInitializedEvent (org.apache.tez.dag.history.events.VertexInitializedEvent)4 TaskStartedEvent (org.apache.tez.dag.history.events.TaskStartedEvent)3 VertexConfigurationDoneEvent (org.apache.tez.dag.history.events.VertexConfigurationDoneEvent)3 TezTaskID (org.apache.tez.dag.records.TezTaskID)3 EventMetaData (org.apache.tez.runtime.api.impl.EventMetaData)3 HashMap (java.util.HashMap)2 DAGInitializedEvent (org.apache.tez.dag.history.events.DAGInitializedEvent)2 TaskFinishedEvent (org.apache.tez.dag.history.events.TaskFinishedEvent)2