
Example 6 with DAGRecoveryData

Use of org.apache.tez.dag.app.RecoveryParser.DAGRecoveryData in project tez by apache.

From the class TestRecoveryParser, method testSkipAllOtherEvents_2:

// skipAllOtherEvents kicks in because the DAG has already finished
@Test(timeout = 5000)
public void testSkipAllOtherEvents_2() throws IOException {
    ApplicationId appId = ApplicationId.newInstance(System.currentTimeMillis(), 1);
    ApplicationAttemptId appAttemptId = ApplicationAttemptId.newInstance(appId, 1);
    TezDAGID dagID = TezDAGID.getInstance(appId, 1);
    AppContext appContext = mock(AppContext.class);
    when(appContext.getCurrentRecoveryDir()).thenReturn(new Path(recoveryPath + "/1"));
    when(appContext.getClock()).thenReturn(new SystemClock());
    DAGPlan dagPlan = TestDAGImpl.createTestDAGPlan();
    // write data in attempt_1
    RecoveryService rService = new RecoveryService(appContext);
    Configuration conf = new Configuration();
    conf.setBoolean(RecoveryService.TEZ_TEST_RECOVERY_DRAIN_EVENTS_WHEN_STOPPED, true);
    rService.init(conf);
    rService.start();
    rService.handle(new DAGHistoryEvent(dagID, new DAGSubmittedEvent(dagID, 1L, dagPlan, ApplicationAttemptId.newInstance(appId, 1), null, "user", new Configuration(), null, null)));
    rService.handle(new DAGHistoryEvent(dagID, new DAGInitializedEvent(dagID, 1L, "user", dagPlan.getName(), null)));
    rService.handle(new DAGHistoryEvent(dagID, new DAGFinishedEvent(dagID, 1L, 2L, DAGState.FAILED, "diag", null, "user", "dag1", null, appAttemptId, dagPlan)));
    rService.handle(new DAGHistoryEvent(dagID, new DAGStartedEvent(dagID, 1L, "user", "dag1")));
    rService.stop();
    // write data in attempt_2
    when(appContext.getCurrentRecoveryDir()).thenReturn(new Path(recoveryPath + "/2"));
    rService = new RecoveryService(appContext);
    rService.init(conf);
    rService.start();
    rService.handle(new DAGHistoryEvent(dagID, new DAGStartedEvent(dagID, 1L, "user", "dag1")));
    rService.stop();
    DAGRecoveryData dagData = parser.parseRecoveryData();
    assertEquals(false, dagData.nonRecoverable);
    assertEquals(DAGState.FAILED, dagData.dagState);
    assertEquals(true, dagData.isCompleted);
    // DAGSubmittedEvent, DAGInitializedEvent and DAGFinishedEvent are handled
    verify(mockAppMaster).createDAG(any(DAGPlan.class), any(TezDAGID.class));
    // DAGInitializedEvent may not have been handled before DAGFinishedEvent,
    // because DAGFinishedEvent's writeToRecoveryImmediately is true
    assertNotNull(dagData.getDAGFinishedEvent());
    assertNull(dagData.getDAGStartedEvent());
}
Also used : Path(org.apache.hadoop.fs.Path) RecoveryService(org.apache.tez.dag.history.recovery.RecoveryService) SystemClock(org.apache.hadoop.yarn.util.SystemClock) Configuration(org.apache.hadoop.conf.Configuration) DAGHistoryEvent(org.apache.tez.dag.history.DAGHistoryEvent) ApplicationAttemptId(org.apache.hadoop.yarn.api.records.ApplicationAttemptId) DAGInitializedEvent(org.apache.tez.dag.history.events.DAGInitializedEvent) DAGPlan(org.apache.tez.dag.api.records.DAGProtos.DAGPlan) TezDAGID(org.apache.tez.dag.records.TezDAGID) DAGStartedEvent(org.apache.tez.dag.history.events.DAGStartedEvent) ApplicationId(org.apache.hadoop.yarn.api.records.ApplicationId) DAGFinishedEvent(org.apache.tez.dag.history.events.DAGFinishedEvent) DAGRecoveryData(org.apache.tez.dag.app.RecoveryParser.DAGRecoveryData) DAGSubmittedEvent(org.apache.tez.dag.history.events.DAGSubmittedEvent)
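The pattern above repeats across these parser tests: events are written through RecoveryService for one or more attempts, then parseRecoveryData() is called and the summary fields of the resulting DAGRecoveryData are inspected. The snippet below is a minimal, hedged sketch of how a caller might consume that summary when the DAG already finished in a previous attempt; it only uses fields and getters that appear in the test, and assumes a RecoveryParser named "parser" set up as in the test class.

// Hedged sketch, standalone; assumes "parser" is configured as in the test class.
DAGRecoveryData dagData = parser.parseRecoveryData();
if (dagData.isCompleted) {
    // Everything logged after the DAGFinishedEvent was skipped by the parser,
    // so the terminal state comes from the finished event, not from later events.
    DAGState finalState = dagData.dagState;
    DAGFinishedEvent finishedEvent = dagData.getDAGFinishedEvent();
    // in this test: finalState == DAGState.FAILED and finishedEvent != null
}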

Example 7 with DAGRecoveryData

Use of org.apache.tez.dag.app.RecoveryParser.DAGRecoveryData in project tez by apache.

From the class TestRecoveryParser, method testLastCorruptedRecoveryRecord:

@Test(timeout = 5000)
public void testLastCorruptedRecoveryRecord() throws IOException {
    ApplicationId appId = ApplicationId.newInstance(System.currentTimeMillis(), 1);
    TezDAGID dagID = TezDAGID.getInstance(appId, 1);
    AppContext appContext = mock(AppContext.class);
    when(appContext.getCurrentRecoveryDir()).thenReturn(new Path(recoveryPath + "/1"));
    when(appContext.getClock()).thenReturn(new SystemClock());
    when(appContext.getHadoopShim()).thenReturn(new DefaultHadoopShim());
    when(appContext.getApplicationID()).thenReturn(appId);
    DAGPlan dagPlan = TestDAGImpl.createTestDAGPlan();
    // write data in attempt_1
    RecoveryService rService = new RecoveryService(appContext);
    Configuration conf = new Configuration();
    conf.setBoolean(RecoveryService.TEZ_TEST_RECOVERY_DRAIN_EVENTS_WHEN_STOPPED, true);
    rService.init(conf);
    rService.start();
    rService.handle(new DAGHistoryEvent(dagID, new DAGSubmittedEvent(dagID, 1L, dagPlan, ApplicationAttemptId.newInstance(appId, 1), null, "user", new Configuration(), null, null)));
    // wait until DAGSubmittedEvent is handled in the RecoveryEventHandling thread
    rService.await();
    rService.outputStreamMap.get(dagID).writeUTF("INVALID_DATA");
    rService.stop();
    // write data in attempt_2
    when(appContext.getCurrentRecoveryDir()).thenReturn(new Path(recoveryPath + "/2"));
    rService = new RecoveryService(appContext);
    rService.init(conf);
    rService.start();
    rService.handle(new DAGHistoryEvent(dagID, new DAGInitializedEvent(dagID, 1L, "user", dagPlan.getName(), null)));
    rService.await();
    rService.outputStreamMap.get(dagID).writeUTF("INVALID_DATA");
    rService.stop();
    // the corrupted last record is skipped, but the rest of the recovery log is still read
    DAGRecoveryData dagData = parser.parseRecoveryData();
    assertEquals(false, dagData.isCompleted);
    assertEquals(null, dagData.reason);
    assertEquals(false, dagData.nonRecoverable);
    // verify that DAGSubmittedEvent & DAGInitializedEvent are handled
    verify(mockAppMaster).createDAG(any(DAGPlan.class), any(TezDAGID.class));
    assertNotNull(dagData.getDAGInitializedEvent());
}
Also used : Path(org.apache.hadoop.fs.Path) RecoveryService(org.apache.tez.dag.history.recovery.RecoveryService) SystemClock(org.apache.hadoop.yarn.util.SystemClock) Configuration(org.apache.hadoop.conf.Configuration) DAGHistoryEvent(org.apache.tez.dag.history.DAGHistoryEvent) DAGInitializedEvent(org.apache.tez.dag.history.events.DAGInitializedEvent) DefaultHadoopShim(org.apache.tez.hadoop.shim.DefaultHadoopShim) DAGPlan(org.apache.tez.dag.api.records.DAGProtos.DAGPlan) TezDAGID(org.apache.tez.dag.records.TezDAGID) ApplicationId(org.apache.hadoop.yarn.api.records.ApplicationId) DAGRecoveryData(org.apache.tez.dag.app.RecoveryParser.DAGRecoveryData) DAGSubmittedEvent(org.apache.tez.dag.history.events.DAGSubmittedEvent)
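The corruption in this test is injected by writing garbage directly to the open recovery stream after draining pending events. A hypothetical test helper for that step might look like the sketch below; the method name is illustrative, but await() and outputStreamMap are the same RecoveryService members used above.

// Hypothetical helper (name is illustrative): appends a garbage record so the
// parser sees a corrupted trailing entry for the given DAG.
private static void corruptLastRecord(RecoveryService rService, TezDAGID dagID)
        throws IOException {
    // drain pending events so the garbage lands after the last valid record
    rService.await();
    rService.outputStreamMap.get(dagID).writeUTF("INVALID_DATA");
}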

Example 8 with DAGRecoveryData

Use of org.apache.tez.dag.app.RecoveryParser.DAGRecoveryData in project tez by apache.

From the class TestRecoveryParser, method testRecoveryData:

@Test(timeout = 5000)
public void testRecoveryData() throws IOException {
    ApplicationId appId = ApplicationId.newInstance(System.currentTimeMillis(), 1);
    TezDAGID dagID = TezDAGID.getInstance(appId, 1);
    ApplicationAttemptId appAttemptId = ApplicationAttemptId.newInstance(appId, 1);
    AppContext appContext = mock(AppContext.class);
    when(appContext.getCurrentRecoveryDir()).thenReturn(new Path(recoveryPath + "/1"));
    when(appContext.getClock()).thenReturn(new SystemClock());
    when(mockDAGImpl.getID()).thenReturn(dagID);
    when(appContext.getHadoopShim()).thenReturn(new DefaultHadoopShim());
    when(appContext.getApplicationID()).thenReturn(appId);
    RecoveryService rService = new RecoveryService(appContext);
    Configuration conf = new Configuration();
    conf.setBoolean(RecoveryService.TEZ_TEST_RECOVERY_DRAIN_EVENTS_WHEN_STOPPED, true);
    rService.init(conf);
    rService.start();
    DAGPlan dagPlan = TestDAGImpl.createTestDAGPlan();
    // DAG: DAGSubmittedEvent -> DAGInitializedEvent -> DAGStartedEvent
    rService.handle(new DAGHistoryEvent(dagID, new DAGSubmittedEvent(dagID, 1L, dagPlan, ApplicationAttemptId.newInstance(appId, 1), null, "user", new Configuration(), null, null)));
    DAGInitializedEvent dagInitedEvent = new DAGInitializedEvent(dagID, 100L, "user", "dagName", null);
    DAGStartedEvent dagStartedEvent = new DAGStartedEvent(dagID, 0L, "user", "dagName");
    rService.handle(new DAGHistoryEvent(dagID, dagInitedEvent));
    rService.handle(new DAGHistoryEvent(dagID, dagStartedEvent));
    // 3 vertices of this dag: v0, v1, v2
    TezVertexID v0Id = TezVertexID.getInstance(dagID, 0);
    TezVertexID v1Id = TezVertexID.getInstance(dagID, 1);
    TezVertexID v2Id = TezVertexID.getInstance(dagID, 2);
    // v0 VertexInitializedEvent
    VertexInitializedEvent v0InitedEvent = new VertexInitializedEvent(v0Id, "v0", 200L, 400L, 2, null, null, null, null);
    rService.handle(new DAGHistoryEvent(dagID, v0InitedEvent));
    // v1 VertexFinishedEvent(KILLED)
    VertexFinishedEvent v1FinishedEvent = new VertexFinishedEvent(v1Id, "v1", 2, 300L, 400L, 500L, 600L, 700L, VertexState.KILLED, "", null, null, null, null);
    rService.handle(new DAGHistoryEvent(dagID, v1FinishedEvent));
    // v2 VertexInitializedEvent -> VertexStartedEvent
    List<TezEvent> initGeneratedEvents = Lists.newArrayList(new TezEvent(DataMovementEvent.create(ByteBuffer.wrap(new byte[0])), null));
    VertexInitializedEvent v2InitedEvent = new VertexInitializedEvent(v2Id, "v2", 200L, 300L, 2, null, null, initGeneratedEvents, null);
    VertexStartedEvent v2StartedEvent = new VertexStartedEvent(v2Id, 0L, 0L);
    rService.handle(new DAGHistoryEvent(dagID, v2InitedEvent));
    rService.handle(new DAGHistoryEvent(dagID, v2StartedEvent));
    // 3 tasks of v2
    TezTaskID t0v2Id = TezTaskID.getInstance(v2Id, 0);
    TezTaskID t1v2Id = TezTaskID.getInstance(v2Id, 1);
    TezTaskID t2v2Id = TezTaskID.getInstance(v2Id, 2);
    // t0v2 TaskStartedEvent
    TaskStartedEvent t0v2StartedEvent = new TaskStartedEvent(t0v2Id, "v2", 400L, 5000L);
    rService.handle(new DAGHistoryEvent(dagID, t0v2StartedEvent));
    // t1v2 TaskFinishedEvent
    TaskFinishedEvent t1v2FinishedEvent = new TaskFinishedEvent(t1v2Id, "v1", 0L, 0L, null, TaskState.KILLED, "", null, 4);
    rService.handle(new DAGHistoryEvent(dagID, t1v2FinishedEvent));
    // t2v2 TaskStartedEvent -> TaskFinishedEvent
    TaskStartedEvent t2v2StartedEvent = new TaskStartedEvent(t2v2Id, "v2", 400L, 500L);
    rService.handle(new DAGHistoryEvent(dagID, t2v2StartedEvent));
    TaskFinishedEvent t2v2FinishedEvent = new TaskFinishedEvent(t2v2Id, "v1", 0L, 0L, null, TaskState.SUCCEEDED, "", null, 4);
    rService.handle(new DAGHistoryEvent(dagID, t2v2FinishedEvent));
    // attempts under t0v2
    ContainerId containerId = ContainerId.newInstance(appAttemptId, 1);
    NodeId nodeId = NodeId.newInstance("localhost", 9999);
    TezTaskAttemptID ta0t0v2Id = TezTaskAttemptID.getInstance(t0v2Id, 0);
    TaskAttemptStartedEvent ta0t0v2StartedEvent = new TaskAttemptStartedEvent(ta0t0v2Id, "v1", 0L, containerId, nodeId, "", "", "");
    rService.handle(new DAGHistoryEvent(dagID, ta0t0v2StartedEvent));
    // attempts under t2v2
    TezTaskAttemptID ta0t2v2Id = TezTaskAttemptID.getInstance(t2v2Id, 0);
    TaskAttemptStartedEvent ta0t2v2StartedEvent = new TaskAttemptStartedEvent(ta0t2v2Id, "v1", 500L, containerId, nodeId, "", "", "");
    rService.handle(new DAGHistoryEvent(dagID, ta0t2v2StartedEvent));
    TaskAttemptFinishedEvent ta0t2v2FinishedEvent = new TaskAttemptFinishedEvent(ta0t2v2Id, "v1", 500L, 600L, TaskAttemptState.SUCCEEDED, null, null, "", null, null, null, 0L, null, 0L, null, null, null, null, null);
    rService.handle(new DAGHistoryEvent(dagID, ta0t2v2FinishedEvent));
    rService.stop();
    DAGRecoveryData dagData = parser.parseRecoveryData();
    assertFalse(dagData.nonRecoverable);
    // History events have no equals method, so for simplicity only the init/start/finish time of each event is verified
    assertEquals(dagInitedEvent.getInitTime(), dagData.getDAGInitializedEvent().getInitTime());
    assertEquals(dagStartedEvent.getStartTime(), dagData.getDAGStartedEvent().getStartTime());
    assertNull(dagData.getDAGFinishedEvent());
    VertexRecoveryData v0Data = dagData.getVertexRecoveryData(v0Id);
    VertexRecoveryData v1Data = dagData.getVertexRecoveryData(v1Id);
    VertexRecoveryData v2Data = dagData.getVertexRecoveryData(v2Id);
    assertNotNull(v0Data);
    assertNotNull(v1Data);
    assertNotNull(v2Data);
    assertEquals(v0InitedEvent.getInitedTime(), v0Data.getVertexInitedEvent().getInitedTime());
    assertNull(v0Data.getVertexStartedEvent());
    assertNull(v1Data.getVertexInitedEvent());
    assertEquals(v1FinishedEvent.getFinishTime(), v1Data.getVertexFinishedEvent().getFinishTime());
    assertEquals(v2InitedEvent.getInitedTime(), v2Data.getVertexInitedEvent().getInitedTime());
    assertEquals(v2StartedEvent.getStartTime(), v2Data.getVertexStartedEvent().getStartTime());
    TaskRecoveryData t0v2Data = dagData.getTaskRecoveryData(t0v2Id);
    TaskRecoveryData t1v2Data = dagData.getTaskRecoveryData(t1v2Id);
    TaskRecoveryData t2v2Data = dagData.getTaskRecoveryData(t2v2Id);
    assertNotNull(t0v2Data);
    assertNotNull(t1v2Data);
    assertNotNull(t2v2Data);
    assertEquals(t0v2StartedEvent.getStartTime(), t0v2Data.getTaskStartedEvent().getStartTime());
    assertNull(t0v2Data.getTaskFinishedEvent());
    assertEquals(t1v2FinishedEvent.getFinishTime(), t1v2Data.getTaskFinishedEvent().getFinishTime());
    assertNull(t1v2Data.getTaskStartedEvent());
    assertEquals(t2v2StartedEvent.getStartTime(), t2v2Data.getTaskStartedEvent().getStartTime());
    assertEquals(t2v2FinishedEvent.getFinishTime(), t2v2Data.getTaskFinishedEvent().getFinishTime());
    TaskAttemptRecoveryData ta0t0v2Data = dagData.getTaskAttemptRecoveryData(ta0t0v2Id);
    TaskAttemptRecoveryData ta0t2v2Data = dagData.getTaskAttemptRecoveryData(ta0t2v2Id);
    assertNotNull(ta0t0v2Data);
    assertNotNull(ta0t2v2Data);
    assertEquals(ta0t0v2StartedEvent.getStartTime(), ta0t0v2Data.getTaskAttemptStartedEvent().getStartTime());
    assertNull(ta0t0v2Data.getTaskAttemptFinishedEvent());
    assertEquals(ta0t2v2StartedEvent.getStartTime(), ta0t2v2Data.getTaskAttemptStartedEvent().getStartTime());
    assertEquals(ta0t2v2FinishedEvent.getFinishTime(), ta0t2v2Data.getTaskAttemptFinishedEvent().getFinishTime());
}
Also used : RecoveryService(org.apache.tez.dag.history.recovery.RecoveryService) Configuration(org.apache.hadoop.conf.Configuration) VertexInitializedEvent(org.apache.tez.dag.history.events.VertexInitializedEvent) DAGInitializedEvent(org.apache.tez.dag.history.events.DAGInitializedEvent) TaskAttemptRecoveryData(org.apache.tez.dag.app.RecoveryParser.TaskAttemptRecoveryData) DefaultHadoopShim(org.apache.tez.hadoop.shim.DefaultHadoopShim) DAGPlan(org.apache.tez.dag.api.records.DAGProtos.DAGPlan) ContainerId(org.apache.hadoop.yarn.api.records.ContainerId) TezDAGID(org.apache.tez.dag.records.TezDAGID) DAGStartedEvent(org.apache.tez.dag.history.events.DAGStartedEvent) TaskAttemptFinishedEvent(org.apache.tez.dag.history.events.TaskAttemptFinishedEvent) TezVertexID(org.apache.tez.dag.records.TezVertexID) Path(org.apache.hadoop.fs.Path) VertexStartedEvent(org.apache.tez.dag.history.events.VertexStartedEvent) SystemClock(org.apache.hadoop.yarn.util.SystemClock) DAGHistoryEvent(org.apache.tez.dag.history.DAGHistoryEvent) ApplicationAttemptId(org.apache.hadoop.yarn.api.records.ApplicationAttemptId) TaskStartedEvent(org.apache.tez.dag.history.events.TaskStartedEvent) TezTaskID(org.apache.tez.dag.records.TezTaskID) TaskAttemptStartedEvent(org.apache.tez.dag.history.events.TaskAttemptStartedEvent) TaskRecoveryData(org.apache.tez.dag.app.RecoveryParser.TaskRecoveryData) TaskFinishedEvent(org.apache.tez.dag.history.events.TaskFinishedEvent) NodeId(org.apache.hadoop.yarn.api.records.NodeId) VertexRecoveryData(org.apache.tez.dag.app.RecoveryParser.VertexRecoveryData) TezEvent(org.apache.tez.runtime.api.impl.TezEvent) ApplicationId(org.apache.hadoop.yarn.api.records.ApplicationId) DAGRecoveryData(org.apache.tez.dag.app.RecoveryParser.DAGRecoveryData) VertexFinishedEvent(org.apache.tez.dag.history.events.VertexFinishedEvent) DAGSubmittedEvent(org.apache.tez.dag.history.events.DAGSubmittedEvent) TezTaskAttemptID(org.apache.tez.dag.records.TezTaskAttemptID)
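Beyond the DAG-level summary, DAGRecoveryData exposes per-vertex, per-task and per-attempt lookups, which is what the assertions above exercise. The following is a hedged, standalone sketch of drilling down to a single attempt, reusing the identifiers defined in the test and only the getters it calls.

// Sketch: walking from the DAG level down to one task attempt.
DAGRecoveryData dagData = parser.parseRecoveryData();
VertexRecoveryData v2Data = dagData.getVertexRecoveryData(v2Id);
TaskRecoveryData t2v2Data = dagData.getTaskRecoveryData(t2v2Id);
TaskAttemptRecoveryData ta0t2v2Data = dagData.getTaskAttemptRecoveryData(ta0t2v2Id);
if (ta0t2v2Data != null && ta0t2v2Data.getTaskAttemptFinishedEvent() != null) {
    // a finished attempt can be restored from the recovery log instead of being re-run
    long finishTime = ta0t2v2Data.getTaskAttemptFinishedEvent().getFinishTime();
}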

Example 9 with DAGRecoveryData

Use of org.apache.tez.dag.app.RecoveryParser.DAGRecoveryData in project tez by apache.

From the class DAGAppMaster, method recoverDAG:

private DAGRecoveryData recoverDAG() throws IOException, TezException {
    if (recoveryEnabled) {
        try {
            TezUtilsInternal.setHadoopCallerContext(hadoopShim, this.getAppID());
            if (this.appAttemptID.getAttemptId() > 1) {
                LOG.info("Recovering data from previous attempts" + ", currentAttemptId=" + this.appAttemptID.getAttemptId());
                this.state = DAGAppMasterState.RECOVERING;
                RecoveryParser recoveryParser = new RecoveryParser(this, recoveryFS, recoveryDataDir, appAttemptID.getAttemptId());
                DAGRecoveryData recoveredDAGData = recoveryParser.parseRecoveryData();
                return recoveredDAGData;
            }
        } finally {
            hadoopShim.clearHadoopCallerContext();
        }
    }
    return null;
}
Also used : DAGRecoveryData(org.apache.tez.dag.app.RecoveryParser.DAGRecoveryData)
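recoverDAG() returns null when recovery is disabled or when this is the first attempt, so callers have to handle that case before looking at the summary flags. Below is a condensed, hedged sketch of how the result is typically consumed; serviceStart() in the next example shows the full logic, and only fields that appear there are used here.

// Hedged sketch, not the actual DAGAppMaster control flow.
DAGRecoveryData recoveredDAGData = recoverDAG();
if (recoveredDAGData == null) {
    // nothing to recover: first attempt, or recovery disabled
} else if (recoveredDAGData.nonRecoverable) {
    LOG.info("DAG " + recoveredDAGData.recoveredDagID
        + " cannot be recovered due to " + recoveredDAGData.reason);
} else if (recoveredDAGData.isCompleted) {
    LOG.info("DAG " + recoveredDAGData.recoveredDagID
        + " already finished in state " + recoveredDAGData.dagState);
}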

Example 10 with DAGRecoveryData

Use of org.apache.tez.dag.app.RecoveryParser.DAGRecoveryData in project tez by apache.

From the class DAGAppMaster, method serviceStart:

@Override
public synchronized void serviceStart() throws Exception {
    // start all the components
    startServices();
    super.serviceStart();
    boolean invalidSession = false;
    if (isSession && !recoveryEnabled && appAttemptID.getAttemptId() > 1) {
        String err = INVALID_SESSION_ERR_MSG;
        LOG.error(err);
        addDiagnostic(err);
        this.state = DAGAppMasterState.ERROR;
        invalidSession = true;
    }
    if (versionMismatch || invalidSession) {
        // Short-circuit and return as no DAG should be run
        this.taskSchedulerManager.setShouldUnregisterFlag();
        shutdownHandler.shutdown();
        return;
    }
    this.appsStartTime = clock.getTime();
    AMStartedEvent startEvent = new AMStartedEvent(appAttemptID, appsStartTime, appMasterUgi.getShortUserName());
    historyEventHandler.handle(new DAGHistoryEvent(startEvent));
    this.lastDAGCompletionTime = clock.getTime();
    DAGRecoveryData recoveredDAGData;
    try {
        recoveredDAGData = recoverDAG();
    } catch (IOException e) {
        LOG.error("Error occurred when trying to recover data from previous attempt." + " Shutting down AM", e);
        this.state = DAGAppMasterState.ERROR;
        this.taskSchedulerManager.setShouldUnregisterFlag();
        shutdownHandler.shutdown();
        return;
    }
    if (!isSession) {
        LOG.info("In Non-Session mode.");
    } else {
        LOG.info("In Session mode. Waiting for DAG over RPC");
        this.state = DAGAppMasterState.IDLE;
    }
    if (recoveredDAGData != null) {
        if (recoveredDAGData.cumulativeAdditionalResources != null) {
            recoveredDAGData.additionalUrlsForClasspath = processAdditionalResources(recoveredDAGData.recoveredDagID, recoveredDAGData.cumulativeAdditionalResources);
            amResources.putAll(recoveredDAGData.cumulativeAdditionalResources);
            cumulativeAdditionalResources.putAll(recoveredDAGData.cumulativeAdditionalResources);
        }
        if (recoveredDAGData.isSessionStopped) {
            LOG.info("AM crashed when shutting down in the previous attempt" + ", continue the shutdown and recover it to SUCCEEDED");
            this.sessionStopped.set(true);
            return;
        }
        if (recoveredDAGData.isCompleted || recoveredDAGData.nonRecoverable) {
            LOG.info("Found previous DAG in completed or non-recoverable state" + ", dagId=" + recoveredDAGData.recoveredDagID + ", isCompleted=" + recoveredDAGData.isCompleted + ", isNonRecoverable=" + recoveredDAGData.nonRecoverable + ", state=" + (recoveredDAGData.dagState == null ? "null" : recoveredDAGData.dagState) + ", failureReason=" + recoveredDAGData.reason);
            _updateLoggers(recoveredDAGData.recoveredDAG, "");
            if (recoveredDAGData.nonRecoverable) {
                addDiagnostic("DAG " + recoveredDAGData.recoveredDagID + " can not be recovered due to " + recoveredDAGData.reason);
                DAGEventRecoverEvent recoverDAGEvent = new DAGEventRecoverEvent(recoveredDAGData.recoveredDAG.getID(), DAGState.FAILED, recoveredDAGData);
                DAGRecoveredEvent dagRecoveredEvent = new DAGRecoveredEvent(this.appAttemptID, recoveredDAGData.recoveredDAG.getID(), recoveredDAGData.recoveredDAG.getName(), recoveredDAGData.recoveredDAG.getUserName(), this.clock.getTime(), DAGState.FAILED, recoveredDAGData.reason, this.containerLogs);
                dagRecoveredEvent.setHistoryLoggingEnabled(recoveredDAGData.recoveredDAG.getConf().getBoolean(TezConfiguration.TEZ_DAG_HISTORY_LOGGING_ENABLED, TezConfiguration.TEZ_DAG_HISTORY_LOGGING_ENABLED_DEFAULT));
                this.historyEventHandler.handle(new DAGHistoryEvent(recoveredDAGData.recoveredDAG.getID(), dagRecoveredEvent));
                dagEventDispatcher.handle(recoverDAGEvent);
                this.state = DAGAppMasterState.RUNNING;
            } else {
                DAGEventRecoverEvent recoverDAGEvent = new DAGEventRecoverEvent(recoveredDAGData.recoveredDAG.getID(), recoveredDAGData.dagState, recoveredDAGData);
                DAGRecoveredEvent dagRecoveredEvent = new DAGRecoveredEvent(this.appAttemptID, recoveredDAGData.recoveredDAG.getID(), recoveredDAGData.recoveredDAG.getName(), recoveredDAGData.recoveredDAG.getUserName(), this.clock.getTime(), recoveredDAGData.dagState, null, this.containerLogs);
                this.historyEventHandler.handle(new DAGHistoryEvent(recoveredDAGData.recoveredDAG.getID(), dagRecoveredEvent));
                dagEventDispatcher.handle(recoverDAGEvent);
                this.state = DAGAppMasterState.RUNNING;
            }
        } else {
            LOG.info("Found DAG to recover, dagId=" + recoveredDAGData.recoveredDAG.getID());
            _updateLoggers(recoveredDAGData.recoveredDAG, "");
            DAGRecoveredEvent dagRecoveredEvent = new DAGRecoveredEvent(this.appAttemptID, recoveredDAGData.recoveredDAG.getID(), recoveredDAGData.recoveredDAG.getName(), recoveredDAGData.recoveredDAG.getUserName(), this.clock.getTime(), this.containerLogs);
            this.historyEventHandler.handle(new DAGHistoryEvent(recoveredDAGData.recoveredDAG.getID(), dagRecoveredEvent));
            DAGEventRecoverEvent recoverDAGEvent = new DAGEventRecoverEvent(recoveredDAGData.recoveredDAG.getID(), recoveredDAGData);
            dagEventDispatcher.handle(recoverDAGEvent);
            this.state = DAGAppMasterState.RUNNING;
        }
    } else {
        if (!isSession) {
            // No dag recovered - in non-session, just restart the original DAG
            dagCounter.set(0);
            startDAG();
        }
    }
    if (isSession && sessionTimeoutInterval >= 0) {
        this.dagSubmissionTimer = new Timer("DAGSubmissionTimer", true);
        this.dagSubmissionTimer.scheduleAtFixedRate(new TimerTask() {

            @Override
            public void run() {
                try {
                    checkAndHandleSessionTimeout();
                } catch (TezException e) {
                    LOG.error("Error when checking AM session timeout", e);
                }
            }
        }, sessionTimeoutInterval, sessionTimeoutInterval / 10);
    }
    // Ignore client heartbeat timeout in local mode or non-session mode
    if (!isLocal && isSession && clientAMHeartbeatTimeoutIntervalMillis > 0) {
        // reset heartbeat time
        clientHandler.updateLastHeartbeatTime();
        this.clientAMHeartBeatTimeoutService = Executors.newSingleThreadScheduledExecutor(new ThreadFactoryBuilder().setDaemon(true).setNameFormat("ClientAMHeartBeatKeepAliveCheck #%d").build());
        this.clientAMHeartBeatTimeoutService.schedule(new Runnable() {

            @Override
            public void run() {
                try {
                    long nextExpiry = checkAndHandleDAGClientTimeout();
                    if (nextExpiry > 0) {
                        clientAMHeartBeatTimeoutService.schedule(this, nextExpiry, TimeUnit.MILLISECONDS);
                    }
                } catch (TezException e) {
                    // Cannot be thrown unless the AM is being shut down, so there is no need to
                    // reschedule the timer task
                    LOG.error("Error when checking Client AM heartbeat timeout", e);
                }
            }
        }, clientAMHeartbeatTimeoutIntervalMillis, TimeUnit.MILLISECONDS);
    }
}
Also used : AMStartedEvent(org.apache.tez.dag.history.events.AMStartedEvent) TezException(org.apache.tez.dag.api.TezException) DAGHistoryEvent(org.apache.tez.dag.history.DAGHistoryEvent) IOException(java.io.IOException) DAGEventRecoverEvent(org.apache.tez.dag.app.dag.event.DAGEventRecoverEvent) Timer(java.util.Timer) TimerTask(java.util.TimerTask) DAGRecoveredEvent(org.apache.tez.dag.history.events.DAGRecoveredEvent) ThreadFactoryBuilder(com.google.common.util.concurrent.ThreadFactoryBuilder) DAGRecoveryData(org.apache.tez.dag.app.RecoveryParser.DAGRecoveryData)
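The recovery branch above reduces to a four-way decision on the recovered data. Below is a condensed, hedged sketch of just that decision; the real method additionally emits DAGRecoveredEvent history events, updates loggers and diagnostics, and sets the AM state.

// Condensed sketch of the branching on recoveredDAGData in serviceStart().
if (recoveredDAGData.isSessionStopped) {
    // finish the shutdown that the previous attempt started
    sessionStopped.set(true);
} else if (recoveredDAGData.nonRecoverable) {
    // force the recovered DAG to FAILED
    dagEventDispatcher.handle(new DAGEventRecoverEvent(
        recoveredDAGData.recoveredDAG.getID(), DAGState.FAILED, recoveredDAGData));
} else if (recoveredDAGData.isCompleted) {
    // replay the terminal state recorded in the recovery log
    dagEventDispatcher.handle(new DAGEventRecoverEvent(
        recoveredDAGData.recoveredDAG.getID(), recoveredDAGData.dagState, recoveredDAGData));
} else {
    // resume the in-flight DAG from the recovered data
    dagEventDispatcher.handle(new DAGEventRecoverEvent(
        recoveredDAGData.recoveredDAG.getID(), recoveredDAGData));
}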

Aggregations

DAGRecoveryData (org.apache.tez.dag.app.RecoveryParser.DAGRecoveryData): 14 usages
DAGHistoryEvent (org.apache.tez.dag.history.DAGHistoryEvent): 13 usages
Configuration (org.apache.hadoop.conf.Configuration): 12 usages
Path (org.apache.hadoop.fs.Path): 12 usages
ApplicationId (org.apache.hadoop.yarn.api.records.ApplicationId): 12 usages
SystemClock (org.apache.hadoop.yarn.util.SystemClock): 12 usages
DAGPlan (org.apache.tez.dag.api.records.DAGProtos.DAGPlan): 12 usages
DAGSubmittedEvent (org.apache.tez.dag.history.events.DAGSubmittedEvent): 12 usages
TezDAGID (org.apache.tez.dag.records.TezDAGID): 12 usages
RecoveryService (org.apache.tez.dag.history.recovery.RecoveryService): 10 usages
DefaultHadoopShim (org.apache.tez.hadoop.shim.DefaultHadoopShim): 6 usages
TezVertexID (org.apache.tez.dag.records.TezVertexID): 5 usages
DAGInitializedEvent (org.apache.tez.dag.history.events.DAGInitializedEvent): 4 usages
VertexFinishedEvent (org.apache.tez.dag.history.events.VertexFinishedEvent): 4 usages
ApplicationAttemptId (org.apache.hadoop.yarn.api.records.ApplicationAttemptId): 3 usages
DAGCommitStartedEvent (org.apache.tez.dag.history.events.DAGCommitStartedEvent): 3 usages
DAGStartedEvent (org.apache.tez.dag.history.events.DAGStartedEvent): 3 usages
VertexCommitStartedEvent (org.apache.tez.dag.history.events.VertexCommitStartedEvent): 3 usages
VertexGroupCommitStartedEvent (org.apache.tez.dag.history.events.VertexGroupCommitStartedEvent): 3 usages
DAGFinishedEvent (org.apache.tez.dag.history.events.DAGFinishedEvent): 2 usages