Search in sources:

Example 1 with DAGRecoveredEvent

use of org.apache.tez.dag.history.events.DAGRecoveredEvent in project tez by apache.

In the class DAGAppMaster, method serviceStart.

/**
 * Starts the application master's services and then decides what to run.
 *
 * <p>Sequence, as implemented below:
 * <ol>
 *   <li>Start all components and the parent service.</li>
 *   <li>Shut down immediately on a version mismatch, or when a session AM is
 *       re-attempted (attempt id &gt; 1) while recovery is disabled.</li>
 *   <li>Record an {@code AMStartedEvent} and try to recover DAG data from a
 *       previous attempt; an {@code IOException} during recovery puts the AM
 *       in {@code ERROR} and shuts it down.</li>
 *   <li>If data was recovered, log a {@code DAGRecoveredEvent} and dispatch a
 *       {@code DAGEventRecoverEvent}; otherwise, in non-session mode, start the
 *       original DAG.</li>
 *   <li>In session mode, schedule the DAG-submission timeout check and (outside
 *       local mode) the client heartbeat timeout check.</li>
 * </ol>
 *
 * @throws Exception if starting the services or the parent service fails, or if
 *         any step below propagates an exception
 */
@Override
public synchronized void serviceStart() throws Exception {
    // start all the components
    startServices();
    super.serviceStart();
    boolean invalidSession = false;
    // A second (or later) attempt of a session AM cannot proceed without recovery.
    if (isSession && !recoveryEnabled && appAttemptID.getAttemptId() > 1) {
        String err = INVALID_SESSION_ERR_MSG;
        LOG.error(err);
        addDiagnostic(err);
        this.state = DAGAppMasterState.ERROR;
        invalidSession = true;
    }
    if (versionMismatch || invalidSession) {
        // Short-circuit and return as no DAG should be run
        this.taskSchedulerManager.setShouldUnregisterFlag();
        shutdownHandler.shutdown();
        return;
    }
    this.appsStartTime = clock.getTime();
    AMStartedEvent startEvent = new AMStartedEvent(appAttemptID, appsStartTime, appMasterUgi.getShortUserName());
    historyEventHandler.handle(new DAGHistoryEvent(startEvent));
    this.lastDAGCompletionTime = clock.getTime();
    DAGRecoveryData recoveredDAGData;
    try {
        recoveredDAGData = recoverDAG();
    } catch (IOException e) {
        LOG.error("Error occurred when trying to recover data from previous attempt." + " Shutting down AM", e);
        this.state = DAGAppMasterState.ERROR;
        this.taskSchedulerManager.setShouldUnregisterFlag();
        shutdownHandler.shutdown();
        return;
    }
    if (!isSession) {
        LOG.info("In Non-Session mode.");
    } else {
        LOG.info("In Session mode. Waiting for DAG over RPC");
        this.state = DAGAppMasterState.IDLE;
    }
    if (recoveredDAGData != null) {
        // Re-register any additional resources the recovered DAG had accumulated.
        if (recoveredDAGData.cumulativeAdditionalResources != null) {
            recoveredDAGData.additionalUrlsForClasspath = processAdditionalResources(recoveredDAGData.recoveredDagID, recoveredDAGData.cumulativeAdditionalResources);
            amResources.putAll(recoveredDAGData.cumulativeAdditionalResources);
            cumulativeAdditionalResources.putAll(recoveredDAGData.cumulativeAdditionalResources);
        }
        if (recoveredDAGData.isSessionStopped) {
            // Note: this returns before any of the timers below are scheduled.
            LOG.info("AM crashed when shutting down in the previous attempt" + ", continue the shutdown and recover it to SUCCEEDED");
            this.sessionStopped.set(true);
            return;
        }
        if (recoveredDAGData.isCompleted || recoveredDAGData.nonRecoverable) {
            LOG.info("Found previous DAG in completed or non-recoverable state" + ", dagId=" + recoveredDAGData.recoveredDagID + ", isCompleted=" + recoveredDAGData.isCompleted + ", isNonRecoverable=" + recoveredDAGData.nonRecoverable + ", state=" + (recoveredDAGData.dagState == null ? "null" : recoveredDAGData.dagState) + ", failureReason=" + recoveredDAGData.reason);
            _updateLoggers(recoveredDAGData.recoveredDAG, "");
            if (recoveredDAGData.nonRecoverable) {
                // Non-recoverable DAGs are reported and dispatched as FAILED with the recorded reason.
                addDiagnostic("DAG " + recoveredDAGData.recoveredDagID + " can not be recovered due to " + recoveredDAGData.reason);
                DAGEventRecoverEvent recoverDAGEvent = new DAGEventRecoverEvent(recoveredDAGData.recoveredDAG.getID(), DAGState.FAILED, recoveredDAGData);
                DAGRecoveredEvent dagRecoveredEvent = new DAGRecoveredEvent(this.appAttemptID, recoveredDAGData.recoveredDAG.getID(), recoveredDAGData.recoveredDAG.getName(), recoveredDAGData.recoveredDAG.getUserName(), this.clock.getTime(), DAGState.FAILED, recoveredDAGData.reason, this.containerLogs);
                dagRecoveredEvent.setHistoryLoggingEnabled(recoveredDAGData.recoveredDAG.getConf().getBoolean(TezConfiguration.TEZ_DAG_HISTORY_LOGGING_ENABLED, TezConfiguration.TEZ_DAG_HISTORY_LOGGING_ENABLED_DEFAULT));
                this.historyEventHandler.handle(new DAGHistoryEvent(recoveredDAGData.recoveredDAG.getID(), dagRecoveredEvent));
                dagEventDispatcher.handle(recoverDAGEvent);
                this.state = DAGAppMasterState.RUNNING;
            } else {
                // Completed DAG: replay it with the state it had already reached.
                DAGEventRecoverEvent recoverDAGEvent = new DAGEventRecoverEvent(recoveredDAGData.recoveredDAG.getID(), recoveredDAGData.dagState, recoveredDAGData);
                DAGRecoveredEvent dagRecoveredEvent = new DAGRecoveredEvent(this.appAttemptID, recoveredDAGData.recoveredDAG.getID(), recoveredDAGData.recoveredDAG.getName(), recoveredDAGData.recoveredDAG.getUserName(), this.clock.getTime(), recoveredDAGData.dagState, null, this.containerLogs);
                this.historyEventHandler.handle(new DAGHistoryEvent(recoveredDAGData.recoveredDAG.getID(), dagRecoveredEvent));
                dagEventDispatcher.handle(recoverDAGEvent);
                this.state = DAGAppMasterState.RUNNING;
            }
        } else {
            // DAG was still in progress: log the recovery and hand it to the dispatcher to resume.
            LOG.info("Found DAG to recover, dagId=" + recoveredDAGData.recoveredDAG.getID());
            _updateLoggers(recoveredDAGData.recoveredDAG, "");
            DAGRecoveredEvent dagRecoveredEvent = new DAGRecoveredEvent(this.appAttemptID, recoveredDAGData.recoveredDAG.getID(), recoveredDAGData.recoveredDAG.getName(), recoveredDAGData.recoveredDAG.getUserName(), this.clock.getTime(), this.containerLogs);
            this.historyEventHandler.handle(new DAGHistoryEvent(recoveredDAGData.recoveredDAG.getID(), dagRecoveredEvent));
            DAGEventRecoverEvent recoverDAGEvent = new DAGEventRecoverEvent(recoveredDAGData.recoveredDAG.getID(), recoveredDAGData);
            dagEventDispatcher.handle(recoverDAGEvent);
            this.state = DAGAppMasterState.RUNNING;
        }
    } else {
        if (!isSession) {
            // No dag recovered - in non-session, just restart the original DAG
            dagCounter.set(0);
            startDAG();
        }
    }
    if (isSession && sessionTimeoutInterval >= 0) {
        this.dagSubmissionTimer = new Timer("DAGSubmissionTimer", true);
        // Timer.scheduleAtFixedRate rejects a non-positive period, and integer division
        // yields 0 for any interval below 10, so clamp the check period to at least 1 ms.
        this.dagSubmissionTimer.scheduleAtFixedRate(new TimerTask() {

            @Override
            public void run() {
                try {
                    checkAndHandleSessionTimeout();
                } catch (TezException e) {
                    LOG.error("Error when checking AM session timeout", e);
                }
            }
        }, sessionTimeoutInterval, Math.max(1, sessionTimeoutInterval / 10));
    }
    // Ignore client heartbeat timeout in local mode or non-session mode
    if (!isLocal && isSession && clientAMHeartbeatTimeoutIntervalMillis > 0) {
        // reset heartbeat time
        clientHandler.updateLastHeartbeatTime();
        this.clientAMHeartBeatTimeoutService = Executors.newSingleThreadScheduledExecutor(new ThreadFactoryBuilder().setDaemon(true).setNameFormat("ClientAMHeartBeatKeepAliveCheck #%d").build());
        // One-shot task that re-schedules itself for the next expiry it is told about.
        this.clientAMHeartBeatTimeoutService.schedule(new Runnable() {

            @Override
            public void run() {
                try {
                    long nextExpiry = checkAndHandleDAGClientTimeout();
                    if (nextExpiry > 0) {
                        clientAMHeartBeatTimeoutService.schedule(this, nextExpiry, TimeUnit.MILLISECONDS);
                    }
                } catch (TezException e) {
                    // Cannot be thrown unless the AM is being tried to shutdown so no need to
                    // reschedule the timer task
                    LOG.error("Error when checking Client AM heartbeat timeout", e);
                }
            }
        }, clientAMHeartbeatTimeoutIntervalMillis, TimeUnit.MILLISECONDS);
    }
}
Also used : AMStartedEvent(org.apache.tez.dag.history.events.AMStartedEvent) TezException(org.apache.tez.dag.api.TezException) DAGHistoryEvent(org.apache.tez.dag.history.DAGHistoryEvent) IOException(java.io.IOException) DAGEventRecoverEvent(org.apache.tez.dag.app.dag.event.DAGEventRecoverEvent) Timer(java.util.Timer) TimerTask(java.util.TimerTask) DAGRecoveredEvent(org.apache.tez.dag.history.events.DAGRecoveredEvent) ThreadFactoryBuilder(com.google.common.util.concurrent.ThreadFactoryBuilder) DAGRecoveryData(org.apache.tez.dag.app.RecoveryParser.DAGRecoveryData)

Example 2 with DAGRecoveredEvent

use of org.apache.tez.dag.history.events.DAGRecoveredEvent in project tez by apache.

In the class TestHistoryEventTimelineConversion, method testConvertDAGRecoveredEvent2.

/**
 * Converts a {@code DAGRecoveredEvent} that carries a DAG state and a failure
 * reason, and checks the single resulting timeline entity: its identity, its
 * one DAG_RECOVERED event with the expected info, its three primary filters,
 * and the in-progress logs URL stored under the attempt-specific key.
 */
@Test(timeout = 5000)
public void testConvertDAGRecoveredEvent2() {
    final long recoverTime = random.nextLong();
    final String failureReason = "mock reason";
    DAGRecoveredEvent recovered = new DAGRecoveredEvent(applicationAttemptId, tezDAGID, dagPlan.getName(), user, recoverTime, DAGState.ERROR, failureReason, containerLogs);

    List<TimelineEntity> converted = HistoryEventTimelineConversion.convertToTimelineEntities(recovered);
    Assert.assertEquals(1, converted.size());

    // Entity identity: typed as a DAG, keyed by the DAG id, with no related entities.
    TimelineEntity entity = converted.get(0);
    Assert.assertEquals(EntityTypes.TEZ_DAG_ID.name(), entity.getEntityType());
    Assert.assertEquals(tezDAGID.toString(), entity.getEntityId());
    Assert.assertEquals(0, entity.getRelatedEntities().size());

    // Exactly one event, stamped with the recovery time and carrying attempt/state/reason.
    Assert.assertEquals(1, entity.getEvents().size());
    TimelineEvent recoveredTimelineEvent = entity.getEvents().get(0);
    Assert.assertEquals(HistoryEventType.DAG_RECOVERED.name(), recoveredTimelineEvent.getEventType());
    Assert.assertEquals(recoverTime, recoveredTimelineEvent.getTimestamp());
    Assert.assertTrue(recoveredTimelineEvent.getEventInfo().containsKey(ATSConstants.APPLICATION_ATTEMPT_ID));
    Assert.assertEquals(applicationAttemptId.toString(), recoveredTimelineEvent.getEventInfo().get(ATSConstants.APPLICATION_ATTEMPT_ID));
    Assert.assertEquals(DAGState.ERROR.name(), recoveredTimelineEvent.getEventInfo().get(ATSConstants.DAG_STATE));
    Assert.assertEquals(failureReason, recoveredTimelineEvent.getEventInfo().get(ATSConstants.RECOVERY_FAILURE_REASON));

    // Primary filters: application id, DAG name and user.
    Assert.assertEquals(3, entity.getPrimaryFilters().size());
    Assert.assertTrue(entity.getPrimaryFilters().get(ATSConstants.APPLICATION_ID).contains(applicationId.toString()));
    Assert.assertTrue(entity.getPrimaryFilters().get(ATSConstants.DAG_NAME).contains("DAGPlanMock"));
    Assert.assertTrue(entity.getPrimaryFilters().get(ATSConstants.USER).contains(user));

    // The logs URL is stored per attempt, under "<IN_PROGRESS_LOGS_URL>_<attemptId>".
    final String logsUrlKey = ATSConstants.IN_PROGRESS_LOGS_URL + "_" + applicationAttemptId.getAttemptId();
    Assert.assertEquals(containerLogs, entity.getOtherInfo().get(logsUrlKey));
}
Also used : TimelineEvent(org.apache.hadoop.yarn.api.records.timeline.TimelineEvent) DAGRecoveredEvent(org.apache.tez.dag.history.events.DAGRecoveredEvent) TimelineEntity(org.apache.hadoop.yarn.api.records.timeline.TimelineEntity) Test(org.junit.Test)

Example 3 with DAGRecoveredEvent

use of org.apache.tez.dag.history.events.DAGRecoveredEvent in project tez by apache.

In the class TestHistoryEventTimelineConversion, method testHandlerExists.

/**
 * Builds one sample event for every {@code HistoryEventType} value and runs each
 * event that reports itself as a history event through the timeline conversion.
 * An event type without a case in the switch fails the test.
 */
@Test(timeout = 5000)
public void testHandlerExists() throws JSONException {
    for (HistoryEventType eventType : HistoryEventType.values()) {
        HistoryEvent sample = null;
        switch(eventType) {
            case APP_LAUNCHED:
                sample = new AppLaunchedEvent(applicationId, random.nextInt(), random.nextInt(), user, new Configuration(false), null);
                break;
            case AM_LAUNCHED:
                sample = new AMLaunchedEvent(applicationAttemptId, random.nextInt(), random.nextInt(), user);
                break;
            case AM_STARTED:
                sample = new AMStartedEvent(applicationAttemptId, random.nextInt(), user);
                break;
            case DAG_SUBMITTED:
                sample = new DAGSubmittedEvent(tezDAGID, random.nextInt(), dagPlan, applicationAttemptId, null, user, null, containerLogs, null);
                break;
            case DAG_INITIALIZED:
                sample = new DAGInitializedEvent(tezDAGID, random.nextInt(), user, dagPlan.getName(), null);
                break;
            case DAG_STARTED:
                sample = new DAGStartedEvent(tezDAGID, random.nextInt(), user, dagPlan.getName());
                break;
            case DAG_FINISHED:
                sample = new DAGFinishedEvent(tezDAGID, random.nextInt(), random.nextInt(), DAGState.ERROR, null, null, user, dagPlan.getName(), null, applicationAttemptId, dagPlan);
                break;
            case VERTEX_INITIALIZED:
                sample = new VertexInitializedEvent(tezVertexID, "v1", random.nextInt(), random.nextInt(), random.nextInt(), "proc", null, null, null);
                break;
            case VERTEX_STARTED:
                sample = new VertexStartedEvent(tezVertexID, random.nextInt(), random.nextInt());
                break;
            case VERTEX_CONFIGURE_DONE:
                sample = new VertexConfigurationDoneEvent(tezVertexID, 0L, 1, null, null, null, true);
                break;
            case VERTEX_FINISHED:
                sample = new VertexFinishedEvent(tezVertexID, "v1", 1, random.nextInt(), random.nextInt(), random.nextInt(), random.nextInt(), random.nextInt(), VertexState.ERROR, null, null, null, null, null);
                break;
            case TASK_STARTED:
                sample = new TaskStartedEvent(tezTaskID, "v1", random.nextInt(), random.nextInt());
                break;
            case TASK_FINISHED:
                sample = new TaskFinishedEvent(tezTaskID, "v1", random.nextInt(), random.nextInt(), tezTaskAttemptID, TaskState.FAILED, null, null, 0);
                break;
            case TASK_ATTEMPT_STARTED:
                sample = new TaskAttemptStartedEvent(tezTaskAttemptID, "v1", random.nextInt(), containerId, nodeId, null, null, "nodeHttpAddress");
                break;
            case TASK_ATTEMPT_FINISHED:
                sample = new TaskAttemptFinishedEvent(tezTaskAttemptID, "v1", random.nextInt(), random.nextInt(), TaskAttemptState.FAILED, TaskFailureType.NON_FATAL, TaskAttemptTerminationCause.OUTPUT_LOST, null, null, null, null, 0, null, 0, containerId, nodeId, null, null, "nodeHttpAddress");
                break;
            case CONTAINER_LAUNCHED:
                sample = new ContainerLaunchedEvent(containerId, random.nextInt(), applicationAttemptId);
                break;
            case CONTAINER_STOPPED:
                sample = new ContainerStoppedEvent(containerId, random.nextInt(), -1, applicationAttemptId);
                break;
            case DAG_COMMIT_STARTED:
                sample = new DAGCommitStartedEvent();
                break;
            case VERTEX_COMMIT_STARTED:
                sample = new VertexCommitStartedEvent();
                break;
            case VERTEX_GROUP_COMMIT_STARTED:
                sample = new VertexGroupCommitStartedEvent();
                break;
            case VERTEX_GROUP_COMMIT_FINISHED:
                sample = new VertexGroupCommitFinishedEvent();
                break;
            case DAG_RECOVERED:
                sample = new DAGRecoveredEvent(applicationAttemptId, tezDAGID, dagPlan.getName(), user, random.nextLong(), containerLogs);
                break;
            case DAG_KILL_REQUEST:
                sample = new DAGKillRequestEvent();
                break;
            default:
                // A new HistoryEventType was added without a sample event here.
                Assert.fail("Unhandled event type " + eventType);
        }
        // Only events that report themselves as history events are converted.
        if (sample != null && sample.isHistoryEvent()) {
            HistoryEventTimelineConversion.convertToTimelineEntities(sample);
        }
    }
}
Also used : DAGCommitStartedEvent(org.apache.tez.dag.history.events.DAGCommitStartedEvent) Configuration(org.apache.hadoop.conf.Configuration) VertexInitializedEvent(org.apache.tez.dag.history.events.VertexInitializedEvent) HistoryEventType(org.apache.tez.dag.history.HistoryEventType) DAGInitializedEvent(org.apache.tez.dag.history.events.DAGInitializedEvent) ContainerStoppedEvent(org.apache.tez.dag.history.events.ContainerStoppedEvent) DAGKillRequestEvent(org.apache.tez.dag.history.events.DAGKillRequestEvent) DAGStartedEvent(org.apache.tez.dag.history.events.DAGStartedEvent) VertexConfigurationDoneEvent(org.apache.tez.dag.history.events.VertexConfigurationDoneEvent) DAGRecoveredEvent(org.apache.tez.dag.history.events.DAGRecoveredEvent) TaskAttemptFinishedEvent(org.apache.tez.dag.history.events.TaskAttemptFinishedEvent) AMStartedEvent(org.apache.tez.dag.history.events.AMStartedEvent) VertexStartedEvent(org.apache.tez.dag.history.events.VertexStartedEvent) VertexGroupCommitStartedEvent(org.apache.tez.dag.history.events.VertexGroupCommitStartedEvent) HistoryEvent(org.apache.tez.dag.history.HistoryEvent) TaskStartedEvent(org.apache.tez.dag.history.events.TaskStartedEvent) TaskAttemptStartedEvent(org.apache.tez.dag.history.events.TaskAttemptStartedEvent) AppLaunchedEvent(org.apache.tez.dag.history.events.AppLaunchedEvent) TaskFinishedEvent(org.apache.tez.dag.history.events.TaskFinishedEvent) VertexGroupCommitFinishedEvent(org.apache.tez.dag.history.events.VertexGroupCommitFinishedEvent) AMLaunchedEvent(org.apache.tez.dag.history.events.AMLaunchedEvent) ContainerLaunchedEvent(org.apache.tez.dag.history.events.ContainerLaunchedEvent) DAGFinishedEvent(org.apache.tez.dag.history.events.DAGFinishedEvent) VertexFinishedEvent(org.apache.tez.dag.history.events.VertexFinishedEvent) DAGSubmittedEvent(org.apache.tez.dag.history.events.DAGSubmittedEvent) VertexCommitStartedEvent(org.apache.tez.dag.history.events.VertexCommitStartedEvent) Test(org.junit.Test)

Example 4 with DAGRecoveredEvent

use of org.apache.tez.dag.history.events.DAGRecoveredEvent in project tez by apache.

In the class ATSV15HistoryLoggingService, method isValidEvent.

/**
 * Decides whether a history event should be recorded, maintaining the set of
 * DAGs whose events are being skipped.
 *
 * <ul>
 *   <li>DAG_SUBMITTED: a DAG whose name starts with the pre-warm prefix, or whose
 *       history logging is disabled, is added to the skipped set and rejected.</li>
 *   <li>DAG_RECOVERED: a DAG whose history logging is disabled is added to the
 *       skipped set and rejected.</li>
 *   <li>DAG_FINISHED: a skipped DAG is removed from the set and this last event
 *       is rejected.</li>
 *   <li>Any other event is rejected only if it belongs to a skipped DAG.</li>
 * </ul>
 *
 * @param event the event to check
 * @return {@code true} if the event should be recorded, {@code false} otherwise
 */
private boolean isValidEvent(DAGHistoryEvent event) {
    HistoryEventType eventType = event.getHistoryEvent().getEventType();
    TezDAGID dagId = event.getDagID();
    switch (eventType) {
        case DAG_SUBMITTED: {
            DAGSubmittedEvent submitted = (DAGSubmittedEvent) event.getHistoryEvent();
            String dagName = submitted.getDAGName();
            boolean isPreWarm = dagName != null && dagName.startsWith(TezConstants.TEZ_PREWARM_DAG_NAME_PREFIX);
            if (isPreWarm || !submitted.isHistoryLoggingEnabled()) {
                // Skip recording pre-warm DAG events
                skippedDAGs.add(dagId);
                return false;
            }
            break;
        }
        case DAG_RECOVERED: {
            DAGRecoveredEvent recovered = (DAGRecoveredEvent) event.getHistoryEvent();
            if (!recovered.isHistoryLoggingEnabled()) {
                skippedDAGs.add(recovered.getDagID());
                return false;
            }
            break;
        }
        case DAG_FINISHED:
            // No more events should be seen after this point.
            if (skippedDAGs.remove(dagId)) {
                return false;
            }
            break;
        default:
            break;
    }
    // Events of DAGs that are still marked as skipped are dropped.
    return dagId == null || !skippedDAGs.contains(dagId);
}
Also used : TezDAGID(org.apache.tez.dag.records.TezDAGID) DAGRecoveredEvent(org.apache.tez.dag.history.events.DAGRecoveredEvent) HistoryEventType(org.apache.tez.dag.history.HistoryEventType) DAGSubmittedEvent(org.apache.tez.dag.history.events.DAGSubmittedEvent)

Example 5 with DAGRecoveredEvent

use of org.apache.tez.dag.history.events.DAGRecoveredEvent in project tez by apache.

In the class TestHistoryEventHandler, method testLogLevelWithRecovery.

/**
 * Feeds a generated event sequence, with the event at index 1 replaced by a
 * {@code DAGRecoveredEvent}, through a handler created for the given log level,
 * and checks how many events reached the in-memory logging service.
 *
 * @param level the history log level the handler is created with
 * @param expectedCount the number of events expected in the in-memory service
 */
private void testLogLevelWithRecovery(HistoryLogLevel level, int expectedCount) {
    HistoryEventHandler eventHandler = createHandler(level);
    InMemoryHistoryLoggingService.events.clear();

    TezDAGID recoveredDagId = TezDAGID.getInstance(appId, 1);
    List<DAGHistoryEvent> sequence = makeHistoryEvents(recoveredDagId, eventHandler.getConfig());

    // Replace the event at index 1 with a recovery event for the same DAG.
    DAGRecoveredEvent recovery = new DAGRecoveredEvent(attemptId, recoveredDagId, "test", user, 0, null);
    sequence.set(1, new DAGHistoryEvent(recoveredDagId, recovery));

    for (DAGHistoryEvent historyEvent : sequence) {
        eventHandler.handle(historyEvent);
    }

    int loggedCount = InMemoryHistoryLoggingService.events.size();
    assertEquals("Failed for level: " + level, expectedCount, loggedCount);
    eventHandler.stop();
}
Also used : TezDAGID(org.apache.tez.dag.records.TezDAGID) DAGRecoveredEvent(org.apache.tez.dag.history.events.DAGRecoveredEvent)

Aggregations

DAGRecoveredEvent (org.apache.tez.dag.history.events.DAGRecoveredEvent)8 HistoryEventType (org.apache.tez.dag.history.HistoryEventType)4 DAGSubmittedEvent (org.apache.tez.dag.history.events.DAGSubmittedEvent)4 Test (org.junit.Test)4 AMStartedEvent (org.apache.tez.dag.history.events.AMStartedEvent)3 TezDAGID (org.apache.tez.dag.records.TezDAGID)3 Configuration (org.apache.hadoop.conf.Configuration)2 TimelineEntity (org.apache.hadoop.yarn.api.records.timeline.TimelineEntity)2 TimelineEvent (org.apache.hadoop.yarn.api.records.timeline.TimelineEvent)2 HistoryEvent (org.apache.tez.dag.history.HistoryEvent)2 AMLaunchedEvent (org.apache.tez.dag.history.events.AMLaunchedEvent)2 AppLaunchedEvent (org.apache.tez.dag.history.events.AppLaunchedEvent)2 ContainerLaunchedEvent (org.apache.tez.dag.history.events.ContainerLaunchedEvent)2 ContainerStoppedEvent (org.apache.tez.dag.history.events.ContainerStoppedEvent)2 DAGCommitStartedEvent (org.apache.tez.dag.history.events.DAGCommitStartedEvent)2 DAGFinishedEvent (org.apache.tez.dag.history.events.DAGFinishedEvent)2 DAGInitializedEvent (org.apache.tez.dag.history.events.DAGInitializedEvent)2 DAGKillRequestEvent (org.apache.tez.dag.history.events.DAGKillRequestEvent)2 DAGStartedEvent (org.apache.tez.dag.history.events.DAGStartedEvent)2 TaskAttemptFinishedEvent (org.apache.tez.dag.history.events.TaskAttemptFinishedEvent)2