use of org.apache.tez.dag.history.events.DAGRecoveredEvent in project tez by apache.
the class DAGAppMaster method serviceStart.
@Override
public synchronized void serviceStart() throws Exception {
// start all the components
startServices();
super.serviceStart();
boolean invalidSession = false;
if (isSession && !recoveryEnabled && appAttemptID.getAttemptId() > 1) {
String err = INVALID_SESSION_ERR_MSG;
LOG.error(err);
addDiagnostic(err);
this.state = DAGAppMasterState.ERROR;
invalidSession = true;
}
if (versionMismatch || invalidSession) {
// Short-circuit and return as no DAG should be run
this.taskSchedulerManager.setShouldUnregisterFlag();
shutdownHandler.shutdown();
return;
}
this.appsStartTime = clock.getTime();
AMStartedEvent startEvent = new AMStartedEvent(appAttemptID, appsStartTime, appMasterUgi.getShortUserName());
historyEventHandler.handle(new DAGHistoryEvent(startEvent));
this.lastDAGCompletionTime = clock.getTime();
DAGRecoveryData recoveredDAGData;
try {
recoveredDAGData = recoverDAG();
} catch (IOException e) {
LOG.error("Error occurred when trying to recover data from previous attempt." + " Shutting down AM", e);
this.state = DAGAppMasterState.ERROR;
this.taskSchedulerManager.setShouldUnregisterFlag();
shutdownHandler.shutdown();
return;
}
if (!isSession) {
LOG.info("In Non-Session mode.");
} else {
LOG.info("In Session mode. Waiting for DAG over RPC");
this.state = DAGAppMasterState.IDLE;
}
if (recoveredDAGData != null) {
if (recoveredDAGData.cumulativeAdditionalResources != null) {
recoveredDAGData.additionalUrlsForClasspath = processAdditionalResources(recoveredDAGData.recoveredDagID, recoveredDAGData.cumulativeAdditionalResources);
amResources.putAll(recoveredDAGData.cumulativeAdditionalResources);
cumulativeAdditionalResources.putAll(recoveredDAGData.cumulativeAdditionalResources);
}
if (recoveredDAGData.isSessionStopped) {
LOG.info("AM crashed when shutting down in the previous attempt" + ", continue the shutdown and recover it to SUCCEEDED");
this.sessionStopped.set(true);
return;
}
if (recoveredDAGData.isCompleted || recoveredDAGData.nonRecoverable) {
LOG.info("Found previous DAG in completed or non-recoverable state" + ", dagId=" + recoveredDAGData.recoveredDagID + ", isCompleted=" + recoveredDAGData.isCompleted + ", isNonRecoverable=" + recoveredDAGData.nonRecoverable + ", state=" + (recoveredDAGData.dagState == null ? "null" : recoveredDAGData.dagState) + ", failureReason=" + recoveredDAGData.reason);
_updateLoggers(recoveredDAGData.recoveredDAG, "");
if (recoveredDAGData.nonRecoverable) {
addDiagnostic("DAG " + recoveredDAGData.recoveredDagID + " can not be recovered due to " + recoveredDAGData.reason);
DAGEventRecoverEvent recoverDAGEvent = new DAGEventRecoverEvent(recoveredDAGData.recoveredDAG.getID(), DAGState.FAILED, recoveredDAGData);
DAGRecoveredEvent dagRecoveredEvent = new DAGRecoveredEvent(this.appAttemptID, recoveredDAGData.recoveredDAG.getID(), recoveredDAGData.recoveredDAG.getName(), recoveredDAGData.recoveredDAG.getUserName(), this.clock.getTime(), DAGState.FAILED, recoveredDAGData.reason, this.containerLogs);
dagRecoveredEvent.setHistoryLoggingEnabled(recoveredDAGData.recoveredDAG.getConf().getBoolean(TezConfiguration.TEZ_DAG_HISTORY_LOGGING_ENABLED, TezConfiguration.TEZ_DAG_HISTORY_LOGGING_ENABLED_DEFAULT));
this.historyEventHandler.handle(new DAGHistoryEvent(recoveredDAGData.recoveredDAG.getID(), dagRecoveredEvent));
dagEventDispatcher.handle(recoverDAGEvent);
this.state = DAGAppMasterState.RUNNING;
} else {
DAGEventRecoverEvent recoverDAGEvent = new DAGEventRecoverEvent(recoveredDAGData.recoveredDAG.getID(), recoveredDAGData.dagState, recoveredDAGData);
DAGRecoveredEvent dagRecoveredEvent = new DAGRecoveredEvent(this.appAttemptID, recoveredDAGData.recoveredDAG.getID(), recoveredDAGData.recoveredDAG.getName(), recoveredDAGData.recoveredDAG.getUserName(), this.clock.getTime(), recoveredDAGData.dagState, null, this.containerLogs);
this.historyEventHandler.handle(new DAGHistoryEvent(recoveredDAGData.recoveredDAG.getID(), dagRecoveredEvent));
dagEventDispatcher.handle(recoverDAGEvent);
this.state = DAGAppMasterState.RUNNING;
}
} else {
LOG.info("Found DAG to recover, dagId=" + recoveredDAGData.recoveredDAG.getID());
_updateLoggers(recoveredDAGData.recoveredDAG, "");
DAGRecoveredEvent dagRecoveredEvent = new DAGRecoveredEvent(this.appAttemptID, recoveredDAGData.recoveredDAG.getID(), recoveredDAGData.recoveredDAG.getName(), recoveredDAGData.recoveredDAG.getUserName(), this.clock.getTime(), this.containerLogs);
this.historyEventHandler.handle(new DAGHistoryEvent(recoveredDAGData.recoveredDAG.getID(), dagRecoveredEvent));
DAGEventRecoverEvent recoverDAGEvent = new DAGEventRecoverEvent(recoveredDAGData.recoveredDAG.getID(), recoveredDAGData);
dagEventDispatcher.handle(recoverDAGEvent);
this.state = DAGAppMasterState.RUNNING;
}
} else {
if (!isSession) {
// No dag recovered - in non-session, just restart the original DAG
dagCounter.set(0);
startDAG();
}
}
if (isSession && sessionTimeoutInterval >= 0) {
this.dagSubmissionTimer = new Timer("DAGSubmissionTimer", true);
this.dagSubmissionTimer.scheduleAtFixedRate(new TimerTask() {
@Override
public void run() {
try {
checkAndHandleSessionTimeout();
} catch (TezException e) {
LOG.error("Error when checking AM session timeout", e);
}
}
}, sessionTimeoutInterval, sessionTimeoutInterval / 10);
}
// Ignore client heartbeat timeout in local mode or non-session mode
if (!isLocal && isSession && clientAMHeartbeatTimeoutIntervalMillis > 0) {
// reset heartbeat time
clientHandler.updateLastHeartbeatTime();
this.clientAMHeartBeatTimeoutService = Executors.newSingleThreadScheduledExecutor(new ThreadFactoryBuilder().setDaemon(true).setNameFormat("ClientAMHeartBeatKeepAliveCheck #%d").build());
this.clientAMHeartBeatTimeoutService.schedule(new Runnable() {
@Override
public void run() {
try {
long nextExpiry = checkAndHandleDAGClientTimeout();
if (nextExpiry > 0) {
clientAMHeartBeatTimeoutService.schedule(this, nextExpiry, TimeUnit.MILLISECONDS);
}
} catch (TezException e) {
// Cannot be thrown unless the AM is being tried to shutdown so no need to
// reschedule the timer task
LOG.error("Error when checking Client AM heartbeat timeout", e);
}
}
}, clientAMHeartbeatTimeoutIntervalMillis, TimeUnit.MILLISECONDS);
}
}
use of org.apache.tez.dag.history.events.DAGRecoveredEvent in project tez by apache.
the class TestHistoryEventTimelineConversion method testConvertDAGRecoveredEvent2.
@Test(timeout = 5000)
public void testConvertDAGRecoveredEvent2() {
long recoverTime = random.nextLong();
DAGRecoveredEvent event = new DAGRecoveredEvent(applicationAttemptId, tezDAGID, dagPlan.getName(), user, recoverTime, DAGState.ERROR, "mock reason", containerLogs);
List<TimelineEntity> entities = HistoryEventTimelineConversion.convertToTimelineEntities(event);
Assert.assertEquals(1, entities.size());
TimelineEntity timelineEntity = entities.get(0);
Assert.assertEquals(EntityTypes.TEZ_DAG_ID.name(), timelineEntity.getEntityType());
Assert.assertEquals(tezDAGID.toString(), timelineEntity.getEntityId());
Assert.assertEquals(0, timelineEntity.getRelatedEntities().size());
Assert.assertEquals(1, timelineEntity.getEvents().size());
TimelineEvent timelineEvent = timelineEntity.getEvents().get(0);
Assert.assertEquals(HistoryEventType.DAG_RECOVERED.name(), timelineEvent.getEventType());
Assert.assertEquals(recoverTime, timelineEvent.getTimestamp());
Assert.assertTrue(timelineEvent.getEventInfo().containsKey(ATSConstants.APPLICATION_ATTEMPT_ID));
Assert.assertEquals(applicationAttemptId.toString(), timelineEvent.getEventInfo().get(ATSConstants.APPLICATION_ATTEMPT_ID));
Assert.assertEquals(DAGState.ERROR.name(), timelineEvent.getEventInfo().get(ATSConstants.DAG_STATE));
Assert.assertEquals("mock reason", timelineEvent.getEventInfo().get(ATSConstants.RECOVERY_FAILURE_REASON));
Assert.assertEquals(3, timelineEntity.getPrimaryFilters().size());
Assert.assertTrue(timelineEntity.getPrimaryFilters().get(ATSConstants.APPLICATION_ID).contains(applicationId.toString()));
Assert.assertTrue(timelineEntity.getPrimaryFilters().get(ATSConstants.DAG_NAME).contains("DAGPlanMock"));
Assert.assertTrue(timelineEntity.getPrimaryFilters().get(ATSConstants.USER).contains(user));
Assert.assertEquals(containerLogs, timelineEntity.getOtherInfo().get(ATSConstants.IN_PROGRESS_LOGS_URL + "_" + applicationAttemptId.getAttemptId()));
}
use of org.apache.tez.dag.history.events.DAGRecoveredEvent in project tez by apache.
the class TestHistoryEventTimelineConversion method testHandlerExists.
@Test(timeout = 5000)
public void testHandlerExists() throws JSONException {
for (HistoryEventType eventType : HistoryEventType.values()) {
HistoryEvent event = null;
switch(eventType) {
case APP_LAUNCHED:
event = new AppLaunchedEvent(applicationId, random.nextInt(), random.nextInt(), user, new Configuration(false), null);
break;
case AM_LAUNCHED:
event = new AMLaunchedEvent(applicationAttemptId, random.nextInt(), random.nextInt(), user);
break;
case AM_STARTED:
event = new AMStartedEvent(applicationAttemptId, random.nextInt(), user);
break;
case DAG_SUBMITTED:
event = new DAGSubmittedEvent(tezDAGID, random.nextInt(), dagPlan, applicationAttemptId, null, user, null, containerLogs, null);
break;
case DAG_INITIALIZED:
event = new DAGInitializedEvent(tezDAGID, random.nextInt(), user, dagPlan.getName(), null);
break;
case DAG_STARTED:
event = new DAGStartedEvent(tezDAGID, random.nextInt(), user, dagPlan.getName());
break;
case DAG_FINISHED:
event = new DAGFinishedEvent(tezDAGID, random.nextInt(), random.nextInt(), DAGState.ERROR, null, null, user, dagPlan.getName(), null, applicationAttemptId, dagPlan);
break;
case VERTEX_INITIALIZED:
event = new VertexInitializedEvent(tezVertexID, "v1", random.nextInt(), random.nextInt(), random.nextInt(), "proc", null, null, null);
break;
case VERTEX_STARTED:
event = new VertexStartedEvent(tezVertexID, random.nextInt(), random.nextInt());
break;
case VERTEX_CONFIGURE_DONE:
event = new VertexConfigurationDoneEvent(tezVertexID, 0L, 1, null, null, null, true);
break;
case VERTEX_FINISHED:
event = new VertexFinishedEvent(tezVertexID, "v1", 1, random.nextInt(), random.nextInt(), random.nextInt(), random.nextInt(), random.nextInt(), VertexState.ERROR, null, null, null, null, null);
break;
case TASK_STARTED:
event = new TaskStartedEvent(tezTaskID, "v1", random.nextInt(), random.nextInt());
break;
case TASK_FINISHED:
event = new TaskFinishedEvent(tezTaskID, "v1", random.nextInt(), random.nextInt(), tezTaskAttemptID, TaskState.FAILED, null, null, 0);
break;
case TASK_ATTEMPT_STARTED:
event = new TaskAttemptStartedEvent(tezTaskAttemptID, "v1", random.nextInt(), containerId, nodeId, null, null, "nodeHttpAddress");
break;
case TASK_ATTEMPT_FINISHED:
event = new TaskAttemptFinishedEvent(tezTaskAttemptID, "v1", random.nextInt(), random.nextInt(), TaskAttemptState.FAILED, TaskFailureType.NON_FATAL, TaskAttemptTerminationCause.OUTPUT_LOST, null, null, null, null, 0, null, 0, containerId, nodeId, null, null, "nodeHttpAddress");
break;
case CONTAINER_LAUNCHED:
event = new ContainerLaunchedEvent(containerId, random.nextInt(), applicationAttemptId);
break;
case CONTAINER_STOPPED:
event = new ContainerStoppedEvent(containerId, random.nextInt(), -1, applicationAttemptId);
break;
case DAG_COMMIT_STARTED:
event = new DAGCommitStartedEvent();
break;
case VERTEX_COMMIT_STARTED:
event = new VertexCommitStartedEvent();
break;
case VERTEX_GROUP_COMMIT_STARTED:
event = new VertexGroupCommitStartedEvent();
break;
case VERTEX_GROUP_COMMIT_FINISHED:
event = new VertexGroupCommitFinishedEvent();
break;
case DAG_RECOVERED:
event = new DAGRecoveredEvent(applicationAttemptId, tezDAGID, dagPlan.getName(), user, random.nextLong(), containerLogs);
break;
case DAG_KILL_REQUEST:
event = new DAGKillRequestEvent();
break;
default:
Assert.fail("Unhandled event type " + eventType);
}
if (event == null || !event.isHistoryEvent()) {
continue;
}
HistoryEventTimelineConversion.convertToTimelineEntities(event);
}
}
use of org.apache.tez.dag.history.events.DAGRecoveredEvent in project tez by apache.
the class ATSV15HistoryLoggingService method isValidEvent.
private boolean isValidEvent(DAGHistoryEvent event) {
HistoryEventType eventType = event.getHistoryEvent().getEventType();
TezDAGID dagId = event.getDagID();
if (eventType.equals(HistoryEventType.DAG_SUBMITTED)) {
DAGSubmittedEvent dagSubmittedEvent = (DAGSubmittedEvent) event.getHistoryEvent();
String dagName = dagSubmittedEvent.getDAGName();
if ((dagName != null && dagName.startsWith(TezConstants.TEZ_PREWARM_DAG_NAME_PREFIX)) || (!dagSubmittedEvent.isHistoryLoggingEnabled())) {
// Skip recording pre-warm DAG events
skippedDAGs.add(dagId);
return false;
}
}
if (eventType.equals(HistoryEventType.DAG_RECOVERED)) {
DAGRecoveredEvent dagRecoveredEvent = (DAGRecoveredEvent) event.getHistoryEvent();
if (!dagRecoveredEvent.isHistoryLoggingEnabled()) {
skippedDAGs.add(dagRecoveredEvent.getDagID());
return false;
}
}
if (eventType.equals(HistoryEventType.DAG_FINISHED)) {
// No more events should be seen after this point.
if (skippedDAGs.remove(dagId)) {
return false;
}
}
if (dagId != null && skippedDAGs.contains(dagId)) {
// Skip pre-warm DAGs
return false;
}
return true;
}
use of org.apache.tez.dag.history.events.DAGRecoveredEvent in project tez by apache.
the class TestHistoryEventHandler method testLogLevelWithRecovery.
private void testLogLevelWithRecovery(HistoryLogLevel level, int expectedCount) {
HistoryEventHandler handler = createHandler(level);
InMemoryHistoryLoggingService.events.clear();
TezDAGID dagId = TezDAGID.getInstance(appId, 1);
List<DAGHistoryEvent> events = makeHistoryEvents(dagId, handler.getConfig());
events.set(1, new DAGHistoryEvent(dagId, new DAGRecoveredEvent(attemptId, dagId, "test", user, 0, null)));
for (DAGHistoryEvent event : events) {
handler.handle(event);
}
assertEquals("Failed for level: " + level, expectedCount, InMemoryHistoryLoggingService.events.size());
handler.stop();
}
Aggregations