use of org.apache.tez.dag.history.events.AMStartedEvent in project tez by apache.
the class DAGAppMaster method serviceStart.
@Override
public synchronized void serviceStart() throws Exception {
// start all the components
startServices();
super.serviceStart();
boolean invalidSession = false;
if (isSession && !recoveryEnabled && appAttemptID.getAttemptId() > 1) {
String err = INVALID_SESSION_ERR_MSG;
LOG.error(err);
addDiagnostic(err);
this.state = DAGAppMasterState.ERROR;
invalidSession = true;
}
if (versionMismatch || invalidSession) {
// Short-circuit and return as no DAG should be run
this.taskSchedulerManager.setShouldUnregisterFlag();
shutdownHandler.shutdown();
return;
}
this.appsStartTime = clock.getTime();
AMStartedEvent startEvent = new AMStartedEvent(appAttemptID, appsStartTime, appMasterUgi.getShortUserName());
historyEventHandler.handle(new DAGHistoryEvent(startEvent));
this.lastDAGCompletionTime = clock.getTime();
DAGRecoveryData recoveredDAGData;
try {
recoveredDAGData = recoverDAG();
} catch (IOException e) {
LOG.error("Error occurred when trying to recover data from previous attempt." + " Shutting down AM", e);
this.state = DAGAppMasterState.ERROR;
this.taskSchedulerManager.setShouldUnregisterFlag();
shutdownHandler.shutdown();
return;
}
if (!isSession) {
LOG.info("In Non-Session mode.");
} else {
LOG.info("In Session mode. Waiting for DAG over RPC");
this.state = DAGAppMasterState.IDLE;
}
if (recoveredDAGData != null) {
if (recoveredDAGData.cumulativeAdditionalResources != null) {
recoveredDAGData.additionalUrlsForClasspath = processAdditionalResources(recoveredDAGData.recoveredDagID, recoveredDAGData.cumulativeAdditionalResources);
amResources.putAll(recoveredDAGData.cumulativeAdditionalResources);
cumulativeAdditionalResources.putAll(recoveredDAGData.cumulativeAdditionalResources);
}
if (recoveredDAGData.isSessionStopped) {
LOG.info("AM crashed when shutting down in the previous attempt" + ", continue the shutdown and recover it to SUCCEEDED");
this.sessionStopped.set(true);
return;
}
if (recoveredDAGData.isCompleted || recoveredDAGData.nonRecoverable) {
LOG.info("Found previous DAG in completed or non-recoverable state" + ", dagId=" + recoveredDAGData.recoveredDagID + ", isCompleted=" + recoveredDAGData.isCompleted + ", isNonRecoverable=" + recoveredDAGData.nonRecoverable + ", state=" + (recoveredDAGData.dagState == null ? "null" : recoveredDAGData.dagState) + ", failureReason=" + recoveredDAGData.reason);
_updateLoggers(recoveredDAGData.recoveredDAG, "");
if (recoveredDAGData.nonRecoverable) {
addDiagnostic("DAG " + recoveredDAGData.recoveredDagID + " can not be recovered due to " + recoveredDAGData.reason);
DAGEventRecoverEvent recoverDAGEvent = new DAGEventRecoverEvent(recoveredDAGData.recoveredDAG.getID(), DAGState.FAILED, recoveredDAGData);
DAGRecoveredEvent dagRecoveredEvent = new DAGRecoveredEvent(this.appAttemptID, recoveredDAGData.recoveredDAG.getID(), recoveredDAGData.recoveredDAG.getName(), recoveredDAGData.recoveredDAG.getUserName(), this.clock.getTime(), DAGState.FAILED, recoveredDAGData.reason, this.containerLogs);
dagRecoveredEvent.setHistoryLoggingEnabled(recoveredDAGData.recoveredDAG.getConf().getBoolean(TezConfiguration.TEZ_DAG_HISTORY_LOGGING_ENABLED, TezConfiguration.TEZ_DAG_HISTORY_LOGGING_ENABLED_DEFAULT));
this.historyEventHandler.handle(new DAGHistoryEvent(recoveredDAGData.recoveredDAG.getID(), dagRecoveredEvent));
dagEventDispatcher.handle(recoverDAGEvent);
this.state = DAGAppMasterState.RUNNING;
} else {
DAGEventRecoverEvent recoverDAGEvent = new DAGEventRecoverEvent(recoveredDAGData.recoveredDAG.getID(), recoveredDAGData.dagState, recoveredDAGData);
DAGRecoveredEvent dagRecoveredEvent = new DAGRecoveredEvent(this.appAttemptID, recoveredDAGData.recoveredDAG.getID(), recoveredDAGData.recoveredDAG.getName(), recoveredDAGData.recoveredDAG.getUserName(), this.clock.getTime(), recoveredDAGData.dagState, null, this.containerLogs);
this.historyEventHandler.handle(new DAGHistoryEvent(recoveredDAGData.recoveredDAG.getID(), dagRecoveredEvent));
dagEventDispatcher.handle(recoverDAGEvent);
this.state = DAGAppMasterState.RUNNING;
}
} else {
LOG.info("Found DAG to recover, dagId=" + recoveredDAGData.recoveredDAG.getID());
_updateLoggers(recoveredDAGData.recoveredDAG, "");
DAGRecoveredEvent dagRecoveredEvent = new DAGRecoveredEvent(this.appAttemptID, recoveredDAGData.recoveredDAG.getID(), recoveredDAGData.recoveredDAG.getName(), recoveredDAGData.recoveredDAG.getUserName(), this.clock.getTime(), this.containerLogs);
this.historyEventHandler.handle(new DAGHistoryEvent(recoveredDAGData.recoveredDAG.getID(), dagRecoveredEvent));
DAGEventRecoverEvent recoverDAGEvent = new DAGEventRecoverEvent(recoveredDAGData.recoveredDAG.getID(), recoveredDAGData);
dagEventDispatcher.handle(recoverDAGEvent);
this.state = DAGAppMasterState.RUNNING;
}
} else {
if (!isSession) {
// No dag recovered - in non-session, just restart the original DAG
dagCounter.set(0);
startDAG();
}
}
if (isSession && sessionTimeoutInterval >= 0) {
this.dagSubmissionTimer = new Timer("DAGSubmissionTimer", true);
this.dagSubmissionTimer.scheduleAtFixedRate(new TimerTask() {
@Override
public void run() {
try {
checkAndHandleSessionTimeout();
} catch (TezException e) {
LOG.error("Error when checking AM session timeout", e);
}
}
}, sessionTimeoutInterval, sessionTimeoutInterval / 10);
}
// Ignore client heartbeat timeout in local mode or non-session mode
if (!isLocal && isSession && clientAMHeartbeatTimeoutIntervalMillis > 0) {
// reset heartbeat time
clientHandler.updateLastHeartbeatTime();
this.clientAMHeartBeatTimeoutService = Executors.newSingleThreadScheduledExecutor(new ThreadFactoryBuilder().setDaemon(true).setNameFormat("ClientAMHeartBeatKeepAliveCheck #%d").build());
this.clientAMHeartBeatTimeoutService.schedule(new Runnable() {
@Override
public void run() {
try {
long nextExpiry = checkAndHandleDAGClientTimeout();
if (nextExpiry > 0) {
clientAMHeartBeatTimeoutService.schedule(this, nextExpiry, TimeUnit.MILLISECONDS);
}
} catch (TezException e) {
// Cannot be thrown unless the AM is being tried to shutdown so no need to
// reschedule the timer task
LOG.error("Error when checking Client AM heartbeat timeout", e);
}
}
}, clientAMHeartbeatTimeoutIntervalMillis, TimeUnit.MILLISECONDS);
}
}
use of org.apache.tez.dag.history.events.AMStartedEvent in project tez by apache.
the class TestATSHistoryLoggingService method makeHistoryEvents.
private List<DAGHistoryEvent> makeHistoryEvents(TezDAGID dagId, ATSHistoryLoggingService service) {
List<DAGHistoryEvent> historyEvents = new ArrayList<>();
long time = System.currentTimeMillis();
Configuration conf = new Configuration(service.getConfig());
historyEvents.add(new DAGHistoryEvent(null, new AMStartedEvent(attemptId, time, "user")));
historyEvents.add(new DAGHistoryEvent(dagId, new DAGSubmittedEvent(dagId, time, DAGPlan.getDefaultInstance(), attemptId, null, "user", conf, null, "default")));
TezVertexID vertexID = TezVertexID.getInstance(dagId, 1);
historyEvents.add(new DAGHistoryEvent(dagId, new VertexStartedEvent(vertexID, time, time)));
TezTaskID tezTaskID = TezTaskID.getInstance(vertexID, 1);
historyEvents.add(new DAGHistoryEvent(dagId, new TaskStartedEvent(tezTaskID, "test", time, time)));
historyEvents.add(new DAGHistoryEvent(dagId, new TaskAttemptStartedEvent(TezTaskAttemptID.getInstance(tezTaskID, 1), "test", time, ContainerId.newContainerId(attemptId, 1), NodeId.newInstance("localhost", 8765), null, null, null)));
return historyEvents;
}
use of org.apache.tez.dag.history.events.AMStartedEvent in project tez by apache.
the class TestHistoryEventTimelineConversion method testConvertAMStartedEvent.
@Test(timeout = 5000)
public void testConvertAMStartedEvent() {
long startTime = random.nextLong();
AMStartedEvent event = new AMStartedEvent(applicationAttemptId, startTime, user);
List<TimelineEntity> entities = HistoryEventTimelineConversion.convertToTimelineEntities(event);
Assert.assertEquals(1, entities.size());
TimelineEntity timelineEntity = entities.get(0);
Assert.assertEquals("tez_" + applicationAttemptId.toString(), timelineEntity.getEntityId());
Assert.assertEquals(EntityTypes.TEZ_APPLICATION_ATTEMPT.name(), timelineEntity.getEntityType());
final Map<String, Set<String>> relatedEntities = timelineEntity.getRelatedEntities();
Assert.assertEquals(0, relatedEntities.size());
final Map<String, Set<Object>> primaryFilters = timelineEntity.getPrimaryFilters();
Assert.assertEquals(2, primaryFilters.size());
Assert.assertTrue(primaryFilters.get(ATSConstants.USER).contains(user));
Assert.assertTrue(primaryFilters.get(ATSConstants.APPLICATION_ID).contains(applicationId.toString()));
Assert.assertEquals(1, timelineEntity.getEvents().size());
TimelineEvent evt = timelineEntity.getEvents().get(0);
Assert.assertEquals(HistoryEventType.AM_STARTED.name(), evt.getEventType());
Assert.assertEquals(startTime, evt.getTimestamp());
}
use of org.apache.tez.dag.history.events.AMStartedEvent in project tez by apache.
the class TestHistoryEventTimelineConversion method testHandlerExists.
@Test(timeout = 5000)
public void testHandlerExists() throws JSONException {
for (HistoryEventType eventType : HistoryEventType.values()) {
HistoryEvent event = null;
switch(eventType) {
case APP_LAUNCHED:
event = new AppLaunchedEvent(applicationId, random.nextInt(), random.nextInt(), user, new Configuration(false), null);
break;
case AM_LAUNCHED:
event = new AMLaunchedEvent(applicationAttemptId, random.nextInt(), random.nextInt(), user);
break;
case AM_STARTED:
event = new AMStartedEvent(applicationAttemptId, random.nextInt(), user);
break;
case DAG_SUBMITTED:
event = new DAGSubmittedEvent(tezDAGID, random.nextInt(), dagPlan, applicationAttemptId, null, user, null, containerLogs, null);
break;
case DAG_INITIALIZED:
event = new DAGInitializedEvent(tezDAGID, random.nextInt(), user, dagPlan.getName(), null);
break;
case DAG_STARTED:
event = new DAGStartedEvent(tezDAGID, random.nextInt(), user, dagPlan.getName());
break;
case DAG_FINISHED:
event = new DAGFinishedEvent(tezDAGID, random.nextInt(), random.nextInt(), DAGState.ERROR, null, null, user, dagPlan.getName(), null, applicationAttemptId, dagPlan);
break;
case VERTEX_INITIALIZED:
event = new VertexInitializedEvent(tezVertexID, "v1", random.nextInt(), random.nextInt(), random.nextInt(), "proc", null, null, null);
break;
case VERTEX_STARTED:
event = new VertexStartedEvent(tezVertexID, random.nextInt(), random.nextInt());
break;
case VERTEX_CONFIGURE_DONE:
event = new VertexConfigurationDoneEvent(tezVertexID, 0L, 1, null, null, null, true);
break;
case VERTEX_FINISHED:
event = new VertexFinishedEvent(tezVertexID, "v1", 1, random.nextInt(), random.nextInt(), random.nextInt(), random.nextInt(), random.nextInt(), VertexState.ERROR, null, null, null, null, null);
break;
case TASK_STARTED:
event = new TaskStartedEvent(tezTaskID, "v1", random.nextInt(), random.nextInt());
break;
case TASK_FINISHED:
event = new TaskFinishedEvent(tezTaskID, "v1", random.nextInt(), random.nextInt(), tezTaskAttemptID, TaskState.FAILED, null, null, 0);
break;
case TASK_ATTEMPT_STARTED:
event = new TaskAttemptStartedEvent(tezTaskAttemptID, "v1", random.nextInt(), containerId, nodeId, null, null, "nodeHttpAddress");
break;
case TASK_ATTEMPT_FINISHED:
event = new TaskAttemptFinishedEvent(tezTaskAttemptID, "v1", random.nextInt(), random.nextInt(), TaskAttemptState.FAILED, TaskFailureType.NON_FATAL, TaskAttemptTerminationCause.OUTPUT_LOST, null, null, null, null, 0, null, 0, containerId, nodeId, null, null, "nodeHttpAddress");
break;
case CONTAINER_LAUNCHED:
event = new ContainerLaunchedEvent(containerId, random.nextInt(), applicationAttemptId);
break;
case CONTAINER_STOPPED:
event = new ContainerStoppedEvent(containerId, random.nextInt(), -1, applicationAttemptId);
break;
case DAG_COMMIT_STARTED:
event = new DAGCommitStartedEvent();
break;
case VERTEX_COMMIT_STARTED:
event = new VertexCommitStartedEvent();
break;
case VERTEX_GROUP_COMMIT_STARTED:
event = new VertexGroupCommitStartedEvent();
break;
case VERTEX_GROUP_COMMIT_FINISHED:
event = new VertexGroupCommitFinishedEvent();
break;
case DAG_RECOVERED:
event = new DAGRecoveredEvent(applicationAttemptId, tezDAGID, dagPlan.getName(), user, random.nextLong(), containerLogs);
break;
case DAG_KILL_REQUEST:
event = new DAGKillRequestEvent();
break;
default:
Assert.fail("Unhandled event type " + eventType);
}
if (event == null || !event.isHistoryEvent()) {
continue;
}
HistoryEventTimelineConversion.convertToTimelineEntities(event);
}
}
use of org.apache.tez.dag.history.events.AMStartedEvent in project tez by apache.
the class TestATSV15HistoryLoggingService method makeHistoryEvents.
private List<DAGHistoryEvent> makeHistoryEvents(TezDAGID dagId, ATSV15HistoryLoggingService service) {
List<DAGHistoryEvent> historyEvents = new ArrayList<>();
long time = System.currentTimeMillis();
Configuration conf = new Configuration(service.getConfig());
historyEvents.add(new DAGHistoryEvent(null, new AMStartedEvent(attemptId, time, user)));
historyEvents.add(new DAGHistoryEvent(dagId, new DAGSubmittedEvent(dagId, time, DAGPlan.getDefaultInstance(), attemptId, null, user, conf, null, "default")));
TezVertexID vertexID = TezVertexID.getInstance(dagId, 1);
historyEvents.add(new DAGHistoryEvent(dagId, new VertexStartedEvent(vertexID, time, time)));
TezTaskID tezTaskID = TezTaskID.getInstance(vertexID, 1);
historyEvents.add(new DAGHistoryEvent(dagId, new TaskStartedEvent(tezTaskID, "test", time, time)));
historyEvents.add(new DAGHistoryEvent(dagId, new TaskAttemptStartedEvent(TezTaskAttemptID.getInstance(tezTaskID, 1), "test", time, ContainerId.newContainerId(attemptId, 1), NodeId.newInstance("localhost", 8765), null, null, null)));
return historyEvents;
}
Aggregations