use of org.apache.tez.dag.history.DAGHistoryEvent in project tez by apache.
the class DAGAppMaster method serviceInit.
/**
 * Initializes the application master from the supplied configuration.
 *
 * <p>In order, this method: stores the configuration, parses the task scheduler /
 * container launcher / task communicator plugin descriptors, compares the client
 * version with the AM version, creates the central dispatcher, resolves the staging
 * and recovery directories, creates the handlers and managers and adds them via
 * {@code addIfService}, registers the event handlers on the dispatcher, loads the
 * session local resources (session mode only), creates the shared executor, and then
 * initializes the registered services. When no version mismatch was flagged it emits
 * the launch history events and moves to {@code INITED}; otherwise the state is set
 * to {@code ERROR}.
 *
 * @param conf the configuration used for the AM and passed to the created components;
 *             in local mode two of its settings are overwritten by this method
 * @throws Exception if any step fails, including a missing session token
 *                   (thrown as a {@link RuntimeException})
 */
@Override
public synchronized void serviceInit(final Configuration conf) throws Exception {
this.amConf = conf;
initResourceCalculatorPlugins();
this.hadoopShim = new HadoopShimsLoader(this.amConf).getHadoopShim();
// NOTE(review): the local is named "...Secs" but it is read from a *_MILLIS config key
// — confirm the unit expected by setSleepTimeBeforeExit.
long sleepTimeBeforeSecs = this.amConf.getLong(TezConfiguration.TEZ_AM_SLEEP_TIME_BEFORE_EXIT_MILLIS, TezConstants.TEZ_DAG_SLEEP_TIME_BEFORE_EXIT);
// A negative value leaves the shutdown handler's sleep time untouched.
if (sleepTimeBeforeSecs >= 0) {
this.shutdownHandler.setSleepTimeBeforeExit(sleepTimeBeforeSecs);
}
this.isLocal = conf.getBoolean(TezConfiguration.TEZ_LOCAL_MODE, TezConfiguration.TEZ_LOCAL_MODE_DEFAULT);
// The AM configuration serialized as a payload; handed to parseAllPlugins as the default payload.
UserPayload defaultPayload = TezUtils.createUserPayloadFromConf(amConf);
List<NamedEntityDescriptor> taskSchedulerDescriptors = Lists.newLinkedList();
List<NamedEntityDescriptor> containerLauncherDescriptors = Lists.newLinkedList();
List<NamedEntityDescriptor> taskCommunicatorDescriptors = Lists.newLinkedList();
// Fills the three descriptor lists above (and the corresponding name maps) from the plugin descriptor proto.
parseAllPlugins(taskSchedulerDescriptors, taskSchedulers, containerLauncherDescriptors, containerLaunchers, taskCommunicatorDescriptors, taskCommunicators, amPluginDescriptorProto, isLocal, defaultPayload);
LOG.info(buildPluginComponentLog(taskSchedulerDescriptors, taskSchedulers, "TaskSchedulers"));
LOG.info(buildPluginComponentLog(containerLauncherDescriptors, containerLaunchers, "ContainerLaunchers"));
LOG.info(buildPluginComponentLog(taskCommunicatorDescriptors, taskCommunicators, "TaskCommunicators"));
boolean disableVersionCheck = conf.getBoolean(TezConfiguration.TEZ_AM_DISABLE_CLIENT_VERSION_CHECK, TezConfiguration.TEZ_AM_DISABLE_CLIENT_VERSION_CHECK_DEFAULT);
// Check client - AM version compatibility
LOG.info("Comparing client version with AM version" + ", clientVersion=" + clientVersion + ", AMVersion=" + dagVersionInfo.getVersion());
Simple2LevelVersionComparator versionComparator = new Simple2LevelVersionComparator();
if (versionComparator.compare(clientVersion, dagVersionInfo.getVersion()) != 0) {
versionMismatchDiagnostics = "Incompatible versions found" + ", clientVersion=" + clientVersion + ", AMVersion=" + dagVersionInfo.getVersion();
// The diagnostic is recorded even when the check is disabled; only the versionMismatch flag differs.
addDiagnostic(versionMismatchDiagnostics);
if (disableVersionCheck) {
LOG.warn("Ignoring client-AM version mismatch as check disabled. " + versionMismatchDiagnostics);
} else {
LOG.error(versionMismatchDiagnostics);
versionMismatch = true;
}
}
dispatcher = createDispatcher();
if (isLocal) {
// Local mode: turn node blacklisting off and reset the history logging service class to its default.
conf.setBoolean(TezConfiguration.TEZ_AM_NODE_BLACKLISTING_ENABLED, false);
conf.set(TezConfiguration.TEZ_HISTORY_LOGGING_SERVICE_CLASS, TezConfiguration.TEZ_HISTORY_LOGGING_SERVICE_CLASS_DEFAULT);
} else {
// Exit-on-dispatch-exception is only enabled outside local mode.
dispatcher.enableExitOnDispatchException();
}
String strAppId = this.appAttemptID.getApplicationId().toString();
this.tezSystemStagingDir = TezCommonUtils.getTezSystemStagingPath(conf, strAppId);
context = new RunningAppContext(conf);
this.aclManager = new ACLManager(appMasterUgi.getShortUserName(), this.amConf);
clientHandler = new DAGClientHandler(this);
addIfService(dispatcher, false);
// Recovery locations: a base recovery directory under the staging dir plus a per-attempt subdirectory.
recoveryDataDir = TezCommonUtils.getRecoveryPath(tezSystemStagingDir, conf);
recoveryFS = recoveryDataDir.getFileSystem(conf);
currentRecoveryDataDir = TezCommonUtils.getAttemptRecoveryPath(recoveryDataDir, appAttemptID.getAttemptId());
if (LOG.isDebugEnabled()) {
LOG.debug("Stage directory information for AppAttemptId :" + this.appAttemptID + " tezSystemStagingDir :" + tezSystemStagingDir + " recoveryDataDir :" + recoveryDataDir + " recoveryAttemptDir :" + currentRecoveryDataDir);
}
recoveryEnabled = conf.getBoolean(TezConfiguration.DAG_RECOVERY_ENABLED, TezConfiguration.DAG_RECOVERY_ENABLED_DEFAULT);
clientRpcServer = new DAGClientServer(clientHandler, appAttemptID, recoveryFS);
addIfService(clientRpcServer, true);
taskHeartbeatHandler = createTaskHeartbeatHandler(context, conf);
addIfService(taskHeartbeatHandler, true);
containerHeartbeatHandler = createContainerHeartbeatHandler(context, conf);
addIfService(containerHeartbeatHandler, true);
// The session token is mandatory; initialization fails without it.
sessionToken = TokenCache.getSessionToken(amCredentials);
if (sessionToken == null) {
throw new RuntimeException("Could not find session token in AM Credentials");
}
// Prepare the TaskAttemptListener server for authentication of Containers
// TaskAttemptListener gets the information via jobTokenSecretManager.
jobTokenSecretManager.addTokenForJob(appAttemptID.getApplicationId().toString(), sessionToken);
// service to handle requests to TaskUmbilicalProtocol
taskCommunicatorManager = createTaskCommunicatorManager(context, taskHeartbeatHandler, containerHeartbeatHandler, taskCommunicatorDescriptors);
addIfService(taskCommunicatorManager, true);
containerSignatureMatcher = createContainerSignatureMatcher();
containers = new AMContainerMap(containerHeartbeatHandler, taskCommunicatorManager, containerSignatureMatcher, context);
addIfService(containers, true);
dispatcher.register(AMContainerEventType.class, containers);
nodes = new AMNodeTracker(dispatcher.getEventHandler(), context);
addIfService(nodes, true);
dispatcher.register(AMNodeEventType.class, nodes);
this.dagEventDispatcher = new DagEventDispatcher();
this.vertexEventDispatcher = new VertexEventDispatcher();
// register the event dispatchers
dispatcher.register(DAGAppMasterEventType.class, new DAGAppMasterEventHandler());
dispatcher.register(DAGEventType.class, dagEventDispatcher);
dispatcher.register(VertexEventType.class, vertexEventDispatcher);
boolean useConcurrentDispatcher = conf.getBoolean(TezConfiguration.TEZ_AM_USE_CONCURRENT_DISPATCHER, TezConfiguration.TEZ_AM_USE_CONCURRENT_DISPATCHER_DEFAULT);
LOG.info("Using concurrent dispatcher: " + useConcurrentDispatcher);
if (!useConcurrentDispatcher) {
// Task and task-attempt events are registered directly on the central dispatcher.
dispatcher.register(TaskEventType.class, new TaskEventDispatcher());
dispatcher.register(TaskAttemptEventType.class, new TaskAttemptEventDispatcher());
} else {
// Task and task-attempt events share one concurrent dispatcher created with the configured concurrency.
int concurrency = conf.getInt(TezConfiguration.TEZ_AM_CONCURRENT_DISPATCHER_CONCURRENCY, TezConfiguration.TEZ_AM_CONCURRENT_DISPATCHER_CONCURRENCY_DEFAULT);
AsyncDispatcherConcurrent sharedDispatcher = dispatcher.registerAndCreateDispatcher(TaskEventType.class, new TaskEventDispatcher(), "TaskAndAttemptEventThread", concurrency);
dispatcher.registerWithExistingDispatcher(TaskAttemptEventType.class, new TaskAttemptEventDispatcher(), sharedDispatcher);
}
// register other delegating dispatchers
dispatcher.registerAndCreateDispatcher(SpeculatorEventType.class, new SpeculatorEventHandler(), "Speculator");
if (enableWebUIService()) {
this.webUIService = new WebUIService(context);
addIfService(webUIService, false);
} else {
if (LOG.isDebugEnabled()) {
LOG.debug("Web UI Service is not enabled.");
}
}
this.taskSchedulerManager = createTaskSchedulerManager(taskSchedulerDescriptors);
addIfService(taskSchedulerManager, true);
// webUIService is only created above when enabled, so the dependency is only declared in that case.
if (enableWebUIService()) {
addIfServiceDependency(taskSchedulerManager, webUIService);
}
dispatcher.register(AMSchedulerEventType.class, taskSchedulerManager);
addIfServiceDependency(taskSchedulerManager, clientRpcServer);
this.containerLauncherManager = createContainerLauncherManager(containerLauncherDescriptors, isLocal);
addIfService(containerLauncherManager, true);
dispatcher.register(ContainerLauncherEventType.class, containerLauncherManager);
historyEventHandler = createHistoryEventHandler(context);
addIfService(historyEventHandler, true);
this.sessionTimeoutInterval = TezCommonUtils.getDAGSessionTimeout(amConf);
this.clientAMHeartbeatTimeoutIntervalMillis = TezCommonUtils.getAMClientHeartBeatTimeoutMillis(amConf);
// Session local resources are only read when no version mismatch was flagged and the AM runs in session mode.
if (!versionMismatch) {
if (isSession) {
FileInputStream sessionResourcesStream = null;
try {
sessionResourcesStream = new FileInputStream(new File(workingDirectory, TezConstants.TEZ_AM_LOCAL_RESOURCES_PB_FILE_NAME));
PlanLocalResourcesProto amLocalResourceProto = PlanLocalResourcesProto.parseDelimitedFrom(sessionResourcesStream);
amResources.putAll(DagTypeConverters.convertFromPlanLocalResources(amLocalResourceProto));
} finally {
// Close the stream whether or not parsing succeeded.
if (sessionResourcesStream != null) {
sessionResourcesStream.close();
}
}
}
}
// Cached thread pool of daemon threads, wrapped as a listening executor.
rawExecutor = Executors.newCachedThreadPool(new ThreadFactoryBuilder().setDaemon(true).setNameFormat("App Shared Pool - " + "#%d").build());
execService = MoreExecutors.listeningDecorator(rawExecutor);
initServices(conf);
super.serviceInit(conf);
if (!versionMismatch) {
// AppLaunchedEvent is emitted for the first attempt only; AMLaunchedEvent for every attempt.
if (this.appAttemptID.getAttemptId() == 1) {
AppLaunchedEvent appLaunchedEvent = new AppLaunchedEvent(appAttemptID.getApplicationId(), startTime, appSubmitTime, appMasterUgi.getShortUserName(), this.amConf, dagVersionInfo);
historyEventHandler.handle(new DAGHistoryEvent(appLaunchedEvent));
}
AMLaunchedEvent launchedEvent = new AMLaunchedEvent(appAttemptID, startTime, appSubmitTime, appMasterUgi.getShortUserName());
historyEventHandler.handle(new DAGHistoryEvent(launchedEvent));
this.state = DAGAppMasterState.INITED;
} else {
this.state = DAGAppMasterState.ERROR;
}
}
use of org.apache.tez.dag.history.DAGHistoryEvent in project tez by apache.
the class DAGAppMaster method serviceStart.
/**
 * Starts the application master.
 *
 * <p>Starts the registered services, then:
 * <ul>
 *   <li>shuts down immediately if a version mismatch was flagged, or if this is a
 *       session AM on an attempt greater than 1 with recovery disabled;</li>
 *   <li>otherwise emits an {@code AMStartedEvent} and attempts to recover a DAG from a
 *       previous attempt, shutting down with state {@code ERROR} if recovery throws an
 *       {@link IOException};</li>
 *   <li>dispatches the matching recovery events when a DAG was recovered, or, in
 *       non-session mode with nothing recovered, starts the original DAG;</li>
 *   <li>in session mode, schedules the DAG submission timeout check and (outside local
 *       mode) the client heartbeat timeout check.</li>
 * </ul>
 *
 * @throws Exception if starting the services or any later step fails
 */
@Override
public synchronized void serviceStart() throws Exception {
  // start all the components
  startServices();
  super.serviceStart();
  boolean invalidSession = false;
  if (isSession && !recoveryEnabled && appAttemptID.getAttemptId() > 1) {
    String err = INVALID_SESSION_ERR_MSG;
    LOG.error(err);
    addDiagnostic(err);
    this.state = DAGAppMasterState.ERROR;
    invalidSession = true;
  }
  if (versionMismatch || invalidSession) {
    // Short-circuit and return as no DAG should be run
    this.taskSchedulerManager.setShouldUnregisterFlag();
    shutdownHandler.shutdown();
    return;
  }
  this.appsStartTime = clock.getTime();
  AMStartedEvent startEvent = new AMStartedEvent(appAttemptID, appsStartTime, appMasterUgi.getShortUserName());
  historyEventHandler.handle(new DAGHistoryEvent(startEvent));
  this.lastDAGCompletionTime = clock.getTime();
  DAGRecoveryData recoveredDAGData;
  try {
    recoveredDAGData = recoverDAG();
  } catch (IOException e) {
    LOG.error("Error occurred when trying to recover data from previous attempt." + " Shutting down AM", e);
    this.state = DAGAppMasterState.ERROR;
    this.taskSchedulerManager.setShouldUnregisterFlag();
    shutdownHandler.shutdown();
    return;
  }
  if (!isSession) {
    LOG.info("In Non-Session mode.");
  } else {
    LOG.info("In Session mode. Waiting for DAG over RPC");
    this.state = DAGAppMasterState.IDLE;
  }
  if (recoveredDAGData != null) {
    if (recoveredDAGData.cumulativeAdditionalResources != null) {
      // Re-register the additional resources of the recovered DAG with this attempt.
      recoveredDAGData.additionalUrlsForClasspath = processAdditionalResources(recoveredDAGData.recoveredDagID, recoveredDAGData.cumulativeAdditionalResources);
      amResources.putAll(recoveredDAGData.cumulativeAdditionalResources);
      cumulativeAdditionalResources.putAll(recoveredDAGData.cumulativeAdditionalResources);
    }
    if (recoveredDAGData.isSessionStopped) {
      // Returns before any recovery event is dispatched and before the timers below are scheduled.
      LOG.info("AM crashed when shutting down in the previous attempt" + ", continue the shutdown and recover it to SUCCEEDED");
      this.sessionStopped.set(true);
      return;
    }
    if (recoveredDAGData.isCompleted || recoveredDAGData.nonRecoverable) {
      LOG.info("Found previous DAG in completed or non-recoverable state" + ", dagId=" + recoveredDAGData.recoveredDagID + ", isCompleted=" + recoveredDAGData.isCompleted + ", isNonRecoverable=" + recoveredDAGData.nonRecoverable + ", state=" + (recoveredDAGData.dagState == null ? "null" : recoveredDAGData.dagState) + ", failureReason=" + recoveredDAGData.reason);
      _updateLoggers(recoveredDAGData.recoveredDAG, "");
      if (recoveredDAGData.nonRecoverable) {
        // A non-recoverable DAG is reported and recovered as FAILED, with the reason as a diagnostic.
        addDiagnostic("DAG " + recoveredDAGData.recoveredDagID + " can not be recovered due to " + recoveredDAGData.reason);
        DAGEventRecoverEvent recoverDAGEvent = new DAGEventRecoverEvent(recoveredDAGData.recoveredDAG.getID(), DAGState.FAILED, recoveredDAGData);
        DAGRecoveredEvent dagRecoveredEvent = new DAGRecoveredEvent(this.appAttemptID, recoveredDAGData.recoveredDAG.getID(), recoveredDAGData.recoveredDAG.getName(), recoveredDAGData.recoveredDAG.getUserName(), this.clock.getTime(), DAGState.FAILED, recoveredDAGData.reason, this.containerLogs);
        dagRecoveredEvent.setHistoryLoggingEnabled(recoveredDAGData.recoveredDAG.getConf().getBoolean(TezConfiguration.TEZ_DAG_HISTORY_LOGGING_ENABLED, TezConfiguration.TEZ_DAG_HISTORY_LOGGING_ENABLED_DEFAULT));
        this.historyEventHandler.handle(new DAGHistoryEvent(recoveredDAGData.recoveredDAG.getID(), dagRecoveredEvent));
        dagEventDispatcher.handle(recoverDAGEvent);
        this.state = DAGAppMasterState.RUNNING;
      } else {
        // A completed DAG is recovered with the state it had reached.
        DAGEventRecoverEvent recoverDAGEvent = new DAGEventRecoverEvent(recoveredDAGData.recoveredDAG.getID(), recoveredDAGData.dagState, recoveredDAGData);
        DAGRecoveredEvent dagRecoveredEvent = new DAGRecoveredEvent(this.appAttemptID, recoveredDAGData.recoveredDAG.getID(), recoveredDAGData.recoveredDAG.getName(), recoveredDAGData.recoveredDAG.getUserName(), this.clock.getTime(), recoveredDAGData.dagState, null, this.containerLogs);
        this.historyEventHandler.handle(new DAGHistoryEvent(recoveredDAGData.recoveredDAG.getID(), dagRecoveredEvent));
        dagEventDispatcher.handle(recoverDAGEvent);
        this.state = DAGAppMasterState.RUNNING;
      }
    } else {
      // The DAG is neither completed nor non-recoverable: recover it without a forced state.
      LOG.info("Found DAG to recover, dagId=" + recoveredDAGData.recoveredDAG.getID());
      _updateLoggers(recoveredDAGData.recoveredDAG, "");
      DAGRecoveredEvent dagRecoveredEvent = new DAGRecoveredEvent(this.appAttemptID, recoveredDAGData.recoveredDAG.getID(), recoveredDAGData.recoveredDAG.getName(), recoveredDAGData.recoveredDAG.getUserName(), this.clock.getTime(), this.containerLogs);
      this.historyEventHandler.handle(new DAGHistoryEvent(recoveredDAGData.recoveredDAG.getID(), dagRecoveredEvent));
      DAGEventRecoverEvent recoverDAGEvent = new DAGEventRecoverEvent(recoveredDAGData.recoveredDAG.getID(), recoveredDAGData);
      dagEventDispatcher.handle(recoverDAGEvent);
      this.state = DAGAppMasterState.RUNNING;
    }
  } else {
    if (!isSession) {
      // No dag recovered - in non-session, just restart the original DAG
      dagCounter.set(0);
      startDAG();
    }
  }
  if (isSession && sessionTimeoutInterval >= 0) {
    // Timer.scheduleAtFixedRate throws IllegalArgumentException for a non-positive period.
    // The guard above admits intervals below 10, for which interval / 10 is 0, so clamp
    // the period to at least 1.
    long checkPeriod = Math.max(1L, sessionTimeoutInterval / 10);
    this.dagSubmissionTimer = new Timer("DAGSubmissionTimer", true);
    this.dagSubmissionTimer.scheduleAtFixedRate(new TimerTask() {
      @Override
      public void run() {
        try {
          checkAndHandleSessionTimeout();
        } catch (TezException e) {
          LOG.error("Error when checking AM session timeout", e);
        }
      }
    }, sessionTimeoutInterval, checkPeriod);
  }
  // Ignore client heartbeat timeout in local mode or non-session mode
  if (!isLocal && isSession && clientAMHeartbeatTimeoutIntervalMillis > 0) {
    // reset heartbeat time
    clientHandler.updateLastHeartbeatTime();
    this.clientAMHeartBeatTimeoutService = Executors.newSingleThreadScheduledExecutor(new ThreadFactoryBuilder().setDaemon(true).setNameFormat("ClientAMHeartBeatKeepAliveCheck #%d").build());
    this.clientAMHeartBeatTimeoutService.schedule(new Runnable() {
      @Override
      public void run() {
        try {
          // The task re-schedules itself for as long as the check returns a positive next expiry.
          long nextExpiry = checkAndHandleDAGClientTimeout();
          if (nextExpiry > 0) {
            clientAMHeartBeatTimeoutService.schedule(this, nextExpiry, TimeUnit.MILLISECONDS);
          }
        } catch (TezException e) {
          // Cannot be thrown unless the AM is being tried to shutdown so no need to
          // reschedule the timer task
          LOG.error("Error when checking Client AM heartbeat timeout", e);
        }
      }
    }, clientAMHeartbeatTimeoutIntervalMillis, TimeUnit.MILLISECONDS);
  }
}
use of org.apache.tez.dag.history.DAGHistoryEvent in project tez by apache.
the class ATSHistoryLoggingService method getEventBatch.
/**
 * Refills {@code events} with the next batch taken from the event queue.
 *
 * <p>The list is cleared first. Events are then polled one at a time, each poll
 * waiting at most {@code maxPollingTimeMillis}. Collection stops when the queue yields
 * nothing within that time, when {@code maxEventsPerBatch} valid events have been
 * gathered, or right after a {@code DAG_SUBMITTED} event has been added. Events
 * rejected by {@code isValidEvent} are dropped and do not count towards the batch size.
 *
 * @param events the list to clear and fill with the batch
 * @throws InterruptedException if interrupted while waiting on the queue
 */
private void getEventBatch(List<DAGHistoryEvent> events) throws InterruptedException {
  events.clear();
  int accepted = 0;
  DAGHistoryEvent polled;
  while (accepted < maxEventsPerBatch
      && (polled = eventQueue.poll(maxPollingTimeMillis, TimeUnit.MILLISECONDS)) != null) {
    if (isValidEvent(polled)) {
      accepted++;
      events.add(polled);
      if (polled.getHistoryEvent().getEventType().equals(HistoryEventType.DAG_SUBMITTED)) {
        // Special case this as it might be a large payload
        return;
      }
    }
  }
}
use of org.apache.tez.dag.history.DAGHistoryEvent in project tez by apache.
the class ATSHistoryLoggingService method getDomainForEvent.
/**
 * Resolves the domain id to use for the given history event.
 *
 * <p>The session domain id is returned when there is no ACL policy manager, when the
 * event carries no DAG id, when its type is not DAG specific, or when no domain is
 * cached for the DAG and the event is neither {@code DAG_SUBMITTED} nor
 * {@code DAG_RECOVERED}. A domain already cached for the DAG is returned as is (and
 * evicted on {@code DAG_FINISHED}). For {@code DAG_SUBMITTED} / {@code DAG_RECOVERED}
 * without a cached domain, a DAG domain is created and cached.
 *
 * @param event the history event being logged
 * @return the domain id for the event, or {@code null} if the DAG is in
 *         {@code skippedDAGs} after the domain creation attempt
 */
private String getDomainForEvent(DAGHistoryEvent event) {
  final String sessionDomain = sessionDomainId;
  if (historyACLPolicyManager == null) {
    return sessionDomain;
  }

  final TezDAGID dagId = event.getDagID();
  final HistoryEvent historyEvent = event.getHistoryEvent();
  if (dagId == null || !HistoryEventType.isDAGSpecificEvent(historyEvent.getEventType())) {
    return sessionDomain;
  }

  final HistoryEventType eventType = historyEvent.getEventType();

  // If we already have the domain for the dag id return it
  if (dagDomainIdMap.containsKey(dagId)) {
    final String cachedDomain = dagDomainIdMap.get(dagId);
    // Cleanup if this is the last event.
    if (eventType == HistoryEventType.DAG_FINISHED) {
      dagDomainIdMap.remove(dagId);
    }
    return cachedDomain;
  }

  final boolean isSubmission = HistoryEventType.DAG_SUBMITTED == eventType;
  if (!isSubmission && HistoryEventType.DAG_RECOVERED != eventType) {
    return sessionDomain;
  }

  // In case this is the first event for the dag, create and populate dag domain.
  final Configuration dagConf;
  final DAGPlan plan;
  if (isSubmission) {
    dagConf = ((DAGSubmittedEvent) historyEvent).getConf();
    plan = ((DAGSubmittedEvent) historyEvent).getDAGPlan();
  } else {
    dagConf = appContext.getCurrentDAG().getConf();
    plan = appContext.getCurrentDAG().getJobPlan();
  }
  final String createdDomain = createDagDomain(dagConf, plan, dagId);
  // createDagDomain updates skippedDAGs so another check here.
  if (skippedDAGs.contains(dagId)) {
    return null;
  }
  dagDomainIdMap.put(dagId, createdDomain);
  return createdDomain;
}
use of org.apache.tez.dag.history.DAGHistoryEvent in project tez by apache.
the class TestATSHistoryLoggingService method testNonSessionDomainsFailed.
/**
 * Session ACL setup is stubbed to throw an {@link IOException}. After the service is
 * started and a set of history events for one DAG has been handled, the test expects
 * no DAG ACL setup, no timeline entity domain update with "session-id", and an entity
 * counter of zero.
 */
@Test(timeout = 10000)
public void testNonSessionDomainsFailed() throws Exception {
  // Make the session-level ACL setup fail.
  when(historyACLPolicyManager.setupSessionACLs((Configuration) any(), (ApplicationId) any()))
      .thenThrow(new IOException());

  atsHistoryLoggingService.start();
  verify(historyACLPolicyManager, times(1))
      .setupSessionACLs((Configuration) any(), (ApplicationId) any());

  // Feed the events for a single DAG and wait until the queue has been drained.
  final TezDAGID dagId = TezDAGID.getInstance(appId, 0);
  for (DAGHistoryEvent historyEvent : makeHistoryEvents(dagId, atsHistoryLoggingService)) {
    atsHistoryLoggingService.handle(historyEvent);
  }
  while (!atsHistoryLoggingService.eventQueue.isEmpty()) {
    Thread.sleep(1000);
  }

  // DAG ACL setup must never have been requested.
  verify(historyACLPolicyManager, times(0))
      .setupSessionDAGACLs((Configuration) any(), eq(appId), eq("0"), (DAGAccessControls) any());
  // No entity may have been assigned the session domain id.
  verify(historyACLPolicyManager, times(0))
      .updateTimelineEntityDomain(any(), eq("session-id"));
  // Nothing was counted as published.
  Assert.assertEquals(0, atsEntitiesCounter);
}
Aggregations