use of org.apache.tez.dag.history.events.DAGSubmittedEvent in project tez by apache.
the class RecoveryService method handle.
public void handle(DAGHistoryEvent event) throws IOException {
if (stopped.get()) {
LOG.warn("Igoring event as service stopped, eventType" + event.getHistoryEvent().getEventType());
return;
}
HistoryEventType eventType = event.getHistoryEvent().getEventType();
if (recoveryFatalErrorOccurred.get()) {
return;
}
if (!started.get()) {
LOG.warn("Adding event of type " + eventType + " to queue as service not started");
addToEventQueue(event);
return;
}
TezDAGID dagId = event.getDagID();
if (eventType.equals(HistoryEventType.DAG_SUBMITTED)) {
DAGSubmittedEvent dagSubmittedEvent = (DAGSubmittedEvent) event.getHistoryEvent();
String dagName = dagSubmittedEvent.getDAGName();
if (dagName != null && dagName.startsWith(TezConstants.TEZ_PREWARM_DAG_NAME_PREFIX)) {
// Skip recording pre-warm DAG events
skippedDAGs.add(dagId);
return;
}
}
if (dagId == null || skippedDAGs.contains(dagId)) {
if (LOG.isDebugEnabled()) {
LOG.debug("Skipping event for DAG" + ", eventType=" + eventType + ", dagId=" + (dagId == null ? "null" : dagId.toString()) + ", isSkippedDAG=" + (dagId == null ? "null" : skippedDAGs.contains(dagId)));
}
return;
}
if (event.getHistoryEvent() instanceof SummaryEvent) {
synchronized (lock) {
if (stopped.get()) {
LOG.warn("Igoring event as service stopped, eventType" + event.getHistoryEvent().getEventType());
return;
}
try {
SummaryEvent summaryEvent = (SummaryEvent) event.getHistoryEvent();
handleSummaryEvent(dagId, eventType, summaryEvent);
if (summaryEvent.writeToRecoveryImmediately()) {
handleRecoveryEvent(event);
// outputStream may already be closed and removed
if (outputStreamMap.containsKey(event.getDagID())) {
doFlush(outputStreamMap.get(event.getDagID()), appContext.getClock().getTime());
}
} else {
if (LOG.isDebugEnabled()) {
LOG.debug("Queueing Non-immediate Summary/Recovery event of type" + eventType.name());
}
addToEventQueue(event);
}
if (eventType.equals(HistoryEventType.DAG_FINISHED)) {
LOG.info("DAG completed" + ", dagId=" + event.getDagID() + ", queueSize=" + eventQueue.size());
completedDAGs.add(dagId);
if (outputStreamMap.containsKey(dagId)) {
try {
outputStreamMap.get(dagId).close();
outputStreamMap.remove(dagId);
} catch (IOException ioe) {
LOG.warn("Error when trying to flush/close recovery file for" + " dag, dagId=" + event.getDagID());
}
}
}
} catch (IOException ioe) {
LOG.error("Error handling summary event" + ", eventType=" + event.getHistoryEvent().getEventType(), ioe);
createFatalErrorFlagDir();
if (eventType.equals(HistoryEventType.DAG_SUBMITTED)) {
// Throw error to tell client that dag submission failed
throw ioe;
}
}
}
} else {
// All other events just get queued
if (LOG.isDebugEnabled()) {
LOG.debug("Queueing Non-Summary Recovery event of type " + eventType.name());
}
addToEventQueue(event);
}
}
use of org.apache.tez.dag.history.events.DAGSubmittedEvent in project tez by apache.
the class TestATSHistoryWithACLs method testDagLoggingEnabled.
/**
* use mini cluster to verify data do push to ats when
* the dag logging flag in dagsubmitted event is set on
* @throws Exception
*/
@Test(timeout = 50000)
public void testDagLoggingEnabled() throws Exception {
ATSHistoryLoggingService historyLoggingService;
historyLoggingService = ReflectionUtils.createClazzInstance(ATSHistoryLoggingService.class.getName());
AppContext appContext = mock(AppContext.class);
when(appContext.getApplicationID()).thenReturn(ApplicationId.newInstance(0, 1));
historyLoggingService.setAppContext(appContext);
TezConfiguration tezConf = new TezConfiguration(mrrTezCluster.getConfig());
String viewAcls = "nobody nobody_group";
tezConf.set(TezConfiguration.TEZ_AM_VIEW_ACLS, viewAcls);
tezConf.set(TezConfiguration.TEZ_HISTORY_LOGGING_SERVICE_CLASS, ATSHistoryLoggingService.class.getName());
Path remoteStagingDir = remoteFs.makeQualified(new Path("/tmp", String.valueOf(random.nextInt(100000))));
remoteFs.mkdirs(remoteStagingDir);
tezConf.set(TezConfiguration.TEZ_AM_STAGING_DIR, remoteStagingDir.toString());
historyLoggingService.init(tezConf);
historyLoggingService.start();
ApplicationId appId = ApplicationId.newInstance(100l, 1);
TezDAGID tezDAGID = TezDAGID.getInstance(appId, 11);
ApplicationAttemptId appAttemptId = ApplicationAttemptId.newInstance(appId, 1);
DAGPlan dagPlan = DAGPlan.newBuilder().setName("DAGPlanMock").build();
DAGSubmittedEvent submittedEvent = new DAGSubmittedEvent(tezDAGID, 1, dagPlan, appAttemptId, null, "usr", tezConf, null, null);
submittedEvent.setHistoryLoggingEnabled(true);
DAGHistoryEvent event = new DAGHistoryEvent(tezDAGID, submittedEvent);
historyLoggingService.handle(new DAGHistoryEvent(tezDAGID, submittedEvent));
Thread.sleep(1000l);
String url = "http://" + timelineAddress + "/ws/v1/timeline/TEZ_DAG_ID/" + event.getDagID();
Client client = new Client();
WebResource resource = client.resource(url);
ClientResponse response = resource.accept(MediaType.APPLICATION_JSON).get(ClientResponse.class);
assertEquals(200, response.getStatus());
assertEquals(MediaType.APPLICATION_JSON_TYPE, response.getType());
TimelineEntity entity = response.getEntity(TimelineEntity.class);
assertEquals(entity.getEntityType(), "TEZ_DAG_ID");
assertEquals(entity.getEvents().get(0).getEventType(), HistoryEventType.DAG_SUBMITTED.toString());
}
use of org.apache.tez.dag.history.events.DAGSubmittedEvent in project tez by apache.
the class RecoveryParser method parseRecoveryData.
/**
* 1. Read Summary Recovery file and build DAGSummaryData
* Check whether it is recoverable based on the summary file (whether dag is
* in the middle of committing)
* 2. Read the non-Summary Recovery file and build DAGRecoveryData
* Check whether it is recoverable based on both the summary file and non-summary file
* (whether vertex has completed its committing, but its full non-summary recovery events are not seen)
* @return DAGRecoveryData
* @throws IOException
*/
public DAGRecoveryData parseRecoveryData() throws IOException {
int dagCounter = 0;
Map<TezDAGID, DAGSummaryData> dagSummaryDataMap = new HashMap<TezDAGID, DAGSummaryData>();
List<Path> summaryFiles = getSummaryFiles();
LOG.debug("SummaryFile size:" + summaryFiles.size());
for (Path summaryFile : summaryFiles) {
FileStatus summaryFileStatus = recoveryFS.getFileStatus(summaryFile);
LOG.info("Parsing summary file" + ", path=" + summaryFile.toString() + ", len=" + summaryFileStatus.getLen() + ", lastModTime=" + summaryFileStatus.getModificationTime());
FSDataInputStream summaryStream = getSummaryStream(summaryFile);
while (true) {
RecoveryProtos.SummaryEventProto proto;
try {
proto = RecoveryProtos.SummaryEventProto.parseDelimitedFrom(summaryStream);
if (proto == null) {
LOG.info("Reached end of summary stream");
break;
}
} catch (EOFException eof) {
LOG.info("Reached end of summary stream");
break;
}
HistoryEventType eventType = HistoryEventType.values()[proto.getEventType()];
if (LOG.isDebugEnabled()) {
LOG.debug("[RECOVERY SUMMARY]" + " dagId=" + proto.getDagId() + ", timestamp=" + proto.getTimestamp() + ", event=" + eventType);
}
TezDAGID dagId;
try {
dagId = TezDAGID.fromString(proto.getDagId());
} catch (IllegalArgumentException e) {
throw new IOException("Invalid dagId, summary records may be corrupted", e);
}
if (dagCounter < dagId.getId()) {
dagCounter = dagId.getId();
}
if (!dagSummaryDataMap.containsKey(dagId)) {
dagSummaryDataMap.put(dagId, new DAGSummaryData(dagId));
}
try {
dagSummaryDataMap.get(dagId).handleSummaryEvent(proto);
} catch (Exception e) {
// any exception when parsing protobuf
throw new IOException("Error when parsing summary event proto", e);
}
}
summaryStream.close();
}
// Set counter for next set of DAGs & update dagNames Set in DAGAppMaster
dagAppMaster.setDAGCounter(dagCounter);
for (DAGSummaryData dagSummaryData : dagSummaryDataMap.values()) {
dagAppMaster.dagIDs.add(dagSummaryData.dagId.toString());
}
DAGSummaryData lastInProgressDAGData = getLastCompletedOrInProgressDAG(dagSummaryDataMap);
if (lastInProgressDAGData == null) {
LOG.info("Nothing to recover as no uncompleted/completed DAGs found");
return null;
}
TezDAGID lastInProgressDAG = lastInProgressDAGData.dagId;
if (lastInProgressDAG == null) {
LOG.info("Nothing to recover as no uncompleted/completed DAGs found");
return null;
}
LOG.info("Checking if DAG is in recoverable state" + ", dagId=" + lastInProgressDAGData.dagId);
final DAGRecoveryData recoveredDAGData = new DAGRecoveryData(lastInProgressDAGData);
List<Path> dagRecoveryFiles = getDAGRecoveryFiles(lastInProgressDAG);
boolean skipAllOtherEvents = false;
Path lastRecoveryFile = null;
// to create the DAGImpl)
for (Path dagRecoveryFile : dagRecoveryFiles) {
if (skipAllOtherEvents) {
LOG.warn("Other recovery files will be skipped due to error in the previous recovery file" + lastRecoveryFile);
break;
}
FileStatus fileStatus = recoveryFS.getFileStatus(dagRecoveryFile);
lastRecoveryFile = dagRecoveryFile;
LOG.info("Trying to recover dag from recovery file" + ", dagId=" + lastInProgressDAG.toString() + ", dagRecoveryFile=" + dagRecoveryFile + ", len=" + fileStatus.getLen());
FSDataInputStream dagRecoveryStream = recoveryFS.open(dagRecoveryFile, recoveryBufferSize);
while (true) {
HistoryEvent event;
try {
event = getNextEvent(dagRecoveryStream);
if (event == null) {
LOG.info("Reached end of dag recovery stream");
break;
}
} catch (EOFException eof) {
LOG.info("Reached end of dag recovery stream");
break;
} catch (IOException ioe) {
LOG.warn("Corrupt data found when trying to read next event", ioe);
break;
}
if (skipAllOtherEvents) {
// hit an error - skip reading other events
break;
}
HistoryEventType eventType = event.getEventType();
LOG.info("Recovering from event" + ", eventType=" + eventType + ", event=" + event.toString());
switch(eventType) {
case DAG_SUBMITTED:
DAGSubmittedEvent submittedEvent = (DAGSubmittedEvent) event;
recoveredDAGData.recoveredDAG = dagAppMaster.createDAG(submittedEvent.getDAGPlan(), lastInProgressDAG);
recoveredDAGData.cumulativeAdditionalResources = submittedEvent.getCumulativeAdditionalLocalResources();
recoveredDAGData.recoveredDagID = recoveredDAGData.recoveredDAG.getID();
dagAppMaster.setCurrentDAG(recoveredDAGData.recoveredDAG);
if (recoveredDAGData.nonRecoverable) {
skipAllOtherEvents = true;
}
break;
case DAG_INITIALIZED:
recoveredDAGData.dagInitedEvent = (DAGInitializedEvent) event;
break;
case DAG_STARTED:
recoveredDAGData.dagStartedEvent = (DAGStartedEvent) event;
break;
case DAG_FINISHED:
recoveredDAGData.dagFinishedEvent = (DAGFinishedEvent) event;
skipAllOtherEvents = true;
break;
case DAG_COMMIT_STARTED:
case VERTEX_GROUP_COMMIT_STARTED:
case VERTEX_GROUP_COMMIT_FINISHED:
case CONTAINER_LAUNCHED:
{
// Nothing to do for now
break;
}
case DAG_KILL_REQUEST:
{
break;
}
case VERTEX_INITIALIZED:
{
VertexInitializedEvent vertexInitEvent = (VertexInitializedEvent) event;
VertexRecoveryData vertexRecoveryData = recoveredDAGData.maybeCreateVertexRecoveryData(vertexInitEvent.getVertexID());
vertexRecoveryData.vertexInitedEvent = vertexInitEvent;
break;
}
case VERTEX_CONFIGURE_DONE:
{
VertexConfigurationDoneEvent reconfigureDoneEvent = (VertexConfigurationDoneEvent) event;
VertexRecoveryData vertexRecoveryData = recoveredDAGData.maybeCreateVertexRecoveryData(reconfigureDoneEvent.getVertexID());
vertexRecoveryData.vertexConfigurationDoneEvent = reconfigureDoneEvent;
break;
}
case VERTEX_STARTED:
{
VertexStartedEvent vertexStartedEvent = (VertexStartedEvent) event;
VertexRecoveryData vertexRecoveryData = recoveredDAGData.vertexRecoveryDataMap.get(vertexStartedEvent.getVertexID());
Preconditions.checkArgument(vertexRecoveryData != null, "No VertexInitializedEvent before VertexStartedEvent");
vertexRecoveryData.vertexStartedEvent = vertexStartedEvent;
break;
}
case VERTEX_COMMIT_STARTED:
{
break;
}
case VERTEX_FINISHED:
{
VertexFinishedEvent vertexFinishedEvent = (VertexFinishedEvent) event;
VertexRecoveryData vertexRecoveryData = recoveredDAGData.maybeCreateVertexRecoveryData(vertexFinishedEvent.getVertexID());
vertexRecoveryData.vertexFinishedEvent = vertexFinishedEvent;
break;
}
case TASK_STARTED:
{
TaskStartedEvent taskStartedEvent = (TaskStartedEvent) event;
VertexRecoveryData vertexRecoveryData = recoveredDAGData.vertexRecoveryDataMap.get(taskStartedEvent.getTaskID().getVertexID());
Preconditions.checkArgument(vertexRecoveryData != null, "Invalid TaskStartedEvent, its vertex does not exist:" + taskStartedEvent.getTaskID().getVertexID());
TaskRecoveryData taskRecoveryData = vertexRecoveryData.maybeCreateTaskRecoveryData(taskStartedEvent.getTaskID());
taskRecoveryData.taskStartedEvent = taskStartedEvent;
break;
}
case TASK_FINISHED:
{
TaskFinishedEvent taskFinishedEvent = (TaskFinishedEvent) event;
VertexRecoveryData vertexRecoveryData = recoveredDAGData.vertexRecoveryDataMap.get(taskFinishedEvent.getTaskID().getVertexID());
Preconditions.checkArgument(vertexRecoveryData != null, "Invalid TaskFinishedEvent, its vertex does not exist:" + taskFinishedEvent.getTaskID().getVertexID());
TaskRecoveryData taskRecoveryData = vertexRecoveryData.maybeCreateTaskRecoveryData(taskFinishedEvent.getTaskID());
taskRecoveryData.taskFinishedEvent = taskFinishedEvent;
break;
}
case TASK_ATTEMPT_STARTED:
{
TaskAttemptStartedEvent taStartedEvent = (TaskAttemptStartedEvent) event;
VertexRecoveryData vertexRecoveryData = recoveredDAGData.vertexRecoveryDataMap.get(taStartedEvent.getTaskAttemptID().getTaskID().getVertexID());
Preconditions.checkArgument(vertexRecoveryData != null, "Invalid TaskAttemptStartedEvent, its vertexId does not exist, taId=" + taStartedEvent.getTaskAttemptID());
TaskRecoveryData taskRecoveryData = vertexRecoveryData.taskRecoveryDataMap.get(taStartedEvent.getTaskAttemptID().getTaskID());
Preconditions.checkArgument(taskRecoveryData != null, "Invalid TaskAttemptStartedEvent, its taskId does not exist, taId=" + taStartedEvent.getTaskAttemptID());
TaskAttemptRecoveryData taRecoveryData = taskRecoveryData.maybeCreateTaskAttemptRecoveryData(taStartedEvent.getTaskAttemptID());
taRecoveryData.taStartedEvent = taStartedEvent;
break;
}
case TASK_ATTEMPT_FINISHED:
{
TaskAttemptFinishedEvent taFinishedEvent = (TaskAttemptFinishedEvent) event;
VertexRecoveryData vertexRecoveryData = recoveredDAGData.vertexRecoveryDataMap.get(taFinishedEvent.getTaskAttemptID().getTaskID().getVertexID());
Preconditions.checkArgument(vertexRecoveryData != null, "Invalid TaskAttemtFinishedEvent, its vertexId does not exist, taId=" + taFinishedEvent.getTaskAttemptID());
TaskRecoveryData taskRecoveryData = vertexRecoveryData.taskRecoveryDataMap.get(taFinishedEvent.getTaskAttemptID().getTaskID());
Preconditions.checkArgument(taskRecoveryData != null, "Invalid TaskAttemptFinishedEvent, its taskId does not exist, taId=" + taFinishedEvent.getTaskAttemptID());
TaskAttemptRecoveryData taRecoveryData = taskRecoveryData.maybeCreateTaskAttemptRecoveryData(taFinishedEvent.getTaskAttemptID());
taRecoveryData.taFinishedEvent = taFinishedEvent;
break;
}
default:
throw new RuntimeException("Invalid data found, unknown event type " + eventType);
}
if (LOG.isDebugEnabled()) {
LOG.debug("[DAG RECOVERY]" + " dagId=" + lastInProgressDAG + ", eventType=" + eventType + ", event=" + event.toString());
}
}
dagRecoveryStream.close();
}
recoveredDAGData.checkRecoverableNonSummary();
return recoveredDAGData;
}
use of org.apache.tez.dag.history.events.DAGSubmittedEvent in project tez by apache.
the class RecoveryParser method getNextEvent.
private static HistoryEvent getNextEvent(FSDataInputStream inputStream) throws IOException {
int eventTypeOrdinal = -1;
try {
eventTypeOrdinal = inputStream.readInt();
} catch (EOFException eof) {
return null;
}
if (eventTypeOrdinal < 0 || eventTypeOrdinal >= HistoryEventType.values().length) {
// reached end
throw new IOException("Corrupt data found when trying to read next event type" + ", eventTypeOrdinal=" + eventTypeOrdinal);
}
HistoryEventType eventType = HistoryEventType.values()[eventTypeOrdinal];
HistoryEvent event;
switch(eventType) {
case AM_LAUNCHED:
event = new AMLaunchedEvent();
break;
case AM_STARTED:
event = new AMStartedEvent();
break;
case DAG_SUBMITTED:
event = new DAGSubmittedEvent();
break;
case DAG_INITIALIZED:
event = new DAGInitializedEvent();
break;
case DAG_STARTED:
event = new DAGStartedEvent();
break;
case DAG_COMMIT_STARTED:
event = new DAGCommitStartedEvent();
break;
case DAG_FINISHED:
event = new DAGFinishedEvent();
break;
case DAG_KILL_REQUEST:
event = new DAGKillRequestEvent();
break;
case CONTAINER_LAUNCHED:
event = new ContainerLaunchedEvent();
break;
case CONTAINER_STOPPED:
event = new ContainerStoppedEvent();
break;
case VERTEX_INITIALIZED:
event = new VertexInitializedEvent();
break;
case VERTEX_CONFIGURE_DONE:
event = new VertexConfigurationDoneEvent();
break;
case VERTEX_STARTED:
event = new VertexStartedEvent();
break;
case VERTEX_COMMIT_STARTED:
event = new VertexCommitStartedEvent();
break;
case VERTEX_GROUP_COMMIT_STARTED:
event = new VertexGroupCommitStartedEvent();
break;
case VERTEX_GROUP_COMMIT_FINISHED:
event = new VertexGroupCommitFinishedEvent();
break;
case VERTEX_FINISHED:
event = new VertexFinishedEvent();
break;
case TASK_STARTED:
event = new TaskStartedEvent();
break;
case TASK_FINISHED:
event = new TaskFinishedEvent();
break;
case TASK_ATTEMPT_STARTED:
event = new TaskAttemptStartedEvent();
break;
case TASK_ATTEMPT_FINISHED:
event = new TaskAttemptFinishedEvent();
break;
default:
throw new IOException("Invalid data found, unknown event type " + eventType);
}
try {
event.fromProtoStream(inputStream);
} catch (EOFException eof) {
return null;
}
if (LOG.isDebugEnabled()) {
LOG.debug("Parsed event from input stream" + ", eventType=" + eventType + ", event=" + event.toString());
}
return event;
}
use of org.apache.tez.dag.history.events.DAGSubmittedEvent in project tez by apache.
the class DAGAppMaster method startDAG.
private void startDAG(DAGPlan dagPlan, Map<String, LocalResource> additionalAMResources) throws TezException {
long submitTime = this.clock.getTime();
this.appName = dagPlan.getName();
// /////////////////// Create the job itself.
final DAG newDAG = createDAG(dagPlan);
_updateLoggers(newDAG, "");
if (LOG.isDebugEnabled()) {
LOG.debug("Running a DAG with " + dagPlan.getVertexCount() + " vertices ");
for (VertexPlan v : dagPlan.getVertexList()) {
LOG.debug("DAG has vertex " + v.getName());
}
}
Map<String, LocalResource> lrDiff = getAdditionalLocalResourceDiff(newDAG, additionalAMResources);
if (lrDiff != null) {
amResources.putAll(lrDiff);
cumulativeAdditionalResources.putAll(lrDiff);
}
String callerContextStr = "";
if (dagPlan.hasCallerContext()) {
CallerContext callerContext = DagTypeConverters.convertCallerContextFromProto(dagPlan.getCallerContext());
callerContextStr = ", callerContext=" + callerContext.contextAsSimpleString();
}
LOG.info("Running DAG: " + dagPlan.getName() + callerContextStr);
String timeStamp = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(Calendar.getInstance().getTime());
System.err.println(timeStamp + " Running Dag: " + newDAG.getID());
System.out.println(timeStamp + " Running Dag: " + newDAG.getID());
// Job name is the same as the app name until we support multiple dags
// for an app later
final DAGSubmittedEvent submittedEvent = new DAGSubmittedEvent(newDAG.getID(), submitTime, dagPlan, this.appAttemptID, cumulativeAdditionalResources, newDAG.getUserName(), newDAG.getConf(), containerLogs, getContext().getQueueName());
boolean dagLoggingEnabled = newDAG.getConf().getBoolean(TezConfiguration.TEZ_DAG_HISTORY_LOGGING_ENABLED, TezConfiguration.TEZ_DAG_HISTORY_LOGGING_ENABLED_DEFAULT);
submittedEvent.setHistoryLoggingEnabled(dagLoggingEnabled);
try {
appMasterUgi.doAs(new PrivilegedExceptionAction<Void>() {
@Override
public Void run() throws Exception {
historyEventHandler.handleCriticalEvent(new DAGHistoryEvent(newDAG.getID(), submittedEvent));
return null;
}
});
} catch (IOException e) {
throw new TezUncheckedException(e);
} catch (InterruptedException e) {
throw new TezUncheckedException(e);
}
startDAGExecution(newDAG, lrDiff);
// set state after curDag is set
this.state = DAGAppMasterState.RUNNING;
}
Aggregations