use of org.apache.tez.dag.history.events.VertexStartedEvent in project tez by apache.
the class TestDAGRecovery method testTARecoverFromSucceeded_OutputCommitterRecoveryNotSupported.
/**
* RecoveryEvents: TaskAttemptStartedEvent -> TaskAttemptFinishedEvent (SUCCEEDED)
* Recovered as SUCCEEDED, but the task schedules a new task attempt because
* V2's committer does not support recovery.
*/
@Test(timeout = 5000)
public void testTARecoverFromSucceeded_OutputCommitterRecoveryNotSupported() throws Exception {
initMockDAGRecoveryDataForTaskAttempt();
// set up v2 recovery data
// ta1t1v2: TaskAttemptStartedEvent -> TaskAttemptFinishedEvent(SUCCEEDED)
// t1v2: TaskStartedEvent
// v2: VertexInitializedEvent -> VertexConfigurationDoneEvent -> VertexStartedEvent
TaskAttemptStartedEvent taStartedEvent = new TaskAttemptStartedEvent(ta1t1v2Id, "vertex2", ta1LaunchTime, mock(ContainerId.class), mock(NodeId.class), "", "", "");
List<TezEvent> taGeneratedEvents = new ArrayList<TezEvent>();
EventMetaData metadata = new EventMetaData(EventProducerConsumerType.OUTPUT, "vertex2", "vertex3", ta1t1v2Id);
taGeneratedEvents.add(new TezEvent(DataMovementEvent.create(ByteBuffer.wrap(new byte[0])), metadata));
TaskAttemptFinishedEvent taFinishedEvent = new TaskAttemptFinishedEvent(ta1t1v2Id, "vertex2", ta1LaunchTime, ta1FinishedTime, TaskAttemptState.SUCCEEDED, null, null, "", null, null, taGeneratedEvents, 0L, null, 0L, null, null, null, null, null);
TaskAttemptRecoveryData taRecoveryData = new TaskAttemptRecoveryData(taStartedEvent, taFinishedEvent);
doReturn(taRecoveryData).when(dagRecoveryData).getTaskAttemptRecoveryData(ta1t1v2Id);
Map<TezTaskAttemptID, TaskAttemptRecoveryData> taRecoveryDataMap = new HashMap<TezTaskAttemptID, TaskAttemptRecoveryData>();
taRecoveryDataMap.put(ta1t1v2Id, taRecoveryData);
TaskStartedEvent t1StartedEvent = new TaskStartedEvent(t1v2Id, "vertex2", 0L, t1StartedTime);
TaskRecoveryData taskRecoveryData = new TaskRecoveryData(t1StartedEvent, null, taRecoveryDataMap);
Map<TezTaskID, TaskRecoveryData> taskRecoveryDataMap = new HashMap<TezTaskID, TaskRecoveryData>();
taskRecoveryDataMap.put(t1v2Id, taskRecoveryData);
doReturn(taskRecoveryData).when(dagRecoveryData).getTaskRecoveryData(t1v2Id);
VertexInitializedEvent v2InitedEvent = new VertexInitializedEvent(v2Id, "vertex2", 0L, v1InitedTime, v1NumTask, "", null, null, null);
VertexConfigurationDoneEvent v2ReconfigureDoneEvent = new VertexConfigurationDoneEvent(v2Id, 0L, v1NumTask, null, null, null, false);
VertexStartedEvent v2StartedEvent = new VertexStartedEvent(v2Id, 0L, v1StartedTime);
VertexRecoveryData v2RecoveryData = new VertexRecoveryData(v2InitedEvent, v2ReconfigureDoneEvent, v2StartedEvent, null, taskRecoveryDataMap, false);
doReturn(v2RecoveryData).when(dagRecoveryData).getVertexRecoveryData(v2Id);
dag.handle(new DAGEventRecoverEvent(dagId, dagRecoveryData));
dispatcher.await();
TaskImpl task = (TaskImpl) dag.getVertex(v2Id).getTask(t1v2Id);
TaskAttemptImpl taskAttempt = (TaskAttemptImpl) task.getAttempt(ta1t1v2Id);
assertEquals(TaskAttemptStateInternal.KILLED, taskAttempt.getInternalState());
historyEventHandler.verifyHistoryEvent(1, HistoryEventType.TASK_ATTEMPT_FINISHED);
assertEquals(TaskStateInternal.RUNNING, task.getInternalState());
// new task attempt is scheduled
assertEquals(2, task.getAttempts().size());
assertEquals(ta1LaunchTime, taskAttempt.getLaunchTime());
assertEquals(ta1FinishedTime, taskAttempt.getFinishTime());
}
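The recovery data built above is what the VertexImpl snippets further down read back. A minimal, hedged sketch of that pairing (assuming VertexRecoveryData exposes the isVertexStarted()/getVertexStartedEvent() accessors the way VertexImpl.startVertex() uses them):
VertexRecoveryData v2RecoveryData = new VertexRecoveryData(v2InitedEvent, v2ReconfigureDoneEvent, v2StartedEvent, null, taskRecoveryDataMap, false);
// the VertexStartedEvent fed in here is what startVertex() later consults to restore the original start time
if (v2RecoveryData.isVertexStarted()) {
  long recoveredStartTime = v2RecoveryData.getVertexStartedEvent().getStartTime();
}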
use of org.apache.tez.dag.history.events.VertexStartedEvent in project tez by apache.
the class TestHistoryEventJsonConversion method testHandlerExists.
@Test(timeout = 5000)
public void testHandlerExists() throws JSONException {
for (HistoryEventType eventType : HistoryEventType.values()) {
HistoryEvent event = null;
switch(eventType) {
case APP_LAUNCHED:
event = new AppLaunchedEvent(applicationId, random.nextInt(), random.nextInt(), user, new Configuration(false), null);
break;
case AM_LAUNCHED:
event = new AMLaunchedEvent(applicationAttemptId, random.nextInt(), random.nextInt(), user);
break;
case AM_STARTED:
event = new AMStartedEvent(applicationAttemptId, random.nextInt(), user);
break;
case DAG_SUBMITTED:
event = new DAGSubmittedEvent(tezDAGID, random.nextInt(), dagPlan, applicationAttemptId, null, user, null, null, "Q_" + eventType.name());
break;
case DAG_INITIALIZED:
event = new DAGInitializedEvent(tezDAGID, random.nextInt(), user, dagPlan.getName(), null);
break;
case DAG_STARTED:
event = new DAGStartedEvent(tezDAGID, random.nextInt(), user, dagPlan.getName());
break;
case DAG_FINISHED:
event = new DAGFinishedEvent(tezDAGID, random.nextInt(), random.nextInt(), DAGState.ERROR, null, null, user, dagPlan.getName(), null, applicationAttemptId, dagPlan);
break;
case VERTEX_INITIALIZED:
event = new VertexInitializedEvent(tezVertexID, "v1", random.nextInt(), random.nextInt(), random.nextInt(), "proc", null, null, null);
break;
case VERTEX_STARTED:
event = new VertexStartedEvent(tezVertexID, random.nextInt(), random.nextInt());
break;
case VERTEX_CONFIGURE_DONE:
event = new VertexConfigurationDoneEvent(tezVertexID, 0L, 1, null, null, null, true);
break;
case VERTEX_FINISHED:
event = new VertexFinishedEvent(tezVertexID, "v1", 1, random.nextInt(), random.nextInt(), random.nextInt(), random.nextInt(), random.nextInt(), VertexState.ERROR, null, null, null, null, null);
break;
case TASK_STARTED:
event = new TaskStartedEvent(tezTaskID, "v1", random.nextInt(), random.nextInt());
break;
case TASK_FINISHED:
event = new TaskFinishedEvent(tezTaskID, "v1", random.nextInt(), random.nextInt(), tezTaskAttemptID, TaskState.FAILED, null, null, 0);
break;
case TASK_ATTEMPT_STARTED:
event = new TaskAttemptStartedEvent(tezTaskAttemptID, "v1", random.nextInt(), containerId, nodeId, null, null, "nodeHttpAddress");
break;
case TASK_ATTEMPT_FINISHED:
event = new TaskAttemptFinishedEvent(tezTaskAttemptID, "v1", random.nextInt(), random.nextInt(), TaskAttemptState.KILLED, null, TaskAttemptTerminationCause.TERMINATED_BY_CLIENT, null, null, null, null, 0, null, 0, containerId, nodeId, null, null, "nodeHttpAddress");
break;
case CONTAINER_LAUNCHED:
event = new ContainerLaunchedEvent(containerId, random.nextInt(), applicationAttemptId);
break;
case CONTAINER_STOPPED:
event = new ContainerStoppedEvent(containerId, random.nextInt(), -1, applicationAttemptId);
break;
case DAG_COMMIT_STARTED:
event = new DAGCommitStartedEvent();
break;
case VERTEX_COMMIT_STARTED:
event = new VertexCommitStartedEvent();
break;
case VERTEX_GROUP_COMMIT_STARTED:
event = new VertexGroupCommitStartedEvent();
break;
case VERTEX_GROUP_COMMIT_FINISHED:
event = new VertexGroupCommitFinishedEvent();
break;
case DAG_RECOVERED:
event = new DAGRecoveredEvent(applicationAttemptId, tezDAGID, dagPlan.getName(), user, 1L, null);
break;
case DAG_KILL_REQUEST:
event = new DAGKillRequestEvent();
break;
default:
Assert.fail("Unhandled event type " + eventType);
}
if (event == null || !event.isHistoryEvent()) {
continue;
}
JSONObject json = HistoryEventJsonConversion.convertToJson(event);
if (eventType == HistoryEventType.DAG_SUBMITTED) {
try {
Assert.assertEquals("Q_" + eventType.name(), json.getJSONObject(ATSConstants.OTHER_INFO).getString(ATSConstants.DAG_QUEUE_NAME));
Assert.assertEquals("Q_" + eventType.name(), json.getJSONObject(ATSConstants.PRIMARY_FILTERS).getString(ATSConstants.DAG_QUEUE_NAME));
} catch (JSONException ex) {
Assert.fail("Exception: " + ex.getMessage() + " for type: " + eventType);
}
}
}
}
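Outside the exhaustive loop above, a hedged, illustrative fragment of converting just a VertexStartedEvent to its JSON form (the id and time variables are assumed for illustration):
HistoryEvent started = new VertexStartedEvent(tezVertexID, startRequestedTime, startedTime);
JSONObject startedJson = HistoryEventJsonConversion.convertToJson(started);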
use of org.apache.tez.dag.history.events.VertexStartedEvent in project tez by apache.
the class VertexImpl method startVertex.
private VertexState startVertex() {
Preconditions.checkState(getState() == VertexState.INITED, "Vertex must be inited " + logIdentifier);
if (recoveryData != null && recoveryData.isVertexStarted()) {
VertexStartedEvent vertexStartedEvent = recoveryData.getVertexStartedEvent();
this.startedTime = vertexStartedEvent.getStartTime();
} else {
this.startedTime = clock.getTime();
}
try {
vertexManager.onVertexStarted(getTaskAttemptIdentifiers(dag, pendingReportedSrcCompletions));
} catch (AMUserCodeException e) {
String msg = "Exception in " + e.getSource() + ", vertex=" + logIdentifier;
LOG.error(msg, e);
addDiagnostic(msg + "," + ExceptionUtils.getStackTrace(e.getCause()));
tryEnactKill(VertexTerminationCause.AM_USERCODE_FAILURE, TaskTerminationCause.AM_USERCODE_FAILURE);
return VertexState.TERMINATING;
}
pendingReportedSrcCompletions.clear();
logJobHistoryVertexStartedEvent();
// the vertex is fully configured by the time it starts. Always notify completely configured
// unless the vertex manager has told us that it is going to reconfigure it further.
// If the vertex was pre-configured then the event would have been sent out earlier. Calling again
// would be a no-op. If the vertex was not fully configured and waiting for that to complete then
// we would start immediately after that. Either parallelism updated (now) or IPO changed (future)
// or vertex added (future). Simplify these cases by sending the event now automatically for the
// user as if they had invoked the planned()/done() API's.
maybeSendConfiguredEvent();
// when we are ready
if (targetVertices != null) {
for (Vertex targetVertex : targetVertices.keySet()) {
eventHandler.handle(new VertexEventSourceVertexStarted(targetVertex.getVertexId(), getVertexId(), distanceFromRoot));
}
}
// If we have no tasks, just transition to vertex completed
if (this.numTasks == 0) {
eventHandler.handle(new VertexEvent(this.vertexId, VertexEventType.V_COMPLETED));
}
return VertexState.RUNNING;
}
use of org.apache.tez.dag.history.events.VertexStartedEvent in project tez by apache.
the class VertexImpl method logJobHistoryVertexStartedEvent.
void logJobHistoryVertexStartedEvent() {
if (recoveryData == null || !recoveryData.isVertexStarted()) {
VertexStartedEvent startEvt = new VertexStartedEvent(vertexId, startTimeRequested, startedTime);
this.appContext.getHistoryHandler().handle(new DAGHistoryEvent(getDAGId(), startEvt));
}
}
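The two VertexImpl methods above form the write/read pair for VertexStartedEvent. A hedged fragment of that round trip (variable names assumed):
// write side: logJobHistoryVertexStartedEvent() records when the vertex actually started
VertexStartedEvent startEvt = new VertexStartedEvent(vertexId, startTimeRequested, startedTime);
// read side: on recovery, startVertex() restores the original start time from the logged event
long restoredStartTime = startEvt.getStartTime();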
use of org.apache.tez.dag.history.events.VertexStartedEvent in project tez by apache.
the class RecoveryParser method parseRecoveryData.
/**
* 1. Read Summary Recovery file and build DAGSummaryData
* Check whether it is recoverable based on the summary file (whether dag is
* in the middle of committing)
* 2. Read the non-Summary Recovery file and build DAGRecoveryData
* Check whether it is recoverable based on both the summary file and non-summary file
* (whether vertex has completed its committing, but its full non-summary recovery events are not seen)
* @return DAGRecoveryData
* @throws IOException
*/
public DAGRecoveryData parseRecoveryData() throws IOException {
int dagCounter = 0;
Map<TezDAGID, DAGSummaryData> dagSummaryDataMap = new HashMap<TezDAGID, DAGSummaryData>();
List<Path> summaryFiles = getSummaryFiles();
LOG.debug("SummaryFile size:" + summaryFiles.size());
for (Path summaryFile : summaryFiles) {
FileStatus summaryFileStatus = recoveryFS.getFileStatus(summaryFile);
LOG.info("Parsing summary file" + ", path=" + summaryFile.toString() + ", len=" + summaryFileStatus.getLen() + ", lastModTime=" + summaryFileStatus.getModificationTime());
FSDataInputStream summaryStream = getSummaryStream(summaryFile);
while (true) {
RecoveryProtos.SummaryEventProto proto;
try {
proto = RecoveryProtos.SummaryEventProto.parseDelimitedFrom(summaryStream);
if (proto == null) {
LOG.info("Reached end of summary stream");
break;
}
} catch (EOFException eof) {
LOG.info("Reached end of summary stream");
break;
}
HistoryEventType eventType = HistoryEventType.values()[proto.getEventType()];
if (LOG.isDebugEnabled()) {
LOG.debug("[RECOVERY SUMMARY]" + " dagId=" + proto.getDagId() + ", timestamp=" + proto.getTimestamp() + ", event=" + eventType);
}
TezDAGID dagId;
try {
dagId = TezDAGID.fromString(proto.getDagId());
} catch (IllegalArgumentException e) {
throw new IOException("Invalid dagId, summary records may be corrupted", e);
}
if (dagCounter < dagId.getId()) {
dagCounter = dagId.getId();
}
if (!dagSummaryDataMap.containsKey(dagId)) {
dagSummaryDataMap.put(dagId, new DAGSummaryData(dagId));
}
try {
dagSummaryDataMap.get(dagId).handleSummaryEvent(proto);
} catch (Exception e) {
// any exception when parsing protobuf
throw new IOException("Error when parsing summary event proto", e);
}
}
summaryStream.close();
}
// Set counter for next set of DAGs & update dagNames Set in DAGAppMaster
dagAppMaster.setDAGCounter(dagCounter);
for (DAGSummaryData dagSummaryData : dagSummaryDataMap.values()) {
dagAppMaster.dagIDs.add(dagSummaryData.dagId.toString());
}
DAGSummaryData lastInProgressDAGData = getLastCompletedOrInProgressDAG(dagSummaryDataMap);
if (lastInProgressDAGData == null) {
LOG.info("Nothing to recover as no uncompleted/completed DAGs found");
return null;
}
TezDAGID lastInProgressDAG = lastInProgressDAGData.dagId;
if (lastInProgressDAG == null) {
LOG.info("Nothing to recover as no uncompleted/completed DAGs found");
return null;
}
LOG.info("Checking if DAG is in recoverable state" + ", dagId=" + lastInProgressDAGData.dagId);
final DAGRecoveryData recoveredDAGData = new DAGRecoveryData(lastInProgressDAGData);
List<Path> dagRecoveryFiles = getDAGRecoveryFiles(lastInProgressDAG);
boolean skipAllOtherEvents = false;
Path lastRecoveryFile = null;
// to create the DAGImpl)
for (Path dagRecoveryFile : dagRecoveryFiles) {
if (skipAllOtherEvents) {
LOG.warn("Other recovery files will be skipped due to error in the previous recovery file" + lastRecoveryFile);
break;
}
FileStatus fileStatus = recoveryFS.getFileStatus(dagRecoveryFile);
lastRecoveryFile = dagRecoveryFile;
LOG.info("Trying to recover dag from recovery file" + ", dagId=" + lastInProgressDAG.toString() + ", dagRecoveryFile=" + dagRecoveryFile + ", len=" + fileStatus.getLen());
FSDataInputStream dagRecoveryStream = recoveryFS.open(dagRecoveryFile, recoveryBufferSize);
while (true) {
HistoryEvent event;
try {
event = getNextEvent(dagRecoveryStream);
if (event == null) {
LOG.info("Reached end of dag recovery stream");
break;
}
} catch (EOFException eof) {
LOG.info("Reached end of dag recovery stream");
break;
} catch (IOException ioe) {
LOG.warn("Corrupt data found when trying to read next event", ioe);
break;
}
if (skipAllOtherEvents) {
// hit an error - skip reading other events
break;
}
HistoryEventType eventType = event.getEventType();
LOG.info("Recovering from event" + ", eventType=" + eventType + ", event=" + event.toString());
switch(eventType) {
case DAG_SUBMITTED:
DAGSubmittedEvent submittedEvent = (DAGSubmittedEvent) event;
recoveredDAGData.recoveredDAG = dagAppMaster.createDAG(submittedEvent.getDAGPlan(), lastInProgressDAG);
recoveredDAGData.cumulativeAdditionalResources = submittedEvent.getCumulativeAdditionalLocalResources();
recoveredDAGData.recoveredDagID = recoveredDAGData.recoveredDAG.getID();
dagAppMaster.setCurrentDAG(recoveredDAGData.recoveredDAG);
if (recoveredDAGData.nonRecoverable) {
skipAllOtherEvents = true;
}
break;
case DAG_INITIALIZED:
recoveredDAGData.dagInitedEvent = (DAGInitializedEvent) event;
break;
case DAG_STARTED:
recoveredDAGData.dagStartedEvent = (DAGStartedEvent) event;
break;
case DAG_FINISHED:
recoveredDAGData.dagFinishedEvent = (DAGFinishedEvent) event;
skipAllOtherEvents = true;
break;
case DAG_COMMIT_STARTED:
case VERTEX_GROUP_COMMIT_STARTED:
case VERTEX_GROUP_COMMIT_FINISHED:
case CONTAINER_LAUNCHED:
{
// Nothing to do for now
break;
}
case DAG_KILL_REQUEST:
{
break;
}
case VERTEX_INITIALIZED:
{
VertexInitializedEvent vertexInitEvent = (VertexInitializedEvent) event;
VertexRecoveryData vertexRecoveryData = recoveredDAGData.maybeCreateVertexRecoveryData(vertexInitEvent.getVertexID());
vertexRecoveryData.vertexInitedEvent = vertexInitEvent;
break;
}
case VERTEX_CONFIGURE_DONE:
{
VertexConfigurationDoneEvent reconfigureDoneEvent = (VertexConfigurationDoneEvent) event;
VertexRecoveryData vertexRecoveryData = recoveredDAGData.maybeCreateVertexRecoveryData(reconfigureDoneEvent.getVertexID());
vertexRecoveryData.vertexConfigurationDoneEvent = reconfigureDoneEvent;
break;
}
case VERTEX_STARTED:
{
VertexStartedEvent vertexStartedEvent = (VertexStartedEvent) event;
VertexRecoveryData vertexRecoveryData = recoveredDAGData.vertexRecoveryDataMap.get(vertexStartedEvent.getVertexID());
Preconditions.checkArgument(vertexRecoveryData != null, "No VertexInitializedEvent before VertexStartedEvent");
vertexRecoveryData.vertexStartedEvent = vertexStartedEvent;
break;
}
case VERTEX_COMMIT_STARTED:
{
break;
}
case VERTEX_FINISHED:
{
VertexFinishedEvent vertexFinishedEvent = (VertexFinishedEvent) event;
VertexRecoveryData vertexRecoveryData = recoveredDAGData.maybeCreateVertexRecoveryData(vertexFinishedEvent.getVertexID());
vertexRecoveryData.vertexFinishedEvent = vertexFinishedEvent;
break;
}
case TASK_STARTED:
{
TaskStartedEvent taskStartedEvent = (TaskStartedEvent) event;
VertexRecoveryData vertexRecoveryData = recoveredDAGData.vertexRecoveryDataMap.get(taskStartedEvent.getTaskID().getVertexID());
Preconditions.checkArgument(vertexRecoveryData != null, "Invalid TaskStartedEvent, its vertex does not exist:" + taskStartedEvent.getTaskID().getVertexID());
TaskRecoveryData taskRecoveryData = vertexRecoveryData.maybeCreateTaskRecoveryData(taskStartedEvent.getTaskID());
taskRecoveryData.taskStartedEvent = taskStartedEvent;
break;
}
case TASK_FINISHED:
{
TaskFinishedEvent taskFinishedEvent = (TaskFinishedEvent) event;
VertexRecoveryData vertexRecoveryData = recoveredDAGData.vertexRecoveryDataMap.get(taskFinishedEvent.getTaskID().getVertexID());
Preconditions.checkArgument(vertexRecoveryData != null, "Invalid TaskFinishedEvent, its vertex does not exist:" + taskFinishedEvent.getTaskID().getVertexID());
TaskRecoveryData taskRecoveryData = vertexRecoveryData.maybeCreateTaskRecoveryData(taskFinishedEvent.getTaskID());
taskRecoveryData.taskFinishedEvent = taskFinishedEvent;
break;
}
case TASK_ATTEMPT_STARTED:
{
TaskAttemptStartedEvent taStartedEvent = (TaskAttemptStartedEvent) event;
VertexRecoveryData vertexRecoveryData = recoveredDAGData.vertexRecoveryDataMap.get(taStartedEvent.getTaskAttemptID().getTaskID().getVertexID());
Preconditions.checkArgument(vertexRecoveryData != null, "Invalid TaskAttemptStartedEvent, its vertexId does not exist, taId=" + taStartedEvent.getTaskAttemptID());
TaskRecoveryData taskRecoveryData = vertexRecoveryData.taskRecoveryDataMap.get(taStartedEvent.getTaskAttemptID().getTaskID());
Preconditions.checkArgument(taskRecoveryData != null, "Invalid TaskAttemptStartedEvent, its taskId does not exist, taId=" + taStartedEvent.getTaskAttemptID());
TaskAttemptRecoveryData taRecoveryData = taskRecoveryData.maybeCreateTaskAttemptRecoveryData(taStartedEvent.getTaskAttemptID());
taRecoveryData.taStartedEvent = taStartedEvent;
break;
}
case TASK_ATTEMPT_FINISHED:
{
TaskAttemptFinishedEvent taFinishedEvent = (TaskAttemptFinishedEvent) event;
VertexRecoveryData vertexRecoveryData = recoveredDAGData.vertexRecoveryDataMap.get(taFinishedEvent.getTaskAttemptID().getTaskID().getVertexID());
Preconditions.checkArgument(vertexRecoveryData != null, "Invalid TaskAttemptFinishedEvent, its vertexId does not exist, taId=" + taFinishedEvent.getTaskAttemptID());
TaskRecoveryData taskRecoveryData = vertexRecoveryData.taskRecoveryDataMap.get(taFinishedEvent.getTaskAttemptID().getTaskID());
Preconditions.checkArgument(taskRecoveryData != null, "Invalid TaskAttemptFinishedEvent, its taskId does not exist, taId=" + taFinishedEvent.getTaskAttemptID());
TaskAttemptRecoveryData taRecoveryData = taskRecoveryData.maybeCreateTaskAttemptRecoveryData(taFinishedEvent.getTaskAttemptID());
taRecoveryData.taFinishedEvent = taFinishedEvent;
break;
}
default:
throw new RuntimeException("Invalid data found, unknown event type " + eventType);
}
if (LOG.isDebugEnabled()) {
LOG.debug("[DAG RECOVERY]" + " dagId=" + lastInProgressDAG + ", eventType=" + eventType + ", event=" + event.toString());
}
}
dagRecoveryStream.close();
}
recoveredDAGData.checkRecoverableNonSummary();
return recoveredDAGData;
}
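A hedged sketch of how the returned DAGRecoveryData is typically consumed; the RecoveryParser construction shown here is an assumption for illustration, while the DAGEventRecoverEvent hand-off mirrors the TestDAGRecovery snippet above:
RecoveryParser parser = new RecoveryParser(dagAppMaster, recoveryFS, recoveryDataDir, recoveryBufferSize); // constructor arguments assumed
DAGRecoveryData dagRecoveryData = parser.parseRecoveryData();
if (dagRecoveryData != null && !dagRecoveryData.nonRecoverable) {
  // replay the recovered state into the DAG state machine, as the test above does
  dag.handle(new DAGEventRecoverEvent(dagRecoveryData.recoveredDAG.getID(), dagRecoveryData));
}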