use of org.apache.tez.dag.history.events.VertexGroupCommitFinishedEvent in project tez by apache.
the class DAGImpl method commitCompleted.
private boolean commitCompleted(DAGEventCommitCompleted commitCompletedEvent) {
Preconditions.checkState(commitFutures.remove(commitCompletedEvent.getOutputKey()) != null, "Unknown commit:" + commitCompletedEvent.getOutputKey());
boolean commitFailed = false;
boolean recoveryFailed = false;
if (commitCompletedEvent.isSucceeded()) {
LOG.info("Commit succeeded for output:" + commitCompletedEvent.getOutputKey());
OutputKey outputKey = commitCompletedEvent.getOutputKey();
if (outputKey.isVertexGroupOutput) {
VertexGroupInfo vertexGroup = vertexGroups.get(outputKey.getEntityName());
vertexGroup.successfulCommits++;
if (vertexGroup.isCommitted()) {
if (!commitAllOutputsOnSuccess) {
try {
Collection<TezVertexID> vertexIds = getVertexIds(vertexGroup.groupMembers);
appContext.getHistoryHandler().handleCriticalEvent(new DAGHistoryEvent(getID(), new VertexGroupCommitFinishedEvent(getID(), commitCompletedEvent.getOutputKey().getEntityName(), vertexIds, clock.getTime())));
} catch (IOException e) {
String diag = "Failed to send commit recovery event to handler, " + ExceptionUtils.getStackTrace(e);
addDiagnostic(diag);
LOG.error(diag);
recoveryFailed = true;
}
}
}
}
} else {
String diag = "Commit failed for output: " + commitCompletedEvent.getOutputKey() + ", " + ExceptionUtils.getStackTrace(commitCompletedEvent.getException());
addDiagnostic(diag);
LOG.error(diag);
commitFailed = true;
}
if (commitFailed) {
enactKill(DAGTerminationCause.COMMIT_FAILURE, VertexTerminationCause.OTHER_VERTEX_FAILURE);
cancelCommits();
}
if (recoveryFailed) {
enactKill(DAGTerminationCause.RECOVERY_FAILURE, VertexTerminationCause.OTHER_VERTEX_FAILURE);
cancelCommits();
}
return !commitFailed && !recoveryFailed;
}
use of org.apache.tez.dag.history.events.VertexGroupCommitFinishedEvent in project tez by apache.
the class TestHistoryEventTimelineConversion method testHandlerExists.
@Test(timeout = 5000)
public void testHandlerExists() throws JSONException {
for (HistoryEventType eventType : HistoryEventType.values()) {
HistoryEvent event = null;
switch(eventType) {
case APP_LAUNCHED:
event = new AppLaunchedEvent(applicationId, random.nextInt(), random.nextInt(), user, new Configuration(false), null);
break;
case AM_LAUNCHED:
event = new AMLaunchedEvent(applicationAttemptId, random.nextInt(), random.nextInt(), user);
break;
case AM_STARTED:
event = new AMStartedEvent(applicationAttemptId, random.nextInt(), user);
break;
case DAG_SUBMITTED:
event = new DAGSubmittedEvent(tezDAGID, random.nextInt(), dagPlan, applicationAttemptId, null, user, null, containerLogs, null);
break;
case DAG_INITIALIZED:
event = new DAGInitializedEvent(tezDAGID, random.nextInt(), user, dagPlan.getName(), null);
break;
case DAG_STARTED:
event = new DAGStartedEvent(tezDAGID, random.nextInt(), user, dagPlan.getName());
break;
case DAG_FINISHED:
event = new DAGFinishedEvent(tezDAGID, random.nextInt(), random.nextInt(), DAGState.ERROR, null, null, user, dagPlan.getName(), null, applicationAttemptId, dagPlan);
break;
case VERTEX_INITIALIZED:
event = new VertexInitializedEvent(tezVertexID, "v1", random.nextInt(), random.nextInt(), random.nextInt(), "proc", null, null, null);
break;
case VERTEX_STARTED:
event = new VertexStartedEvent(tezVertexID, random.nextInt(), random.nextInt());
break;
case VERTEX_CONFIGURE_DONE:
event = new VertexConfigurationDoneEvent(tezVertexID, 0L, 1, null, null, null, true);
break;
case VERTEX_FINISHED:
event = new VertexFinishedEvent(tezVertexID, "v1", 1, random.nextInt(), random.nextInt(), random.nextInt(), random.nextInt(), random.nextInt(), VertexState.ERROR, null, null, null, null, null);
break;
case TASK_STARTED:
event = new TaskStartedEvent(tezTaskID, "v1", random.nextInt(), random.nextInt());
break;
case TASK_FINISHED:
event = new TaskFinishedEvent(tezTaskID, "v1", random.nextInt(), random.nextInt(), tezTaskAttemptID, TaskState.FAILED, null, null, 0);
break;
case TASK_ATTEMPT_STARTED:
event = new TaskAttemptStartedEvent(tezTaskAttemptID, "v1", random.nextInt(), containerId, nodeId, null, null, "nodeHttpAddress");
break;
case TASK_ATTEMPT_FINISHED:
event = new TaskAttemptFinishedEvent(tezTaskAttemptID, "v1", random.nextInt(), random.nextInt(), TaskAttemptState.FAILED, TaskFailureType.NON_FATAL, TaskAttemptTerminationCause.OUTPUT_LOST, null, null, null, null, 0, null, 0, containerId, nodeId, null, null, "nodeHttpAddress");
break;
case CONTAINER_LAUNCHED:
event = new ContainerLaunchedEvent(containerId, random.nextInt(), applicationAttemptId);
break;
case CONTAINER_STOPPED:
event = new ContainerStoppedEvent(containerId, random.nextInt(), -1, applicationAttemptId);
break;
case DAG_COMMIT_STARTED:
event = new DAGCommitStartedEvent();
break;
case VERTEX_COMMIT_STARTED:
event = new VertexCommitStartedEvent();
break;
case VERTEX_GROUP_COMMIT_STARTED:
event = new VertexGroupCommitStartedEvent();
break;
case VERTEX_GROUP_COMMIT_FINISHED:
event = new VertexGroupCommitFinishedEvent();
break;
case DAG_RECOVERED:
event = new DAGRecoveredEvent(applicationAttemptId, tezDAGID, dagPlan.getName(), user, random.nextLong(), containerLogs);
break;
case DAG_KILL_REQUEST:
event = new DAGKillRequestEvent();
break;
default:
Assert.fail("Unhandled event type " + eventType);
}
if (event == null || !event.isHistoryEvent()) {
continue;
}
HistoryEventTimelineConversion.convertToTimelineEntities(event);
}
}
use of org.apache.tez.dag.history.events.VertexGroupCommitFinishedEvent in project tez by apache.
the class TestRecoveryParser method testRecoverableSummary_VertexGroupFinishCommitting.
@Test(timeout = 5000)
public void testRecoverableSummary_VertexGroupFinishCommitting() throws IOException {
ApplicationId appId = ApplicationId.newInstance(System.currentTimeMillis(), 1);
TezDAGID dagID = TezDAGID.getInstance(appId, 1);
AppContext appContext = mock(AppContext.class);
when(appContext.getCurrentRecoveryDir()).thenReturn(new Path(recoveryPath + "/1"));
when(appContext.getClock()).thenReturn(new SystemClock());
when(mockDAGImpl.getID()).thenReturn(dagID);
when(appContext.getHadoopShim()).thenReturn(new DefaultHadoopShim());
when(appContext.getApplicationID()).thenReturn(appId);
RecoveryService rService = new RecoveryService(appContext);
Configuration conf = new Configuration();
conf.setBoolean(RecoveryService.TEZ_TEST_RECOVERY_DRAIN_EVENTS_WHEN_STOPPED, true);
rService.init(conf);
rService.start();
DAGPlan dagPlan = TestDAGImpl.createTestDAGPlan();
// write a DAGSubmittedEvent first to initialize summaryStream
rService.handle(new DAGHistoryEvent(dagID, new DAGSubmittedEvent(dagID, 1L, dagPlan, ApplicationAttemptId.newInstance(appId, 1), null, "user", new Configuration(), null, null)));
// It should be fine to skip other events, just for testing.
TezVertexID v0 = TezVertexID.getInstance(dagID, 0);
TezVertexID v1 = TezVertexID.getInstance(dagID, 1);
rService.handle(new DAGHistoryEvent(dagID, new VertexGroupCommitStartedEvent(dagID, "group_1", Lists.newArrayList(v0, v1), 0L)));
rService.handle(new DAGHistoryEvent(dagID, new VertexGroupCommitFinishedEvent(dagID, "group_1", Lists.newArrayList(v0, v1), 0L)));
// also write VertexFinishedEvent, otherwise it is still non-recoverable
// when checking with non-summary event
rService.handle(new DAGHistoryEvent(dagID, new VertexFinishedEvent(v0, "v1", 10, 0L, 0L, 0L, 0L, 0L, VertexState.SUCCEEDED, "", null, null, null, null)));
rService.handle(new DAGHistoryEvent(dagID, new VertexFinishedEvent(v1, "v1", 10, 0L, 0L, 0L, 0L, 0L, VertexState.SUCCEEDED, "", null, null, null, null)));
rService.stop();
DAGRecoveryData dagData = parser.parseRecoveryData();
assertEquals(dagID, dagData.recoveredDagID);
assertFalse(dagData.nonRecoverable);
}
use of org.apache.tez.dag.history.events.VertexGroupCommitFinishedEvent in project tez by apache.
the class TestRecoveryParser method testRecoverableNonSummary2.
@Test(timeout = 5000)
public void testRecoverableNonSummary2() throws IOException {
ApplicationId appId = ApplicationId.newInstance(System.currentTimeMillis(), 1);
TezDAGID dagID = TezDAGID.getInstance(appId, 1);
AppContext appContext = mock(AppContext.class);
when(appContext.getCurrentRecoveryDir()).thenReturn(new Path(recoveryPath + "/1"));
when(appContext.getClock()).thenReturn(new SystemClock());
when(mockDAGImpl.getID()).thenReturn(dagID);
// MockRecoveryService will skip the non-summary event
MockRecoveryService rService = new MockRecoveryService(appContext);
Configuration conf = new Configuration();
conf.setBoolean(RecoveryService.TEZ_TEST_RECOVERY_DRAIN_EVENTS_WHEN_STOPPED, true);
rService.init(conf);
rService.start();
DAGPlan dagPlan = TestDAGImpl.createTestDAGPlan();
// write a DAGSubmittedEvent first to initialize summaryStream
rService.handle(new DAGHistoryEvent(dagID, new DAGSubmittedEvent(dagID, 1L, dagPlan, ApplicationAttemptId.newInstance(appId, 1), null, "user", new Configuration(), null, null)));
// It should be fine to skip other events, just for testing.
TezVertexID vertexId = TezVertexID.getInstance(dagID, 0);
rService.handle(new DAGHistoryEvent(dagID, new VertexGroupCommitStartedEvent(dagID, "group_1", Lists.newArrayList(TezVertexID.getInstance(dagID, 0), TezVertexID.getInstance(dagID, 1)), 0L)));
rService.handle(new DAGHistoryEvent(dagID, new VertexGroupCommitFinishedEvent(dagID, "group_1", Lists.newArrayList(TezVertexID.getInstance(dagID, 0), TezVertexID.getInstance(dagID, 1)), 0L)));
rService.stop();
DAGRecoveryData dagData = parser.parseRecoveryData();
assertTrue(dagData.nonRecoverable);
assertTrue(dagData.reason.contains("Vertex has been committed as member of vertex group" + ", but its full recovery events are not seen"));
}
use of org.apache.tez.dag.history.events.VertexGroupCommitFinishedEvent in project tez by apache.
the class TestHistoryEventJsonConversion method testHandlerExists.
@Test(timeout = 5000)
public void testHandlerExists() throws JSONException {
for (HistoryEventType eventType : HistoryEventType.values()) {
HistoryEvent event = null;
switch(eventType) {
case APP_LAUNCHED:
event = new AppLaunchedEvent(applicationId, random.nextInt(), random.nextInt(), user, new Configuration(false), null);
break;
case AM_LAUNCHED:
event = new AMLaunchedEvent(applicationAttemptId, random.nextInt(), random.nextInt(), user);
break;
case AM_STARTED:
event = new AMStartedEvent(applicationAttemptId, random.nextInt(), user);
break;
case DAG_SUBMITTED:
event = new DAGSubmittedEvent(tezDAGID, random.nextInt(), dagPlan, applicationAttemptId, null, user, null, null, "Q_" + eventType.name());
break;
case DAG_INITIALIZED:
event = new DAGInitializedEvent(tezDAGID, random.nextInt(), user, dagPlan.getName(), null);
break;
case DAG_STARTED:
event = new DAGStartedEvent(tezDAGID, random.nextInt(), user, dagPlan.getName());
break;
case DAG_FINISHED:
event = new DAGFinishedEvent(tezDAGID, random.nextInt(), random.nextInt(), DAGState.ERROR, null, null, user, dagPlan.getName(), null, applicationAttemptId, dagPlan);
break;
case VERTEX_INITIALIZED:
event = new VertexInitializedEvent(tezVertexID, "v1", random.nextInt(), random.nextInt(), random.nextInt(), "proc", null, null, null);
break;
case VERTEX_STARTED:
event = new VertexStartedEvent(tezVertexID, random.nextInt(), random.nextInt());
break;
case VERTEX_CONFIGURE_DONE:
event = new VertexConfigurationDoneEvent(tezVertexID, 0L, 1, null, null, null, true);
break;
case VERTEX_FINISHED:
event = new VertexFinishedEvent(tezVertexID, "v1", 1, random.nextInt(), random.nextInt(), random.nextInt(), random.nextInt(), random.nextInt(), VertexState.ERROR, null, null, null, null, null);
break;
case TASK_STARTED:
event = new TaskStartedEvent(tezTaskID, "v1", random.nextInt(), random.nextInt());
break;
case TASK_FINISHED:
event = new TaskFinishedEvent(tezTaskID, "v1", random.nextInt(), random.nextInt(), tezTaskAttemptID, TaskState.FAILED, null, null, 0);
break;
case TASK_ATTEMPT_STARTED:
event = new TaskAttemptStartedEvent(tezTaskAttemptID, "v1", random.nextInt(), containerId, nodeId, null, null, "nodeHttpAddress");
break;
case TASK_ATTEMPT_FINISHED:
event = new TaskAttemptFinishedEvent(tezTaskAttemptID, "v1", random.nextInt(), random.nextInt(), TaskAttemptState.KILLED, null, TaskAttemptTerminationCause.TERMINATED_BY_CLIENT, null, null, null, null, 0, null, 0, containerId, nodeId, null, null, "nodeHttpAddress");
break;
case CONTAINER_LAUNCHED:
event = new ContainerLaunchedEvent(containerId, random.nextInt(), applicationAttemptId);
break;
case CONTAINER_STOPPED:
event = new ContainerStoppedEvent(containerId, random.nextInt(), -1, applicationAttemptId);
break;
case DAG_COMMIT_STARTED:
event = new DAGCommitStartedEvent();
break;
case VERTEX_COMMIT_STARTED:
event = new VertexCommitStartedEvent();
break;
case VERTEX_GROUP_COMMIT_STARTED:
event = new VertexGroupCommitStartedEvent();
break;
case VERTEX_GROUP_COMMIT_FINISHED:
event = new VertexGroupCommitFinishedEvent();
break;
case DAG_RECOVERED:
event = new DAGRecoveredEvent(applicationAttemptId, tezDAGID, dagPlan.getName(), user, 1l, null);
break;
case DAG_KILL_REQUEST:
event = new DAGKillRequestEvent();
break;
default:
Assert.fail("Unhandled event type " + eventType);
}
if (event == null || !event.isHistoryEvent()) {
continue;
}
JSONObject json = HistoryEventJsonConversion.convertToJson(event);
if (eventType == HistoryEventType.DAG_SUBMITTED) {
try {
Assert.assertEquals("Q_" + eventType.name(), json.getJSONObject(ATSConstants.OTHER_INFO).getString(ATSConstants.DAG_QUEUE_NAME));
Assert.assertEquals("Q_" + eventType.name(), json.getJSONObject(ATSConstants.PRIMARY_FILTERS).getString(ATSConstants.DAG_QUEUE_NAME));
} catch (JSONException ex) {
Assert.fail("Exception: " + ex.getMessage() + " for type: " + eventType);
}
}
}
}
Aggregations