use of org.apache.tez.dag.history.events.VertexGroupCommitStartedEvent in project tez by apache.
the class DAGImpl method vertexSucceeded.
private boolean vertexSucceeded(Vertex vertex) {
numSuccessfulVertices++;
boolean recoveryFailed = false;
if (!commitAllOutputsOnSuccess && isCommittable()) {
// committing successful outputs immediately. check for shared outputs
List<VertexGroupInfo> groupsList = vertexGroupInfo.get(vertex.getName());
if (groupsList != null) {
List<VertexGroupInfo> commitList = Lists.newArrayListWithCapacity(groupsList.size());
for (VertexGroupInfo groupInfo : groupsList) {
groupInfo.successfulMembers++;
if (groupInfo.groupMembers.size() == groupInfo.successfulMembers && !groupInfo.outputs.isEmpty()) {
// group has outputs and all vertex members are done
LOG.info("All members of group: " + groupInfo.groupName + " are succeeded. Commiting outputs");
commitList.add(groupInfo);
}
}
for (VertexGroupInfo groupInfo : commitList) {
if (recoveryData != null && recoveryData.isVertexGroupCommitted(groupInfo.groupName)) {
LOG.info("VertexGroup was already committed as per recovery" + " data, groupName=" + groupInfo.groupName);
for (String vertexName : groupInfo.groupMembers) {
VertexRecoveryData vertexRecoveryData = recoveryData.getVertexRecoveryData(getVertex(vertexName).getVertexId());
Preconditions.checkArgument(vertexRecoveryData != null, "Vertex Group has been committed" + ", but no VertexRecoveryData found for its vertex " + vertexName);
VertexFinishedEvent vertexFinishedEvent = vertexRecoveryData.getVertexFinishedEvent();
Preconditions.checkArgument(vertexFinishedEvent != null, "Vertex Group has been committed" + ", but no VertexFinishedEvent found in its vertex " + vertexName);
Preconditions.checkArgument(vertexFinishedEvent.getState() == VertexState.SUCCEEDED, "Vertex Group has been committed, but unexpected vertex state of its vertex " + vertexName + ", vertexstate=" + vertexFinishedEvent.getState());
}
continue;
}
groupInfo.commitStarted = true;
final Vertex v = getVertex(groupInfo.groupMembers.iterator().next());
try {
Collection<TezVertexID> vertexIds = getVertexIds(groupInfo.groupMembers);
appContext.getHistoryHandler().handleCriticalEvent(new DAGHistoryEvent(getID(), new VertexGroupCommitStartedEvent(dagId, groupInfo.groupName, vertexIds, clock.getTime())));
} catch (IOException e) {
LOG.error("Failed to send commit recovery event to handler", e);
recoveryFailed = true;
}
if (!recoveryFailed) {
for (final String outputName : groupInfo.outputs) {
OutputKey outputKey = new OutputKey(outputName, groupInfo.groupName, true);
CommitCallback groupCommitCallback = new CommitCallback(outputKey);
CallableEvent groupCommitCallableEvent = new CallableEvent(groupCommitCallback) {
public Void call() throws Exception {
OutputCommitter committer = v.getOutputCommitters().get(outputName);
LOG.info("Committing output: " + outputName);
commitOutput(committer);
return null;
}
};
ListenableFuture<Void> groupCommitFuture = appContext.getExecService().submit(groupCommitCallableEvent);
Futures.addCallback(groupCommitFuture, groupCommitCallableEvent.getCallback());
commitFutures.put(outputKey, groupCommitFuture);
}
}
}
}
}
if (recoveryFailed) {
LOG.info("Recovery failure occurred during commit");
enactKill(DAGTerminationCause.RECOVERY_FAILURE, VertexTerminationCause.COMMIT_FAILURE);
}
return !recoveryFailed;
}
use of org.apache.tez.dag.history.events.VertexGroupCommitStartedEvent in project tez by apache.
the class TestRecoveryParser method testRecoverableSummary_VertexGroupInCommitting.
@Test(timeout = 5000)
public void testRecoverableSummary_VertexGroupInCommitting() throws IOException {
ApplicationId appId = ApplicationId.newInstance(System.currentTimeMillis(), 1);
TezDAGID dagID = TezDAGID.getInstance(appId, 1);
AppContext appContext = mock(AppContext.class);
when(appContext.getCurrentRecoveryDir()).thenReturn(new Path(recoveryPath + "/1"));
when(appContext.getClock()).thenReturn(new SystemClock());
when(mockDAGImpl.getID()).thenReturn(dagID);
when(appContext.getHadoopShim()).thenReturn(new DefaultHadoopShim());
when(appContext.getApplicationID()).thenReturn(appId);
RecoveryService rService = new RecoveryService(appContext);
Configuration conf = new Configuration();
conf.setBoolean(RecoveryService.TEZ_TEST_RECOVERY_DRAIN_EVENTS_WHEN_STOPPED, true);
rService.init(conf);
rService.start();
DAGPlan dagPlan = TestDAGImpl.createTestDAGPlan();
// write a DAGSubmittedEvent first to initialize summaryStream
rService.handle(new DAGHistoryEvent(dagID, new DAGSubmittedEvent(dagID, 1L, dagPlan, ApplicationAttemptId.newInstance(appId, 1), null, "user", new Configuration(), null, null)));
// It should be fine to skip other events, just for testing.
rService.handle(new DAGHistoryEvent(dagID, new VertexGroupCommitStartedEvent(dagID, "group_1", Lists.newArrayList(TezVertexID.getInstance(dagID, 0), TezVertexID.getInstance(dagID, 1)), 0L)));
rService.stop();
DAGRecoveryData dagData = parser.parseRecoveryData();
assertEquals(dagID, dagData.recoveredDagID);
assertTrue(dagData.nonRecoverable);
assertTrue(dagData.reason.contains("Vertex Group Commit was in progress"));
}
use of org.apache.tez.dag.history.events.VertexGroupCommitStartedEvent in project tez by apache.
the class TestHistoryEventTimelineConversion method testHandlerExists.
@Test(timeout = 5000)
public void testHandlerExists() throws JSONException {
for (HistoryEventType eventType : HistoryEventType.values()) {
HistoryEvent event = null;
switch(eventType) {
case APP_LAUNCHED:
event = new AppLaunchedEvent(applicationId, random.nextInt(), random.nextInt(), user, new Configuration(false), null);
break;
case AM_LAUNCHED:
event = new AMLaunchedEvent(applicationAttemptId, random.nextInt(), random.nextInt(), user);
break;
case AM_STARTED:
event = new AMStartedEvent(applicationAttemptId, random.nextInt(), user);
break;
case DAG_SUBMITTED:
event = new DAGSubmittedEvent(tezDAGID, random.nextInt(), dagPlan, applicationAttemptId, null, user, null, containerLogs, null);
break;
case DAG_INITIALIZED:
event = new DAGInitializedEvent(tezDAGID, random.nextInt(), user, dagPlan.getName(), null);
break;
case DAG_STARTED:
event = new DAGStartedEvent(tezDAGID, random.nextInt(), user, dagPlan.getName());
break;
case DAG_FINISHED:
event = new DAGFinishedEvent(tezDAGID, random.nextInt(), random.nextInt(), DAGState.ERROR, null, null, user, dagPlan.getName(), null, applicationAttemptId, dagPlan);
break;
case VERTEX_INITIALIZED:
event = new VertexInitializedEvent(tezVertexID, "v1", random.nextInt(), random.nextInt(), random.nextInt(), "proc", null, null, null);
break;
case VERTEX_STARTED:
event = new VertexStartedEvent(tezVertexID, random.nextInt(), random.nextInt());
break;
case VERTEX_CONFIGURE_DONE:
event = new VertexConfigurationDoneEvent(tezVertexID, 0L, 1, null, null, null, true);
break;
case VERTEX_FINISHED:
event = new VertexFinishedEvent(tezVertexID, "v1", 1, random.nextInt(), random.nextInt(), random.nextInt(), random.nextInt(), random.nextInt(), VertexState.ERROR, null, null, null, null, null);
break;
case TASK_STARTED:
event = new TaskStartedEvent(tezTaskID, "v1", random.nextInt(), random.nextInt());
break;
case TASK_FINISHED:
event = new TaskFinishedEvent(tezTaskID, "v1", random.nextInt(), random.nextInt(), tezTaskAttemptID, TaskState.FAILED, null, null, 0);
break;
case TASK_ATTEMPT_STARTED:
event = new TaskAttemptStartedEvent(tezTaskAttemptID, "v1", random.nextInt(), containerId, nodeId, null, null, "nodeHttpAddress");
break;
case TASK_ATTEMPT_FINISHED:
event = new TaskAttemptFinishedEvent(tezTaskAttemptID, "v1", random.nextInt(), random.nextInt(), TaskAttemptState.FAILED, TaskFailureType.NON_FATAL, TaskAttemptTerminationCause.OUTPUT_LOST, null, null, null, null, 0, null, 0, containerId, nodeId, null, null, "nodeHttpAddress");
break;
case CONTAINER_LAUNCHED:
event = new ContainerLaunchedEvent(containerId, random.nextInt(), applicationAttemptId);
break;
case CONTAINER_STOPPED:
event = new ContainerStoppedEvent(containerId, random.nextInt(), -1, applicationAttemptId);
break;
case DAG_COMMIT_STARTED:
event = new DAGCommitStartedEvent();
break;
case VERTEX_COMMIT_STARTED:
event = new VertexCommitStartedEvent();
break;
case VERTEX_GROUP_COMMIT_STARTED:
event = new VertexGroupCommitStartedEvent();
break;
case VERTEX_GROUP_COMMIT_FINISHED:
event = new VertexGroupCommitFinishedEvent();
break;
case DAG_RECOVERED:
event = new DAGRecoveredEvent(applicationAttemptId, tezDAGID, dagPlan.getName(), user, random.nextLong(), containerLogs);
break;
case DAG_KILL_REQUEST:
event = new DAGKillRequestEvent();
break;
default:
Assert.fail("Unhandled event type " + eventType);
}
if (event == null || !event.isHistoryEvent()) {
continue;
}
HistoryEventTimelineConversion.convertToTimelineEntities(event);
}
}
use of org.apache.tez.dag.history.events.VertexGroupCommitStartedEvent in project tez by apache.
the class TestRecoveryParser method testRecoverableSummary_VertexGroupFinishCommitting.
@Test(timeout = 5000)
public void testRecoverableSummary_VertexGroupFinishCommitting() throws IOException {
ApplicationId appId = ApplicationId.newInstance(System.currentTimeMillis(), 1);
TezDAGID dagID = TezDAGID.getInstance(appId, 1);
AppContext appContext = mock(AppContext.class);
when(appContext.getCurrentRecoveryDir()).thenReturn(new Path(recoveryPath + "/1"));
when(appContext.getClock()).thenReturn(new SystemClock());
when(mockDAGImpl.getID()).thenReturn(dagID);
when(appContext.getHadoopShim()).thenReturn(new DefaultHadoopShim());
when(appContext.getApplicationID()).thenReturn(appId);
RecoveryService rService = new RecoveryService(appContext);
Configuration conf = new Configuration();
conf.setBoolean(RecoveryService.TEZ_TEST_RECOVERY_DRAIN_EVENTS_WHEN_STOPPED, true);
rService.init(conf);
rService.start();
DAGPlan dagPlan = TestDAGImpl.createTestDAGPlan();
// write a DAGSubmittedEvent first to initialize summaryStream
rService.handle(new DAGHistoryEvent(dagID, new DAGSubmittedEvent(dagID, 1L, dagPlan, ApplicationAttemptId.newInstance(appId, 1), null, "user", new Configuration(), null, null)));
// It should be fine to skip other events, just for testing.
TezVertexID v0 = TezVertexID.getInstance(dagID, 0);
TezVertexID v1 = TezVertexID.getInstance(dagID, 1);
rService.handle(new DAGHistoryEvent(dagID, new VertexGroupCommitStartedEvent(dagID, "group_1", Lists.newArrayList(v0, v1), 0L)));
rService.handle(new DAGHistoryEvent(dagID, new VertexGroupCommitFinishedEvent(dagID, "group_1", Lists.newArrayList(v0, v1), 0L)));
// also write VertexFinishedEvent, otherwise it is still non-recoverable
// when checking with non-summary event
rService.handle(new DAGHistoryEvent(dagID, new VertexFinishedEvent(v0, "v1", 10, 0L, 0L, 0L, 0L, 0L, VertexState.SUCCEEDED, "", null, null, null, null)));
rService.handle(new DAGHistoryEvent(dagID, new VertexFinishedEvent(v1, "v1", 10, 0L, 0L, 0L, 0L, 0L, VertexState.SUCCEEDED, "", null, null, null, null)));
rService.stop();
DAGRecoveryData dagData = parser.parseRecoveryData();
assertEquals(dagID, dagData.recoveredDagID);
assertFalse(dagData.nonRecoverable);
}
use of org.apache.tez.dag.history.events.VertexGroupCommitStartedEvent in project tez by apache.
the class TestRecoveryParser method testRecoverableNonSummary2.
@Test(timeout = 5000)
public void testRecoverableNonSummary2() throws IOException {
ApplicationId appId = ApplicationId.newInstance(System.currentTimeMillis(), 1);
TezDAGID dagID = TezDAGID.getInstance(appId, 1);
AppContext appContext = mock(AppContext.class);
when(appContext.getCurrentRecoveryDir()).thenReturn(new Path(recoveryPath + "/1"));
when(appContext.getClock()).thenReturn(new SystemClock());
when(mockDAGImpl.getID()).thenReturn(dagID);
// MockRecoveryService will skip the non-summary event
MockRecoveryService rService = new MockRecoveryService(appContext);
Configuration conf = new Configuration();
conf.setBoolean(RecoveryService.TEZ_TEST_RECOVERY_DRAIN_EVENTS_WHEN_STOPPED, true);
rService.init(conf);
rService.start();
DAGPlan dagPlan = TestDAGImpl.createTestDAGPlan();
// write a DAGSubmittedEvent first to initialize summaryStream
rService.handle(new DAGHistoryEvent(dagID, new DAGSubmittedEvent(dagID, 1L, dagPlan, ApplicationAttemptId.newInstance(appId, 1), null, "user", new Configuration(), null, null)));
// It should be fine to skip other events, just for testing.
TezVertexID vertexId = TezVertexID.getInstance(dagID, 0);
rService.handle(new DAGHistoryEvent(dagID, new VertexGroupCommitStartedEvent(dagID, "group_1", Lists.newArrayList(TezVertexID.getInstance(dagID, 0), TezVertexID.getInstance(dagID, 1)), 0L)));
rService.handle(new DAGHistoryEvent(dagID, new VertexGroupCommitFinishedEvent(dagID, "group_1", Lists.newArrayList(TezVertexID.getInstance(dagID, 0), TezVertexID.getInstance(dagID, 1)), 0L)));
rService.stop();
DAGRecoveryData dagData = parser.parseRecoveryData();
assertTrue(dagData.nonRecoverable);
assertTrue(dagData.reason.contains("Vertex has been committed as member of vertex group" + ", but its full recovery events are not seen"));
}
Aggregations