Search in sources :

Example 1 with VertexGroupCommitStartedEvent

use of org.apache.tez.dag.history.events.VertexGroupCommitStartedEvent in project tez by apache.

the class DAGImpl method vertexSucceeded.

private boolean vertexSucceeded(Vertex vertex) {
    numSuccessfulVertices++;
    boolean recoveryFailed = false;
    if (!commitAllOutputsOnSuccess && isCommittable()) {
        // committing successful outputs immediately. check for shared outputs
        List<VertexGroupInfo> groupsList = vertexGroupInfo.get(vertex.getName());
        if (groupsList != null) {
            List<VertexGroupInfo> commitList = Lists.newArrayListWithCapacity(groupsList.size());
            for (VertexGroupInfo groupInfo : groupsList) {
                groupInfo.successfulMembers++;
                if (groupInfo.groupMembers.size() == groupInfo.successfulMembers && !groupInfo.outputs.isEmpty()) {
                    // group has outputs and all vertex members are done
                    LOG.info("All members of group: " + groupInfo.groupName + " are succeeded. Commiting outputs");
                    commitList.add(groupInfo);
                }
            }
            for (VertexGroupInfo groupInfo : commitList) {
                if (recoveryData != null && recoveryData.isVertexGroupCommitted(groupInfo.groupName)) {
                    LOG.info("VertexGroup was already committed as per recovery" + " data, groupName=" + groupInfo.groupName);
                    for (String vertexName : groupInfo.groupMembers) {
                        VertexRecoveryData vertexRecoveryData = recoveryData.getVertexRecoveryData(getVertex(vertexName).getVertexId());
                        Preconditions.checkArgument(vertexRecoveryData != null, "Vertex Group has been committed" + ", but no VertexRecoveryData found for its vertex " + vertexName);
                        VertexFinishedEvent vertexFinishedEvent = vertexRecoveryData.getVertexFinishedEvent();
                        Preconditions.checkArgument(vertexFinishedEvent != null, "Vertex Group has been committed" + ", but no VertexFinishedEvent found in its vertex " + vertexName);
                        Preconditions.checkArgument(vertexFinishedEvent.getState() == VertexState.SUCCEEDED, "Vertex Group has been committed, but unexpected vertex state of its vertex " + vertexName + ", vertexstate=" + vertexFinishedEvent.getState());
                    }
                    continue;
                }
                groupInfo.commitStarted = true;
                final Vertex v = getVertex(groupInfo.groupMembers.iterator().next());
                try {
                    Collection<TezVertexID> vertexIds = getVertexIds(groupInfo.groupMembers);
                    appContext.getHistoryHandler().handleCriticalEvent(new DAGHistoryEvent(getID(), new VertexGroupCommitStartedEvent(dagId, groupInfo.groupName, vertexIds, clock.getTime())));
                } catch (IOException e) {
                    LOG.error("Failed to send commit recovery event to handler", e);
                    recoveryFailed = true;
                }
                if (!recoveryFailed) {
                    for (final String outputName : groupInfo.outputs) {
                        OutputKey outputKey = new OutputKey(outputName, groupInfo.groupName, true);
                        CommitCallback groupCommitCallback = new CommitCallback(outputKey);
                        CallableEvent groupCommitCallableEvent = new CallableEvent(groupCommitCallback) {

                            public Void call() throws Exception {
                                OutputCommitter committer = v.getOutputCommitters().get(outputName);
                                LOG.info("Committing output: " + outputName);
                                commitOutput(committer);
                                return null;
                            }
                        };
                        ListenableFuture<Void> groupCommitFuture = appContext.getExecService().submit(groupCommitCallableEvent);
                        Futures.addCallback(groupCommitFuture, groupCommitCallableEvent.getCallback());
                        commitFutures.put(outputKey, groupCommitFuture);
                    }
                }
            }
        }
    }
    if (recoveryFailed) {
        LOG.info("Recovery failure occurred during commit");
        enactKill(DAGTerminationCause.RECOVERY_FAILURE, VertexTerminationCause.COMMIT_FAILURE);
    }
    return !recoveryFailed;
}
Also used : VertexEventRecoverVertex(org.apache.tez.dag.app.dag.event.VertexEventRecoverVertex) Vertex(org.apache.tez.dag.app.dag.Vertex) OutputCommitter(org.apache.tez.runtime.api.OutputCommitter) VertexGroupCommitStartedEvent(org.apache.tez.dag.history.events.VertexGroupCommitStartedEvent) DAGHistoryEvent(org.apache.tez.dag.history.DAGHistoryEvent) IOException(java.io.IOException) CallableEvent(org.apache.tez.dag.app.dag.event.CallableEvent) PlanVertexGroupInfo(org.apache.tez.dag.api.records.DAGProtos.PlanVertexGroupInfo) VertexRecoveryData(org.apache.tez.dag.app.RecoveryParser.VertexRecoveryData) VertexFinishedEvent(org.apache.tez.dag.history.events.VertexFinishedEvent) TezVertexID(org.apache.tez.dag.records.TezVertexID)

Example 2 with VertexGroupCommitStartedEvent

use of org.apache.tez.dag.history.events.VertexGroupCommitStartedEvent in project tez by apache.

the class TestRecoveryParser method testRecoverableSummary_VertexGroupInCommitting.

@Test(timeout = 5000)
public void testRecoverableSummary_VertexGroupInCommitting() throws IOException {
    ApplicationId appId = ApplicationId.newInstance(System.currentTimeMillis(), 1);
    TezDAGID dagID = TezDAGID.getInstance(appId, 1);
    AppContext appContext = mock(AppContext.class);
    when(appContext.getCurrentRecoveryDir()).thenReturn(new Path(recoveryPath + "/1"));
    when(appContext.getClock()).thenReturn(new SystemClock());
    when(mockDAGImpl.getID()).thenReturn(dagID);
    when(appContext.getHadoopShim()).thenReturn(new DefaultHadoopShim());
    when(appContext.getApplicationID()).thenReturn(appId);
    RecoveryService rService = new RecoveryService(appContext);
    Configuration conf = new Configuration();
    conf.setBoolean(RecoveryService.TEZ_TEST_RECOVERY_DRAIN_EVENTS_WHEN_STOPPED, true);
    rService.init(conf);
    rService.start();
    DAGPlan dagPlan = TestDAGImpl.createTestDAGPlan();
    // write a DAGSubmittedEvent first to initialize summaryStream
    rService.handle(new DAGHistoryEvent(dagID, new DAGSubmittedEvent(dagID, 1L, dagPlan, ApplicationAttemptId.newInstance(appId, 1), null, "user", new Configuration(), null, null)));
    // It should be fine to skip other events, just for testing.
    rService.handle(new DAGHistoryEvent(dagID, new VertexGroupCommitStartedEvent(dagID, "group_1", Lists.newArrayList(TezVertexID.getInstance(dagID, 0), TezVertexID.getInstance(dagID, 1)), 0L)));
    rService.stop();
    DAGRecoveryData dagData = parser.parseRecoveryData();
    assertEquals(dagID, dagData.recoveredDagID);
    assertTrue(dagData.nonRecoverable);
    assertTrue(dagData.reason.contains("Vertex Group Commit was in progress"));
}
Also used : Path(org.apache.hadoop.fs.Path) RecoveryService(org.apache.tez.dag.history.recovery.RecoveryService) SystemClock(org.apache.hadoop.yarn.util.SystemClock) Configuration(org.apache.hadoop.conf.Configuration) VertexGroupCommitStartedEvent(org.apache.tez.dag.history.events.VertexGroupCommitStartedEvent) DAGHistoryEvent(org.apache.tez.dag.history.DAGHistoryEvent) DefaultHadoopShim(org.apache.tez.hadoop.shim.DefaultHadoopShim) DAGPlan(org.apache.tez.dag.api.records.DAGProtos.DAGPlan) TezDAGID(org.apache.tez.dag.records.TezDAGID) ApplicationId(org.apache.hadoop.yarn.api.records.ApplicationId) DAGRecoveryData(org.apache.tez.dag.app.RecoveryParser.DAGRecoveryData) DAGSubmittedEvent(org.apache.tez.dag.history.events.DAGSubmittedEvent)

Example 3 with VertexGroupCommitStartedEvent

use of org.apache.tez.dag.history.events.VertexGroupCommitStartedEvent in project tez by apache.

the class TestHistoryEventTimelineConversion method testHandlerExists.

@Test(timeout = 5000)
public void testHandlerExists() throws JSONException {
    for (HistoryEventType eventType : HistoryEventType.values()) {
        HistoryEvent event = null;
        switch(eventType) {
            case APP_LAUNCHED:
                event = new AppLaunchedEvent(applicationId, random.nextInt(), random.nextInt(), user, new Configuration(false), null);
                break;
            case AM_LAUNCHED:
                event = new AMLaunchedEvent(applicationAttemptId, random.nextInt(), random.nextInt(), user);
                break;
            case AM_STARTED:
                event = new AMStartedEvent(applicationAttemptId, random.nextInt(), user);
                break;
            case DAG_SUBMITTED:
                event = new DAGSubmittedEvent(tezDAGID, random.nextInt(), dagPlan, applicationAttemptId, null, user, null, containerLogs, null);
                break;
            case DAG_INITIALIZED:
                event = new DAGInitializedEvent(tezDAGID, random.nextInt(), user, dagPlan.getName(), null);
                break;
            case DAG_STARTED:
                event = new DAGStartedEvent(tezDAGID, random.nextInt(), user, dagPlan.getName());
                break;
            case DAG_FINISHED:
                event = new DAGFinishedEvent(tezDAGID, random.nextInt(), random.nextInt(), DAGState.ERROR, null, null, user, dagPlan.getName(), null, applicationAttemptId, dagPlan);
                break;
            case VERTEX_INITIALIZED:
                event = new VertexInitializedEvent(tezVertexID, "v1", random.nextInt(), random.nextInt(), random.nextInt(), "proc", null, null, null);
                break;
            case VERTEX_STARTED:
                event = new VertexStartedEvent(tezVertexID, random.nextInt(), random.nextInt());
                break;
            case VERTEX_CONFIGURE_DONE:
                event = new VertexConfigurationDoneEvent(tezVertexID, 0L, 1, null, null, null, true);
                break;
            case VERTEX_FINISHED:
                event = new VertexFinishedEvent(tezVertexID, "v1", 1, random.nextInt(), random.nextInt(), random.nextInt(), random.nextInt(), random.nextInt(), VertexState.ERROR, null, null, null, null, null);
                break;
            case TASK_STARTED:
                event = new TaskStartedEvent(tezTaskID, "v1", random.nextInt(), random.nextInt());
                break;
            case TASK_FINISHED:
                event = new TaskFinishedEvent(tezTaskID, "v1", random.nextInt(), random.nextInt(), tezTaskAttemptID, TaskState.FAILED, null, null, 0);
                break;
            case TASK_ATTEMPT_STARTED:
                event = new TaskAttemptStartedEvent(tezTaskAttemptID, "v1", random.nextInt(), containerId, nodeId, null, null, "nodeHttpAddress");
                break;
            case TASK_ATTEMPT_FINISHED:
                event = new TaskAttemptFinishedEvent(tezTaskAttemptID, "v1", random.nextInt(), random.nextInt(), TaskAttemptState.FAILED, TaskFailureType.NON_FATAL, TaskAttemptTerminationCause.OUTPUT_LOST, null, null, null, null, 0, null, 0, containerId, nodeId, null, null, "nodeHttpAddress");
                break;
            case CONTAINER_LAUNCHED:
                event = new ContainerLaunchedEvent(containerId, random.nextInt(), applicationAttemptId);
                break;
            case CONTAINER_STOPPED:
                event = new ContainerStoppedEvent(containerId, random.nextInt(), -1, applicationAttemptId);
                break;
            case DAG_COMMIT_STARTED:
                event = new DAGCommitStartedEvent();
                break;
            case VERTEX_COMMIT_STARTED:
                event = new VertexCommitStartedEvent();
                break;
            case VERTEX_GROUP_COMMIT_STARTED:
                event = new VertexGroupCommitStartedEvent();
                break;
            case VERTEX_GROUP_COMMIT_FINISHED:
                event = new VertexGroupCommitFinishedEvent();
                break;
            case DAG_RECOVERED:
                event = new DAGRecoveredEvent(applicationAttemptId, tezDAGID, dagPlan.getName(), user, random.nextLong(), containerLogs);
                break;
            case DAG_KILL_REQUEST:
                event = new DAGKillRequestEvent();
                break;
            default:
                Assert.fail("Unhandled event type " + eventType);
        }
        if (event == null || !event.isHistoryEvent()) {
            continue;
        }
        HistoryEventTimelineConversion.convertToTimelineEntities(event);
    }
}
Also used : DAGCommitStartedEvent(org.apache.tez.dag.history.events.DAGCommitStartedEvent) Configuration(org.apache.hadoop.conf.Configuration) VertexInitializedEvent(org.apache.tez.dag.history.events.VertexInitializedEvent) HistoryEventType(org.apache.tez.dag.history.HistoryEventType) DAGInitializedEvent(org.apache.tez.dag.history.events.DAGInitializedEvent) ContainerStoppedEvent(org.apache.tez.dag.history.events.ContainerStoppedEvent) DAGKillRequestEvent(org.apache.tez.dag.history.events.DAGKillRequestEvent) DAGStartedEvent(org.apache.tez.dag.history.events.DAGStartedEvent) VertexConfigurationDoneEvent(org.apache.tez.dag.history.events.VertexConfigurationDoneEvent) DAGRecoveredEvent(org.apache.tez.dag.history.events.DAGRecoveredEvent) TaskAttemptFinishedEvent(org.apache.tez.dag.history.events.TaskAttemptFinishedEvent) AMStartedEvent(org.apache.tez.dag.history.events.AMStartedEvent) VertexStartedEvent(org.apache.tez.dag.history.events.VertexStartedEvent) VertexGroupCommitStartedEvent(org.apache.tez.dag.history.events.VertexGroupCommitStartedEvent) HistoryEvent(org.apache.tez.dag.history.HistoryEvent) TaskStartedEvent(org.apache.tez.dag.history.events.TaskStartedEvent) TaskAttemptStartedEvent(org.apache.tez.dag.history.events.TaskAttemptStartedEvent) AppLaunchedEvent(org.apache.tez.dag.history.events.AppLaunchedEvent) TaskFinishedEvent(org.apache.tez.dag.history.events.TaskFinishedEvent) VertexGroupCommitFinishedEvent(org.apache.tez.dag.history.events.VertexGroupCommitFinishedEvent) AMLaunchedEvent(org.apache.tez.dag.history.events.AMLaunchedEvent) ContainerLaunchedEvent(org.apache.tez.dag.history.events.ContainerLaunchedEvent) DAGFinishedEvent(org.apache.tez.dag.history.events.DAGFinishedEvent) VertexFinishedEvent(org.apache.tez.dag.history.events.VertexFinishedEvent) DAGSubmittedEvent(org.apache.tez.dag.history.events.DAGSubmittedEvent) VertexCommitStartedEvent(org.apache.tez.dag.history.events.VertexCommitStartedEvent) Test(org.junit.Test)

Example 4 with VertexGroupCommitStartedEvent

use of org.apache.tez.dag.history.events.VertexGroupCommitStartedEvent in project tez by apache.

the class TestRecoveryParser method testRecoverableSummary_VertexGroupFinishCommitting.

@Test(timeout = 5000)
public void testRecoverableSummary_VertexGroupFinishCommitting() throws IOException {
    ApplicationId appId = ApplicationId.newInstance(System.currentTimeMillis(), 1);
    TezDAGID dagID = TezDAGID.getInstance(appId, 1);
    AppContext appContext = mock(AppContext.class);
    when(appContext.getCurrentRecoveryDir()).thenReturn(new Path(recoveryPath + "/1"));
    when(appContext.getClock()).thenReturn(new SystemClock());
    when(mockDAGImpl.getID()).thenReturn(dagID);
    when(appContext.getHadoopShim()).thenReturn(new DefaultHadoopShim());
    when(appContext.getApplicationID()).thenReturn(appId);
    RecoveryService rService = new RecoveryService(appContext);
    Configuration conf = new Configuration();
    conf.setBoolean(RecoveryService.TEZ_TEST_RECOVERY_DRAIN_EVENTS_WHEN_STOPPED, true);
    rService.init(conf);
    rService.start();
    DAGPlan dagPlan = TestDAGImpl.createTestDAGPlan();
    // write a DAGSubmittedEvent first to initialize summaryStream
    rService.handle(new DAGHistoryEvent(dagID, new DAGSubmittedEvent(dagID, 1L, dagPlan, ApplicationAttemptId.newInstance(appId, 1), null, "user", new Configuration(), null, null)));
    // It should be fine to skip other events, just for testing.
    TezVertexID v0 = TezVertexID.getInstance(dagID, 0);
    TezVertexID v1 = TezVertexID.getInstance(dagID, 1);
    rService.handle(new DAGHistoryEvent(dagID, new VertexGroupCommitStartedEvent(dagID, "group_1", Lists.newArrayList(v0, v1), 0L)));
    rService.handle(new DAGHistoryEvent(dagID, new VertexGroupCommitFinishedEvent(dagID, "group_1", Lists.newArrayList(v0, v1), 0L)));
    // also write VertexFinishedEvent, otherwise it is still non-recoverable
    // when checking with non-summary event
    rService.handle(new DAGHistoryEvent(dagID, new VertexFinishedEvent(v0, "v1", 10, 0L, 0L, 0L, 0L, 0L, VertexState.SUCCEEDED, "", null, null, null, null)));
    rService.handle(new DAGHistoryEvent(dagID, new VertexFinishedEvent(v1, "v1", 10, 0L, 0L, 0L, 0L, 0L, VertexState.SUCCEEDED, "", null, null, null, null)));
    rService.stop();
    DAGRecoveryData dagData = parser.parseRecoveryData();
    assertEquals(dagID, dagData.recoveredDagID);
    assertFalse(dagData.nonRecoverable);
}
Also used : Path(org.apache.hadoop.fs.Path) RecoveryService(org.apache.tez.dag.history.recovery.RecoveryService) SystemClock(org.apache.hadoop.yarn.util.SystemClock) Configuration(org.apache.hadoop.conf.Configuration) VertexGroupCommitStartedEvent(org.apache.tez.dag.history.events.VertexGroupCommitStartedEvent) DAGHistoryEvent(org.apache.tez.dag.history.DAGHistoryEvent) DefaultHadoopShim(org.apache.tez.hadoop.shim.DefaultHadoopShim) DAGPlan(org.apache.tez.dag.api.records.DAGProtos.DAGPlan) VertexGroupCommitFinishedEvent(org.apache.tez.dag.history.events.VertexGroupCommitFinishedEvent) TezDAGID(org.apache.tez.dag.records.TezDAGID) ApplicationId(org.apache.hadoop.yarn.api.records.ApplicationId) DAGRecoveryData(org.apache.tez.dag.app.RecoveryParser.DAGRecoveryData) TezVertexID(org.apache.tez.dag.records.TezVertexID) VertexFinishedEvent(org.apache.tez.dag.history.events.VertexFinishedEvent) DAGSubmittedEvent(org.apache.tez.dag.history.events.DAGSubmittedEvent)

Example 5 with VertexGroupCommitStartedEvent

use of org.apache.tez.dag.history.events.VertexGroupCommitStartedEvent in project tez by apache.

the class TestRecoveryParser method testRecoverableNonSummary2.

@Test(timeout = 5000)
public void testRecoverableNonSummary2() throws IOException {
    ApplicationId appId = ApplicationId.newInstance(System.currentTimeMillis(), 1);
    TezDAGID dagID = TezDAGID.getInstance(appId, 1);
    AppContext appContext = mock(AppContext.class);
    when(appContext.getCurrentRecoveryDir()).thenReturn(new Path(recoveryPath + "/1"));
    when(appContext.getClock()).thenReturn(new SystemClock());
    when(mockDAGImpl.getID()).thenReturn(dagID);
    // MockRecoveryService will skip the non-summary event
    MockRecoveryService rService = new MockRecoveryService(appContext);
    Configuration conf = new Configuration();
    conf.setBoolean(RecoveryService.TEZ_TEST_RECOVERY_DRAIN_EVENTS_WHEN_STOPPED, true);
    rService.init(conf);
    rService.start();
    DAGPlan dagPlan = TestDAGImpl.createTestDAGPlan();
    // write a DAGSubmittedEvent first to initialize summaryStream
    rService.handle(new DAGHistoryEvent(dagID, new DAGSubmittedEvent(dagID, 1L, dagPlan, ApplicationAttemptId.newInstance(appId, 1), null, "user", new Configuration(), null, null)));
    // It should be fine to skip other events, just for testing.
    TezVertexID vertexId = TezVertexID.getInstance(dagID, 0);
    rService.handle(new DAGHistoryEvent(dagID, new VertexGroupCommitStartedEvent(dagID, "group_1", Lists.newArrayList(TezVertexID.getInstance(dagID, 0), TezVertexID.getInstance(dagID, 1)), 0L)));
    rService.handle(new DAGHistoryEvent(dagID, new VertexGroupCommitFinishedEvent(dagID, "group_1", Lists.newArrayList(TezVertexID.getInstance(dagID, 0), TezVertexID.getInstance(dagID, 1)), 0L)));
    rService.stop();
    DAGRecoveryData dagData = parser.parseRecoveryData();
    assertTrue(dagData.nonRecoverable);
    assertTrue(dagData.reason.contains("Vertex has been committed as member of vertex group" + ", but its full recovery events are not seen"));
}
Also used : Path(org.apache.hadoop.fs.Path) SystemClock(org.apache.hadoop.yarn.util.SystemClock) Configuration(org.apache.hadoop.conf.Configuration) VertexGroupCommitStartedEvent(org.apache.tez.dag.history.events.VertexGroupCommitStartedEvent) DAGHistoryEvent(org.apache.tez.dag.history.DAGHistoryEvent) DAGPlan(org.apache.tez.dag.api.records.DAGProtos.DAGPlan) VertexGroupCommitFinishedEvent(org.apache.tez.dag.history.events.VertexGroupCommitFinishedEvent) TezDAGID(org.apache.tez.dag.records.TezDAGID) ApplicationId(org.apache.hadoop.yarn.api.records.ApplicationId) DAGRecoveryData(org.apache.tez.dag.app.RecoveryParser.DAGRecoveryData) TezVertexID(org.apache.tez.dag.records.TezVertexID) DAGSubmittedEvent(org.apache.tez.dag.history.events.DAGSubmittedEvent)

Aggregations

VertexGroupCommitStartedEvent (org.apache.tez.dag.history.events.VertexGroupCommitStartedEvent)7 DAGSubmittedEvent (org.apache.tez.dag.history.events.DAGSubmittedEvent)6 Configuration (org.apache.hadoop.conf.Configuration)5 VertexFinishedEvent (org.apache.tez.dag.history.events.VertexFinishedEvent)5 VertexGroupCommitFinishedEvent (org.apache.tez.dag.history.events.VertexGroupCommitFinishedEvent)5 DAGHistoryEvent (org.apache.tez.dag.history.DAGHistoryEvent)4 Path (org.apache.hadoop.fs.Path)3 ApplicationId (org.apache.hadoop.yarn.api.records.ApplicationId)3 SystemClock (org.apache.hadoop.yarn.util.SystemClock)3 DAGPlan (org.apache.tez.dag.api.records.DAGProtos.DAGPlan)3 DAGRecoveryData (org.apache.tez.dag.app.RecoveryParser.DAGRecoveryData)3 HistoryEvent (org.apache.tez.dag.history.HistoryEvent)3 HistoryEventType (org.apache.tez.dag.history.HistoryEventType)3 AMLaunchedEvent (org.apache.tez.dag.history.events.AMLaunchedEvent)3 AMStartedEvent (org.apache.tez.dag.history.events.AMStartedEvent)3 ContainerLaunchedEvent (org.apache.tez.dag.history.events.ContainerLaunchedEvent)3 ContainerStoppedEvent (org.apache.tez.dag.history.events.ContainerStoppedEvent)3 DAGCommitStartedEvent (org.apache.tez.dag.history.events.DAGCommitStartedEvent)3 DAGFinishedEvent (org.apache.tez.dag.history.events.DAGFinishedEvent)3 DAGInitializedEvent (org.apache.tez.dag.history.events.DAGInitializedEvent)3