Search in sources :

Example 96 with TezDAGID

use of org.apache.tez.dag.records.TezDAGID in project tez by apache.

the class RecoveryService method handleRecoveryEvent.

/**
 * Appends one history event to the recovery file of the DAG it belongs to.
 *
 * <p>Events for DAGs already listed in {@code completedDAGs} are dropped. When no
 * output stream is registered yet for the DAG, a new recovery file is created and
 * its stream is registered in {@code outputStreamMap}; if a file already exists at
 * that path, {@code createFatalErrorFlagDir()} is invoked and the event is dropped.
 *
 * <p>Each record written is the event type's ordinal (as an int) followed by the
 * event's own {@code toProtoStream} output.
 *
 * @param event the event to persist
 * @throws IOException if checking for, creating, or writing to the recovery file fails
 */
@VisibleForTesting
protected void handleRecoveryEvent(DAGHistoryEvent event) throws IOException {
    HistoryEventType eventType = event.getHistoryEvent().getEventType();
    if (LOG.isDebugEnabled()) {
        LOG.debug("Handling recovery event of type " + event.getHistoryEvent().getEventType());
    }

    TezDAGID dagID = event.getDagID();

    // Nothing is recorded for a DAG that has already completed.
    if (completedDAGs.contains(dagID)) {
        if (LOG.isDebugEnabled()) {
            LOG.debug("Skipping Recovery Event as DAG completed, dagId=" + dagID
                + ", completed=" + completedDAGs.contains(dagID)
                + ", skipped=" + skippedDAGs.contains(dagID)
                + ", eventType=" + eventType);
        }
        return;
    }

    // First event seen for this DAG: open its recovery file.
    if (!outputStreamMap.containsKey(dagID)) {
        Path recoveryFile = TezCommonUtils.getDAGRecoveryPath(recoveryPath, dagID.toString());
        if (recoveryDirFS.exists(recoveryFile)) {
            // A file is already present although no stream is tracked for it;
            // raise the fatal-error flag and drop the event.
            createFatalErrorFlagDir();
            return;
        }
        if (LOG.isDebugEnabled()) {
            LOG.debug("Opening DAG recovery file in create mode, filePath=" + recoveryFile);
        }
        // NOTE(review): the 'false' argument presumably means "do not overwrite" - confirm
        // against the type of recoveryDirFS.
        outputStreamMap.put(dagID, recoveryDirFS.create(recoveryFile, false, bufferSize));
    }

    FSDataOutputStream recoveryStream = outputStreamMap.get(dagID);
    if (LOG.isDebugEnabled()) {
        LOG.debug("Writing recovery event to output stream, dagId=" + dagID + ", eventType=" + eventType);
    }

    ++unflushedEventsCount;
    recoveryStream.writeInt(event.getHistoryEvent().getEventType().ordinal());
    event.getHistoryEvent().toProtoStream(recoveryStream);

    // DAG_SUBMITTED and DAG_FINISHED do not go through the conditional flush.
    boolean skipsConditionalFlush = eventType == HistoryEventType.DAG_SUBMITTED
        || eventType == HistoryEventType.DAG_FINISHED;
    if (!skipsConditionalFlush) {
        maybeFlush(recoveryStream);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) TezDAGID(org.apache.tez.dag.records.TezDAGID) HistoryEventType(org.apache.tez.dag.history.HistoryEventType) FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream) VisibleForTesting(com.google.common.annotations.VisibleForTesting)

Example 97 with TezDAGID

use of org.apache.tez.dag.records.TezDAGID in project tez by apache.

the class RecoveryService method handle.

/**
 * Accepts a history event and either writes it to the recovery log right away or
 * queues it for later processing.
 *
 * <ul>
 *   <li>Events are dropped if the service is stopped or a fatal recovery error was
 *       recorded.</li>
 *   <li>Events arriving before the service has started are queued.</li>
 *   <li>A {@code DAG_SUBMITTED} event whose DAG name starts with the pre-warm prefix
 *       marks the DAG as skipped; events with a null DAG id or for a skipped DAG are
 *       dropped.</li>
 *   <li>{@link SummaryEvent}s are processed under {@code lock}: the summary is
 *       handled, and the event is either written and flushed immediately or queued,
 *       depending on {@code writeToRecoveryImmediately()}.</li>
 *   <li>All other events are queued.</li>
 * </ul>
 *
 * @param event the event to process
 * @throws IOException only when handling a {@code DAG_SUBMITTED} summary event fails;
 *     I/O failures for other summary events are logged and flagged via
 *     {@code createFatalErrorFlagDir()} but not rethrown
 */
public void handle(DAGHistoryEvent event) throws IOException {
    if (stopped.get()) {
        LOG.warn("Ignoring event as service stopped, eventType=" + event.getHistoryEvent().getEventType());
        return;
    }
    HistoryEventType eventType = event.getHistoryEvent().getEventType();
    if (recoveryFatalErrorOccurred.get()) {
        return;
    }
    if (!started.get()) {
        LOG.warn("Adding event of type " + eventType + " to queue as service not started");
        addToEventQueue(event);
        return;
    }
    TezDAGID dagId = event.getDagID();
    if (eventType.equals(HistoryEventType.DAG_SUBMITTED)) {
        DAGSubmittedEvent dagSubmittedEvent = (DAGSubmittedEvent) event.getHistoryEvent();
        String dagName = dagSubmittedEvent.getDAGName();
        if (dagName != null && dagName.startsWith(TezConstants.TEZ_PREWARM_DAG_NAME_PREFIX)) {
            // Skip recording pre-warm DAG events
            skippedDAGs.add(dagId);
            return;
        }
    }
    if (dagId == null || skippedDAGs.contains(dagId)) {
        if (LOG.isDebugEnabled()) {
            LOG.debug("Skipping event for DAG" + ", eventType=" + eventType + ", dagId=" + (dagId == null ? "null" : dagId.toString()) + ", isSkippedDAG=" + (dagId == null ? "null" : skippedDAGs.contains(dagId)));
        }
        return;
    }
    if (event.getHistoryEvent() instanceof SummaryEvent) {
        synchronized (lock) {
            // Re-check under the lock: the service may have been stopped since the
            // unsynchronized check at the top of the method.
            if (stopped.get()) {
                LOG.warn("Ignoring event as service stopped, eventType=" + event.getHistoryEvent().getEventType());
                return;
            }
            try {
                SummaryEvent summaryEvent = (SummaryEvent) event.getHistoryEvent();
                handleSummaryEvent(dagId, eventType, summaryEvent);
                if (summaryEvent.writeToRecoveryImmediately()) {
                    handleRecoveryEvent(event);
                    // outputStream may already be closed and removed
                    if (outputStreamMap.containsKey(event.getDagID())) {
                        doFlush(outputStreamMap.get(event.getDagID()), appContext.getClock().getTime());
                    }
                } else {
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("Queueing Non-immediate Summary/Recovery event of type " + eventType.name());
                    }
                    addToEventQueue(event);
                }
                if (eventType.equals(HistoryEventType.DAG_FINISHED)) {
                    LOG.info("DAG completed" + ", dagId=" + event.getDagID() + ", queueSize=" + eventQueue.size());
                    completedDAGs.add(dagId);
                    if (outputStreamMap.containsKey(dagId)) {
                        try {
                            outputStreamMap.get(dagId).close();
                            outputStreamMap.remove(dagId);
                        } catch (IOException ioe) {
                            // Best effort: a failed close is not fatal, but keep the cause in
                            // the log so the failure can be diagnosed.
                            // NOTE(review): on failure the stream stays registered in
                            // outputStreamMap - confirm that is intended.
                            LOG.warn("Error when trying to flush/close recovery file for" + " dag, dagId=" + event.getDagID(), ioe);
                        }
                    }
                }
            } catch (IOException ioe) {
                LOG.error("Error handling summary event" + ", eventType=" + event.getHistoryEvent().getEventType(), ioe);
                createFatalErrorFlagDir();
                if (eventType.equals(HistoryEventType.DAG_SUBMITTED)) {
                    // Throw error to tell client that dag submission failed
                    throw ioe;
                }
            }
        }
    } else {
        // All other events just get queued
        if (LOG.isDebugEnabled()) {
            LOG.debug("Queueing Non-Summary Recovery event of type " + eventType.name());
        }
        addToEventQueue(event);
    }
}
Also used : TezDAGID(org.apache.tez.dag.records.TezDAGID) SummaryEvent(org.apache.tez.dag.history.SummaryEvent) HistoryEventType(org.apache.tez.dag.history.HistoryEventType) IOException(java.io.IOException) DAGSubmittedEvent(org.apache.tez.dag.history.events.DAGSubmittedEvent)

Example 98 with TezDAGID

use of org.apache.tez.dag.records.TezDAGID in project tez by apache.

the class HistoryEventHandler method shouldLogTaskAttemptEvents.

/**
 * Decides whether a history event should be logged now.
 *
 * <p>When the DAG log level is {@code TASK_ATTEMPT} and task-attempt filters are
 * configured (per DAG, falling back to the AM-wide filters), a
 * {@code TASK_ATTEMPT_STARTED} event is held back in {@code suppressedEvents} and
 * not logged. When the matching {@code TASK_ATTEMPT_FINISHED} event arrives and its
 * error is contained in the filters, the held-back start event is discarded and the
 * finish event is not logged either.
 *
 * <p>Note: if the AM is killed before the TaskAttemptFinishedEvent is received, the
 * suppressed start event is lost.
 *
 * @param event the event under consideration
 * @param dagLogLevel the history log level in effect for the event's DAG
 * @return {@code false} if the event must be suppressed, {@code true} otherwise
 */
private boolean shouldLogTaskAttemptEvents(DAGHistoryEvent event, HistoryLogLevel dagLogLevel) {
    HistoryEvent historyEvent = event.getHistoryEvent();
    HistoryEventType eventType = historyEvent.getEventType();

    boolean isAttemptStart = eventType == HistoryEventType.TASK_ATTEMPT_STARTED;
    boolean isAttemptFinish = eventType == HistoryEventType.TASK_ATTEMPT_FINISHED;

    // Filtering only applies to attempt start/finish events at TASK_ATTEMPT level.
    if (dagLogLevel != HistoryLogLevel.TASK_ATTEMPT || !(isAttemptStart || isAttemptFinish)) {
        return true;
    }

    // Prefer the DAG-specific filters; fall back to the AM-wide ones.
    TezDAGID dagId = event.getDagID();
    Set<TaskAttemptTerminationCause> activeFilters =
        (dagId == null) ? null : dagIdToTaskAttemptFilters.get(dagId);
    if (activeFilters == null) {
        activeFilters = amTaskAttemptFilters;
    }
    if (activeFilters == null) {
        // No filters configured anywhere: log everything.
        return true;
    }

    if (isAttemptStart) {
        // Hold the start event back until the attempt's finish event is seen.
        suppressedEvents.put(((TaskAttemptStartedEvent) historyEvent).getTaskAttemptID(), event);
        return false;
    }

    // Remaining case: TaskAttemptFinishedEvent.
    TaskAttemptFinishedEvent finishedEvent = (TaskAttemptFinishedEvent) historyEvent;
    if (!activeFilters.contains(finishedEvent.getTaskAttemptError())) {
        return true;
    }
    suppressedEvents.remove(finishedEvent.getTaskAttemptID());
    return false;
}
Also used : TezDAGID(org.apache.tez.dag.records.TezDAGID) TaskAttemptTerminationCause(org.apache.tez.dag.records.TaskAttemptTerminationCause) TaskAttemptFinishedEvent(org.apache.tez.dag.history.events.TaskAttemptFinishedEvent)

Example 99 with TezDAGID

use of org.apache.tez.dag.records.TezDAGID in project tez by apache.

the class TestATSV15HistoryLoggingService method testSessionDomainsDagFailed.

/**
 * Session mode where DAG-level ACL setup throws: the session domain is still set up
 * and applied once, no entity is tagged with a DAG domain, and exactly one entry is
 * present in {@code entityLog}.
 */
@Test
public void testSessionDomainsDagFailed() throws Exception {
    final String sessionDomainId = "session-id";
    final String dagDomainId = "dag-id";

    ATSV15HistoryLoggingService service = createService(-1);
    when(appContext.isSession()).thenReturn(true);

    HistoryACLPolicyManager aclManager = mock(HistoryACLPolicyManager.class);
    service.historyACLPolicyManager = aclManager;
    when(aclManager.setupSessionACLs((Configuration) any(), eq(appId)))
        .thenReturn(Collections.singletonMap(TezConfiguration.YARN_ATS_ACL_SESSION_DOMAIN_ID, sessionDomainId));

    service.start();

    // Starting the service must have set up the session domain exactly once.
    verify(aclManager, times(1)).setupSessionACLs((Configuration) any(), eq(appId));

    // Make DAG domain setup fail.
    when(aclManager.setupSessionDAGACLs((Configuration) any(), eq(appId), eq("0"), (DAGAccessControls) any()))
        .thenThrow(new IOException());

    // Push the events through and wait until the queue has drained.
    TezDAGID dagId = TezDAGID.getInstance(appId, 0);
    for (DAGHistoryEvent historyEvent : makeHistoryEvents(dagId, service)) {
        service.handle(historyEvent);
    }
    while (!service.eventQueue.isEmpty()) {
        Thread.sleep(100);
    }

    // DAG domain setup was attempted once.
    verify(aclManager, times(1)).setupSessionDAGACLs((Configuration) any(), eq(appId), eq("0"), (DAGAccessControls) any());

    // AM events are sent; DAG events are not.
    verify(aclManager, times(1)).updateTimelineEntityDomain(any(), eq(sessionDomainId));
    verify(aclManager, times(0)).updateTimelineEntityDomain(any(), eq(dagDomainId));
    assertEquals(1, entityLog.size());

    service.stop();
}
Also used : HistoryACLPolicyManager(org.apache.tez.common.security.HistoryACLPolicyManager) TezDAGID(org.apache.tez.dag.records.TezDAGID) DAGHistoryEvent(org.apache.tez.dag.history.DAGHistoryEvent) IOException(java.io.IOException) Test(org.junit.Test)

Example 100 with TezDAGID

use of org.apache.tez.dag.records.TezDAGID in project tez by apache.

the class TestATSV15HistoryLoggingService method testSessionDomains.

/**
 * Session mode with working ACL setup: the session domain and the DAG domain are
 * each set up once, one entity update carries the session domain id and five carry
 * the DAG domain id.
 */
@Test
public void testSessionDomains() throws Exception {
    final String sessionDomainId = "session-id";
    final String dagDomainId = "dag-id";

    ATSV15HistoryLoggingService service = createService(-1);
    when(appContext.isSession()).thenReturn(true);

    HistoryACLPolicyManager aclManager = mock(HistoryACLPolicyManager.class);
    service.historyACLPolicyManager = aclManager;
    when(aclManager.setupSessionACLs((Configuration) any(), eq(appId)))
        .thenReturn(Collections.singletonMap(TezConfiguration.YARN_ATS_ACL_SESSION_DOMAIN_ID, sessionDomainId));

    service.start();

    // Starting the service must have set up the session domain exactly once.
    verify(aclManager, times(1)).setupSessionACLs((Configuration) any(), eq(appId));

    // Stub DAG domain setup to succeed.
    when(aclManager.setupSessionDAGACLs((Configuration) any(), eq(appId), eq("0"), (DAGAccessControls) any()))
        .thenReturn(Collections.singletonMap(TezConfiguration.YARN_ATS_ACL_DAG_DOMAIN_ID, dagDomainId));

    // Push the events through and wait until the queue has drained.
    TezDAGID dagId = TezDAGID.getInstance(appId, 0);
    for (DAGHistoryEvent historyEvent : makeHistoryEvents(dagId, service)) {
        service.handle(historyEvent);
    }
    while (!service.eventQueue.isEmpty()) {
        Thread.sleep(100);
    }

    // The DAG domain was set up once.
    verify(aclManager, times(1)).setupSessionDAGACLs((Configuration) any(), eq(appId), eq("0"), (DAGAccessControls) any());

    // Entity updates were made with the expected domain ids.
    verify(aclManager, times(1)).updateTimelineEntityDomain(any(), eq(sessionDomainId));
    verify(aclManager, times(5)).updateTimelineEntityDomain(any(), eq(dagDomainId));

    service.stop();
}
Also used : HistoryACLPolicyManager(org.apache.tez.common.security.HistoryACLPolicyManager) TezDAGID(org.apache.tez.dag.records.TezDAGID) DAGHistoryEvent(org.apache.tez.dag.history.DAGHistoryEvent) Test(org.junit.Test)

Aggregations

TezDAGID (org.apache.tez.dag.records.TezDAGID)124 Test (org.junit.Test)75 TezVertexID (org.apache.tez.dag.records.TezVertexID)61 Configuration (org.apache.hadoop.conf.Configuration)55 ApplicationId (org.apache.hadoop.yarn.api.records.ApplicationId)53 DAGHistoryEvent (org.apache.tez.dag.history.DAGHistoryEvent)50 TezConfiguration (org.apache.tez.dag.api.TezConfiguration)43 TezTaskAttemptID (org.apache.tez.dag.records.TezTaskAttemptID)43 TezTaskID (org.apache.tez.dag.records.TezTaskID)33 SystemClock (org.apache.hadoop.yarn.util.SystemClock)32 LocalResource (org.apache.hadoop.yarn.api.records.LocalResource)31 Container (org.apache.hadoop.yarn.api.records.Container)30 Resource (org.apache.hadoop.yarn.api.records.Resource)29 TaskCommunicatorManagerInterface (org.apache.tez.dag.app.TaskCommunicatorManagerInterface)29 ClusterInfo (org.apache.tez.dag.app.ClusterInfo)28 AMContainerMap (org.apache.tez.dag.app.rm.container.AMContainerMap)28 ContainerHeartbeatHandler (org.apache.tez.dag.app.ContainerHeartbeatHandler)27 ContainerContextMatcher (org.apache.tez.dag.app.rm.container.ContainerContextMatcher)27 ApplicationAttemptId (org.apache.hadoop.yarn.api.records.ApplicationAttemptId)25 ContainerId (org.apache.hadoop.yarn.api.records.ContainerId)25