Search in sources :

Example 1 with TaskAttemptState

Use of org.apache.tez.dag.api.oldrecords.TaskAttemptState in the Apache Tez project.

From the class TaskImpl, method canCommit.

/**
 * Answers a commit go/no-go request from a task attempt.
 *
 * <p>The whole decision is made while holding {@code writeLock}. At most one attempt is
 * recorded in {@code commitAttempt}; once set, only that attempt receives a go.
 *
 * @param taskAttemptID the attempt asking for permission to commit the task output
 * @return {@code true} if this attempt is (or has just become) the recorded committer;
 *         {@code false} if the task is still SCHEDULED, is not RUNNING, the attempt itself
 *         is not RUNNING, or another attempt already holds the commit
 * @throws TezUncheckedException if no committer is recorded yet and {@code getAttempt}
 *         returns {@code null} for the given id
 */
@Override
public boolean canCommit(TezTaskAttemptID taskAttemptID) {
    writeLock.lock();
    try {
        if (LOG.isDebugEnabled()) {
            LOG.debug("Commit go/no-go request from " + taskAttemptID);
        }

        final TaskState currentState = getState();

        // The attempt already ran and wants to commit, yet the task is still SCHEDULED:
        // event processing is lagging behind. Answer no-go so the attempt asks again
        // once the backlog has cleared.
        if (currentState == TaskState.SCHEDULED) {
            LOG.info("Event processing delay. " + "Attempt committing before state machine transitioned to running : Task {}", taskId);
            return false;
        }

        // SCHEDULED was handled above, so any state other than RUNNING means this
        // commit request is bad: ask for the attempt to be killed and refuse.
        if (currentState != TaskState.RUNNING) {
            LOG.info("Task not running. Issuing kill to bad commit attempt " + taskAttemptID);
            eventHandler.handle(new TaskAttemptEventKillRequest(taskAttemptID, "Task not running. Bad attempt.", TaskAttemptTerminationCause.TERMINATED_ORPHANED));
            return false;
        }

        // A committer has already been chosen: only that same attempt gets a go,
        // every other attempt is told to wait.
        if (commitAttempt != null) {
            final boolean isCommitter = commitAttempt.equals(taskAttemptID);
            if (LOG.isDebugEnabled()) {
                LOG.debug(isCommitter
                    ? taskAttemptID + " already given a go for committing the task output."
                    : commitAttempt + " is current committer. Commit waiting for:  " + taskAttemptID);
            }
            return isCommitter;
        }

        // No committer yet: this attempt may take the role if it is known and running.
        final TaskAttempt candidate = getAttempt(taskAttemptID);
        if (candidate == null) {
            throw new TezUncheckedException("Unknown task for commit: " + taskAttemptID);
        }

        // An unlocked snapshot of the attempt state is sufficient here, because state
        // changes are handled in the task attempt; taking its lock could deadlock.
        final TaskAttemptState candidateState = candidate.getStateNoLock();
        if (candidateState != TaskAttemptState.RUNNING) {
            LOG.info(taskAttemptID + " with state: " + candidateState + " given a no-go for commit because its not running.");
            return false;
        }

        commitAttempt = taskAttemptID;
        LOG.info(taskAttemptID + " given a go for committing the task output.");
        return true;
    } finally {
        writeLock.unlock();
    }
}
Also used : TezUncheckedException(org.apache.tez.dag.api.TezUncheckedException) TaskAttemptState(org.apache.tez.dag.api.oldrecords.TaskAttemptState) TaskAttempt(org.apache.tez.dag.app.dag.TaskAttempt) TaskState(org.apache.tez.dag.api.oldrecords.TaskState) TaskAttemptEventKillRequest(org.apache.tez.dag.app.dag.event.TaskAttemptEventKillRequest)

Example 2 with TaskAttemptState

Use of org.apache.tez.dag.api.oldrecords.TaskAttemptState in the Apache Tez project.

From the class TestHistoryEventTimelineConversion, method testConvertTaskAttemptFinishedEvent.

/**
 * Checks the timeline conversion of a {@code TaskAttemptFinishedEvent}.
 *
 * <p>The event is built with a randomly chosen attempt state and termination cause, then
 * converted with {@code HistoryEventTimelineConversion.convertToTimelineEntities}. The test
 * asserts that exactly one entity is produced and verifies its id, type, primary filters,
 * single timeline event and other-info entries. A second event built with a {@code null}
 * failure type must produce an entity without a TASK_FAILURE_TYPE entry.
 */
@SuppressWarnings("unchecked")
@Test(timeout = 5000)
public void testConvertTaskAttemptFinishedEvent() {
    // ---- Fixture values -------------------------------------------------------------
    String vertexName = "testVertex";
    long creationTime = random.nextLong();
    long startTime = creationTime + 1000;
    long allocationTime = creationTime + 1001;
    long finishTime = startTime + 1002;

    TaskAttemptState[] allStates = TaskAttemptState.values();
    TaskAttemptState attemptState = allStates[random.nextInt(TaskAttemptState.values().length)];
    TaskAttemptTerminationCause[] allCauses = TaskAttemptTerminationCause.values();
    TaskAttemptTerminationCause terminationCause = allCauses[random.nextInt(TaskAttemptTerminationCause.values().length)];

    String diagnostics = "random diagnostics message";
    TezCounters counters = new TezCounters();
    long lastDataEventTime = finishTime - 1;

    // Two identical data-event dependencies so the converted list size can be checked.
    List<DataEventDependencyInfo> dataEvents = Lists.newArrayList();
    dataEvents.add(new DataEventDependencyInfo(lastDataEventTime, tezTaskAttemptID));
    dataEvents.add(new DataEventDependencyInfo(lastDataEventTime, tezTaskAttemptID));

    // ---- Conversion with a FATAL failure type ---------------------------------------
    TaskAttemptFinishedEvent finishedEvent = new TaskAttemptFinishedEvent(
        tezTaskAttemptID, vertexName, startTime, finishTime, attemptState, TaskFailureType.FATAL,
        terminationCause, diagnostics, counters, dataEvents, null, creationTime, tezTaskAttemptID,
        allocationTime, containerId, nodeId, "inProgressURL", "logsURL", "nodeHttpAddress");
    List<TimelineEntity> converted = HistoryEventTimelineConversion.convertToTimelineEntities(finishedEvent);
    Assert.assertEquals(1, converted.size());

    TimelineEntity entity = converted.get(0);
    Assert.assertEquals(tezTaskAttemptID.toString(), entity.getEntityId());
    Assert.assertEquals(EntityTypes.TEZ_TASK_ATTEMPT_ID.name(), entity.getEntityType());

    // ---- Primary filters ------------------------------------------------------------
    final Map<String, Set<Object>> primaryFilters = entity.getPrimaryFilters();
    Assert.assertEquals(5, primaryFilters.size());
    Assert.assertTrue(primaryFilters.get(ATSConstants.APPLICATION_ID).contains(applicationId.toString()));
    Assert.assertTrue(primaryFilters.get(EntityTypes.TEZ_DAG_ID.name()).contains(tezDAGID.toString()));
    Assert.assertTrue(primaryFilters.get(EntityTypes.TEZ_VERTEX_ID.name()).contains(tezVertexID.toString()));
    Assert.assertTrue(primaryFilters.get(EntityTypes.TEZ_TASK_ID.name()).contains(tezTaskID.toString()));
    Assert.assertTrue(primaryFilters.get(ATSConstants.STATUS).contains(attemptState.toString()));

    // ---- The single timeline event --------------------------------------------------
    Assert.assertEquals(1, entity.getEvents().size());
    TimelineEvent timelineEvent = entity.getEvents().get(0);
    Assert.assertEquals(HistoryEventType.TASK_ATTEMPT_FINISHED.name(), timelineEvent.getEventType());
    Assert.assertEquals(finishTime, timelineEvent.getTimestamp());

    // ---- Other info -----------------------------------------------------------------
    final Map<String, Object> otherInfo = entity.getOtherInfo();
    Assert.assertEquals(17, otherInfo.size());
    Assert.assertEquals(tezTaskAttemptID.toString(), otherInfo.get(ATSConstants.CREATION_CAUSAL_ATTEMPT));
    Assert.assertEquals(creationTime, otherInfo.get(ATSConstants.CREATION_TIME));
    Assert.assertEquals(allocationTime, otherInfo.get(ATSConstants.ALLOCATION_TIME));
    Assert.assertEquals(startTime, otherInfo.get(ATSConstants.START_TIME));
    Assert.assertEquals(finishTime, otherInfo.get(ATSConstants.FINISH_TIME));
    Assert.assertEquals(finishTime - startTime, otherInfo.get(ATSConstants.TIME_TAKEN));
    Assert.assertEquals(attemptState.name(), otherInfo.get(ATSConstants.STATUS));
    Assert.assertEquals(TaskFailureType.FATAL.name(), otherInfo.get(ATSConstants.TASK_FAILURE_TYPE));
    Assert.assertEquals(terminationCause.name(), otherInfo.get(ATSConstants.TASK_ATTEMPT_ERROR_ENUM));
    Assert.assertEquals(diagnostics, otherInfo.get(ATSConstants.DIAGNOSTICS));

    // LAST_DATA_EVENTS is a map holding, under the same key, the list of per-event maps.
    Map<String, Object> lastDataEventsInfo = (Map<String, Object>) otherInfo.get(ATSConstants.LAST_DATA_EVENTS);
    List<Object> lastDataEventList = (List<Object>) lastDataEventsInfo.get(ATSConstants.LAST_DATA_EVENTS);
    Assert.assertEquals(2, lastDataEventList.size());
    Map<String, Object> firstDataEvent = (Map<String, Object>) lastDataEventList.get(0);
    Assert.assertEquals(dataEvents.get(0).getTimestamp(), firstDataEvent.get(ATSConstants.TIMESTAMP));

    Assert.assertTrue(otherInfo.containsKey(ATSConstants.COUNTERS));
    Assert.assertEquals("inProgressURL", otherInfo.get(ATSConstants.IN_PROGRESS_LOGS_URL));
    Assert.assertEquals("logsURL", otherInfo.get(ATSConstants.COMPLETED_LOGS_URL));
    Assert.assertEquals(nodeId.toString(), otherInfo.get(ATSConstants.NODE_ID));
    Assert.assertEquals(containerId.toString(), otherInfo.get(ATSConstants.CONTAINER_ID));
    Assert.assertEquals("nodeHttpAddress", otherInfo.get(ATSConstants.NODE_HTTP_ADDRESS));

    // ---- Conversion with a null failure type ----------------------------------------
    TaskAttemptFinishedEvent eventWithNullFailureType = new TaskAttemptFinishedEvent(
        tezTaskAttemptID, vertexName, startTime, finishTime, attemptState, null,
        terminationCause, diagnostics, counters, dataEvents, null, creationTime, tezTaskAttemptID,
        allocationTime, containerId, nodeId, "inProgressURL", "logsURL", "nodeHttpAddress");
    List<TimelineEntity> convertedWithNullFailureType = HistoryEventTimelineConversion.convertToTimelineEntities(eventWithNullFailureType);
    Assert.assertEquals(1, convertedWithNullFailureType.size());
    TimelineEntity entityWithNullFailureType = convertedWithNullFailureType.get(0);
    Assert.assertNull(entityWithNullFailureType.getOtherInfo().get(ATSConstants.TASK_FAILURE_TYPE));
}
Also used : TimelineEvent(org.apache.hadoop.yarn.api.records.timeline.TimelineEvent) Set(java.util.Set) TimelineEntity(org.apache.hadoop.yarn.api.records.timeline.TimelineEntity) TezCounters(org.apache.tez.common.counters.TezCounters) DataEventDependencyInfo(org.apache.tez.dag.app.dag.impl.TaskAttemptImpl.DataEventDependencyInfo) TaskAttemptState(org.apache.tez.dag.api.oldrecords.TaskAttemptState) List(java.util.List) TaskAttemptTerminationCause(org.apache.tez.dag.records.TaskAttemptTerminationCause) TaskAttemptFinishedEvent(org.apache.tez.dag.history.events.TaskAttemptFinishedEvent) Map(java.util.Map) HashMap(java.util.HashMap) Test(org.junit.Test)

Aggregations

TaskAttemptState (org.apache.tez.dag.api.oldrecords.TaskAttemptState)2 HashMap (java.util.HashMap)1 List (java.util.List)1 Map (java.util.Map)1 Set (java.util.Set)1 TimelineEntity (org.apache.hadoop.yarn.api.records.timeline.TimelineEntity)1 TimelineEvent (org.apache.hadoop.yarn.api.records.timeline.TimelineEvent)1 TezCounters (org.apache.tez.common.counters.TezCounters)1 TezUncheckedException (org.apache.tez.dag.api.TezUncheckedException)1 TaskState (org.apache.tez.dag.api.oldrecords.TaskState)1 TaskAttempt (org.apache.tez.dag.app.dag.TaskAttempt)1 TaskAttemptEventKillRequest (org.apache.tez.dag.app.dag.event.TaskAttemptEventKillRequest)1 DataEventDependencyInfo (org.apache.tez.dag.app.dag.impl.TaskAttemptImpl.DataEventDependencyInfo)1 TaskAttemptFinishedEvent (org.apache.tez.dag.history.events.TaskAttemptFinishedEvent)1 TaskAttemptTerminationCause (org.apache.tez.dag.records.TaskAttemptTerminationCause)1 Test (org.junit.Test)1