Search in sources :

Example 1 with TaskAttempt

use of org.apache.tez.dag.app.dag.TaskAttempt in project tez by apache.

the class TaskImpl method selectBestAttempt.

// Picks the attempt that has reported the most progress, skipping FAILED and
// KILLED attempts. Returns null when no attempt qualifies.
// Per the original note: always called inside the Read Lock.
private TaskAttempt selectBestAttempt() {
    TaskAttempt best = null;
    float bestProgress = 0f;
    for (TaskAttempt candidate : attempts.values()) {
        switch (candidate.getState()) {
            case FAILED:
            case KILLED:
                // terminal, unsuccessful attempts are never candidates
                continue;
            default:
                break;
        }
        final float candidateProgress = candidate.getProgress();
        final boolean improves = candidateProgress > bestProgress;
        if (improves) {
            bestProgress = candidateProgress;
        }
        // The first eligible attempt is taken unconditionally; afterwards only a
        // strictly greater progress value replaces the current choice.
        if (improves || best == null) {
            best = candidate;
        }
    }
    return best;
}
Also used : TaskAttempt(org.apache.tez.dag.app.dag.TaskAttempt)

Example 2 with TaskAttempt

use of org.apache.tez.dag.app.dag.TaskAttempt in project tez by apache.

the class TaskSchedulerManager method handleTASucceeded.

// Handles a task attempt that ended successfully: notifies the container and
// node (when the event carries a container id) and then asks the task scheduler
// identified by the event to de-allocate the attempt.
private void handleTASucceeded(AMSchedulerEventTAEnded event) {
    TaskAttempt attempt = event.getAttempt();
    ContainerId usedContainerId = event.getUsedContainerId();
    // Container / node notifications are only sent when a container id is present.
    if (null != event.getUsedContainerId()) {
        sendEvent(new AMContainerEventTASucceeded(usedContainerId, event.getAttemptID()));
        sendEvent(new AMNodeEventTaskAttemptSucceeded(
                appContext.getAllContainers().get(usedContainerId).getContainer().getNodeId(),
                event.getSchedulerId(),
                usedContainerId,
                event.getAttemptID()));
    }
    final boolean containerWasAssigned;
    try {
        containerWasAssigned = taskSchedulers[event.getSchedulerId()]
                .deallocateTask(attempt, true, null, event.getDiagnostics());
    } catch (Exception e) {
        // A failure inside the scheduler is logged and escalated as a fatal
        // user-service error; nothing further is done for this event.
        String msg = new StringBuilder("Error in TaskScheduler for handling Task De-allocation")
                .append(", eventType=")
                .append(event.getType())
                .append(", scheduler=")
                .append(Utils.getTaskSchedulerIdentifierString(event.getSchedulerId(), appContext))
                .append(", taskAttemptId=")
                .append(attempt.getID())
                .toString();
        LOG.error(msg, e);
        sendEvent(new DAGAppMasterEventUserServiceFatalError(DAGAppMasterEventType.TASK_SCHEDULER_SERVICE_FATAL_ERROR, msg, e));
        return;
    }
    if (containerWasAssigned) {
        return;
    }
    LOG.error("De-allocated successful task: " + attempt.getID() + ", but TaskScheduler reported no container assigned to task");
}
Also used : DAGAppMasterEventUserServiceFatalError(org.apache.tez.dag.app.dag.event.DAGAppMasterEventUserServiceFatalError) AMContainerEventTASucceeded(org.apache.tez.dag.app.rm.container.AMContainerEventTASucceeded) ContainerId(org.apache.hadoop.yarn.api.records.ContainerId) AMNodeEventTaskAttemptSucceeded(org.apache.tez.dag.app.rm.node.AMNodeEventTaskAttemptSucceeded) TaskAttempt(org.apache.tez.dag.app.dag.TaskAttempt) TezUncheckedException(org.apache.tez.dag.api.TezUncheckedException) TezException(org.apache.tez.dag.api.TezException)

Example 3 with TaskAttempt

use of org.apache.tez.dag.app.dag.TaskAttempt in project tez by apache.

the class LegacySpeculator method speculationValue.

/*   *************************************************************    */
// This is the code section that runs periodically and adds speculations for
// those jobs that need them.
//
// Computes a "speculation value" for one task. The method can return a few
// magic values for tasks that shouldn't speculate:
// returns ON_SCHEDULE if thresholdRuntime(taskID) says that we should not
// consider speculating this task (or, in timeout mode, if the running
// attempt has not yet exceeded taskTimeout)
// returns ALREADY_SPECULATING if more than one attempt is RUNNING/STARTING.
// This has priority.
// returns TOO_NEW if our companion task hasn't gotten any information
// (the enrolled start time of the attempt is later than 'now')
// returns PROGRESS_IS_GOOD if the task is sailing through
// returns TOO_LATE_TO_SPECULATE if a replacement attempt started now is
// estimated to finish no earlier than the running attempt
// returns NOT_RUNNING if the task is not running (it has SUCCEEDED or has no
// RUNNING/STARTING attempt)
// 
// All of these values are negative.  Any value that should be allowed to
// speculate is 0 or positive.
// (NOTE(review): the constants are declared outside this snippet - confirm.)
// Otherwise the result is estimatedEndTime - estimatedReplacementEndTime, or
// Long.MAX_VALUE when the attempt has timed out in timeout mode.
// 
// If shouldUseTimeout is true, we will use timeout to decide on
// speculation instead of the task statistics. This can be useful, for
// example for single task vertices for which there are no tasks to compare
// with
//
// task             - the task to evaluate
// now              - the current time, compared against attempt start/end estimates
// shouldUseTimeout - use taskTimeout instead of estimator statistics
private long speculationValue(Task task, long now, boolean shouldUseTimeout) {
    Map<TezTaskAttemptID, TaskAttempt> attempts = task.getAttempts();
    TezTaskID taskID = task.getTaskId();
    // Long.MIN_VALUE doubles as "threshold not looked up yet" (see the check
    // after the loop).
    long acceptableRuntime = Long.MIN_VALUE;
    long result = Long.MIN_VALUE;
    // short circuit completed tasks. no need to spend time on them
    if (task.getState() == TaskState.SUCCEEDED) {
        return NOT_RUNNING;
    }
    // Early threshold lookup, skipped for tasks recorded in mayHaveSpeculated
    // and in timeout mode.
    if (!mayHaveSpeculated.contains(taskID) && !shouldUseTimeout) {
        acceptableRuntime = estimator.thresholdRuntime(taskID);
        if (acceptableRuntime == Long.MAX_VALUE) {
            return ON_SCHEDULE;
        }
    }
    TezTaskAttemptID runningTaskAttemptID = null;
    int numberRunningAttempts = 0;
    for (TaskAttempt taskAttempt : attempts.values()) {
        // Only RUNNING and STARTING attempts are considered.
        if (taskAttempt.getState() == TaskAttemptState.RUNNING || taskAttempt.getState() == TaskAttemptState.STARTING) {
            // A second active attempt means a speculation is already in flight.
            if (++numberRunningAttempts > 1) {
                return ALREADY_SPECULATING;
            }
            runningTaskAttemptID = taskAttempt.getID();
            long taskAttemptStartTime = estimator.attemptEnrolledTime(runningTaskAttemptID);
            if (taskAttemptStartTime > now) {
                // The enrolled start time is later than 'now'. Original note
                // (truncated in this copy): "attempt status change that
                // chronicles the attempt start" - presumably that change has
                // not been processed yet; TODO confirm.
                return TOO_NEW;
            }
            if (shouldUseTimeout) {
                if ((now - taskAttemptStartTime) > taskTimeout) {
                    // If the task has timed out, then we want to schedule a speculation
                    // immediately. However we cannot return immediately since we may
                    // already have a speculation running.
                    result = Long.MAX_VALUE;
                } else {
                    // Task has not timed out so we are good
                    return ON_SCHEDULE;
                }
            } else {
                long estimatedRunTime = estimator.estimatedRuntime(runningTaskAttemptID);
                long estimatedEndTime = estimatedRunTime + taskAttemptStartTime;
                long estimatedReplacementEndTime = now + estimator.newAttemptEstimatedRuntime();
                float progress = taskAttempt.getProgress();
                TaskAttemptHistoryStatistics data = runningTaskAttemptStatistics.get(runningTaskAttemptID);
                if (data == null) {
                    // First observation of this attempt: start tracking it.
                    runningTaskAttemptStatistics.put(runningTaskAttemptID, new TaskAttemptHistoryStatistics(estimatedRunTime, progress, now));
                } else {
                    if (estimatedRunTime == data.getEstimatedRunTime() && progress == data.getProgress()) {
                        // Previous stats are the same as the current stats
                        if (data.notHeartbeatedInAWhile(now)) {
                            // Stats have stagnated for a while, simulate heart-beat.
                            // Note: the simulated update is stamped with
                            // clock.getTime(), not with 'now'.
                            statusUpdate(taskAttempt.getID(), taskAttempt.getState(), clock.getTime());
                        }
                    } else {
                        // Stats have changed - update our data structure
                        data.setEstimatedRunTime(estimatedRunTime);
                        data.setProgress(progress);
                        data.resetHeartBeatTime(now);
                    }
                }
                // The estimated end time is already in the past.
                if (estimatedEndTime < now) {
                    return PROGRESS_IS_GOOD;
                }
                // A replacement started now would not finish before the
                // running attempt is estimated to.
                if (estimatedReplacementEndTime >= estimatedEndTime) {
                    return TOO_LATE_TO_SPECULATE;
                }
                // Positive: the estimated time a replacement attempt would save.
                result = estimatedEndTime - estimatedReplacementEndTime;
            }
        }
    }
    // If we are here, there's at most one RUNNING/STARTING task attempt.
    if (numberRunningAttempts == 0) {
        return NOT_RUNNING;
    }
    // Deferred threshold lookup for the case where the early lookup above was
    // skipped (acceptableRuntime still holds its Long.MIN_VALUE sentinel).
    if ((acceptableRuntime == Long.MIN_VALUE) && !shouldUseTimeout) {
        acceptableRuntime = estimator.thresholdRuntime(taskID);
        if (acceptableRuntime == Long.MAX_VALUE) {
            return ON_SCHEDULE;
        }
    }
    return result;
}
Also used : TaskAttempt(org.apache.tez.dag.app.dag.TaskAttempt) TezTaskID(org.apache.tez.dag.records.TezTaskID) TezTaskAttemptID(org.apache.tez.dag.records.TezTaskAttemptID)

Example 4 with TaskAttempt

use of org.apache.tez.dag.app.dag.TaskAttempt in project tez by apache.

the class LegacyTaskRuntimeEstimator method updateAttempt.

/**
 * Refreshes the runtime estimate and variance estimate kept for an attempt.
 *
 * <p>After delegating to the superclass, the attempt is looked up through the
 * vertex; nothing more happens if the task or attempt cannot be found, or if
 * the attempt is not in the RUNNING state. For a RUNNING attempt with a known
 * start time earlier than {@code timestamp}, the estimate is
 * {@code elapsed / max(0.0001, progress)} and the variance estimate is
 * {@code estimate * progress / 10}; otherwise both are recorded as {@code -1}.
 *
 * @param attemptID id of the attempt being updated
 * @param state     state reported with this update (forwarded to the superclass)
 * @param timestamp time of the update, compared against the recorded start time
 */
@Override
public void updateAttempt(TezTaskAttemptID attemptID, TaskAttemptState state, long timestamp) {
    super.updateAttempt(attemptID, state, timestamp);
    Task task = vertex.getTask(attemptID.getTaskID());
    if (task == null) {
        return;
    }
    TaskAttempt taskAttempt = task.getAttempt(attemptID);
    if (taskAttempt == null) {
        return;
    }
    float progress = taskAttempt.getProgress();
    Long boxedStart = startTimes.get(attemptID);
    // Long.MIN_VALUE marks "no start time recorded"; it fails the start > 0 test below.
    long start = boxedStart == null ? Long.MIN_VALUE : boxedStart;
    // Estimates are only maintained while the attempt is RUNNING.
    if (taskAttempt.getState() == TaskAttemptState.RUNNING) {
        // See if this task is already in the registry. Both containers are
        // registered with putIfAbsent and then re-read, so a concurrent
        // registration by another thread is picked up instead of leaving the
        // local reference null (which would silently drop this update).
        AtomicLong estimateContainer = attemptRuntimeEstimates.get(taskAttempt);
        if (estimateContainer == null) {
            attemptRuntimeEstimates.putIfAbsent(taskAttempt, new AtomicLong());
            estimateContainer = attemptRuntimeEstimates.get(taskAttempt);
        }
        AtomicLong estimateVarianceContainer = attemptRuntimeEstimateVariances.get(taskAttempt);
        if (estimateVarianceContainer == null) {
            attemptRuntimeEstimateVariances.putIfAbsent(taskAttempt, new AtomicLong());
            estimateVarianceContainer = attemptRuntimeEstimateVariances.get(taskAttempt);
        }
        long estimate = -1;
        long varianceEstimate = -1;
        // Only extrapolate when a valid start time precedes this update; the
        // 0.0001 floor on progress avoids dividing by zero.
        if (start > 0 && timestamp > start) {
            estimate = (long) ((timestamp - start) / Math.max(0.0001, progress));
            varianceEstimate = (long) (estimate * progress / 10);
        }
        // Defensive: the entry could have been removed between putIfAbsent and get.
        if (estimateContainer != null) {
            estimateContainer.set(estimate);
        }
        if (estimateVarianceContainer != null) {
            estimateVarianceContainer.set(varianceEstimate);
        }
    }
}
Also used : Task(org.apache.tez.dag.app.dag.Task) AtomicLong(java.util.concurrent.atomic.AtomicLong) TaskAttempt(org.apache.tez.dag.app.dag.TaskAttempt)

Example 5 with TaskAttempt

use of org.apache.tez.dag.app.dag.TaskAttempt in project tez by apache.

the class TestDAGScheduler method testConcurrencyLimit.

// Exercises per-vertex concurrency limits on DAGSchedulerNaturalOrder:
// a limit of 0 is not effective (everything is scheduled immediately), while a
// limit of 2 buffers further attempts until earlier ones complete, releasing
// them in request order.
@Test(timeout = 5000)
public void testConcurrencyLimit() {
    MockEventHandler eventHandler = new MockEventHandler();
    DAG dag = mock(DAG.class);
    when(dag.getTotalVertices()).thenReturn(2);
    TezVertexID unlimitedVertexId = TezVertexID.fromString("vertex_1436907267600_195589_1_00");
    TezVertexID limitedVertexId = TezVertexID.fromString("vertex_1436907267600_195589_1_01");
    TezTaskID unlimitedTaskId = TezTaskID.getInstance(unlimitedVertexId, 0);
    TezTaskID limitedTaskId = TezTaskID.getInstance(limitedVertexId, 0);
    Vertex vertex = mock(Vertex.class);
    when(dag.getVertex((TezVertexID) any())).thenReturn(vertex);
    when(vertex.getDistanceFromRoot()).thenReturn(0);
    when(vertex.getVertexId()).thenReturn(unlimitedVertexId);
    DAGScheduler scheduler = new DAGSchedulerNaturalOrder(dag, eventHandler);

    // --- Phase 1: a limit of 0 is not effective ---
    scheduler.addVertexConcurrencyLimit(unlimitedVertexId, 0);
    // schedule beyond limit and it gets scheduled: each request emits one event
    for (int attemptNumber = 0; attemptNumber < 3; attemptNumber++) {
        TaskAttempt attempt = mock(TaskAttempt.class);
        when(attempt.getID()).thenReturn(TezTaskAttemptID.getInstance(unlimitedTaskId, attemptNumber));
        scheduler.scheduleTask(new DAGEventSchedulerUpdate(DAGEventSchedulerUpdate.UpdateType.TA_SCHEDULE, attempt));
        Assert.assertEquals(attemptNumber + 1, eventHandler.events.size());
    }
    eventHandler.events.clear();

    // --- Phase 2: a limit of 2 is effective ---
    List<TaskAttempt> limitedAttempts = Lists.newArrayList();
    int completed = 0;
    int requested = 0;
    int scheduled = 0;
    scheduler.addVertexConcurrencyLimit(limitedVertexId, 2);

    // The first two requests fit under the limit and are scheduled in order.
    for (int i = 0; i < 2; i++) {
        TaskAttempt attempt = mock(TaskAttempt.class);
        limitedAttempts.add(attempt);
        when(attempt.getID()).thenReturn(TezTaskAttemptID.getInstance(limitedTaskId, requested++));
        scheduler.scheduleTask(new DAGEventSchedulerUpdate(DAGEventSchedulerUpdate.UpdateType.TA_SCHEDULE, attempt));
        Assert.assertEquals(scheduled + 1, eventHandler.events.size());
        // matches order
        Assert.assertEquals(limitedAttempts.get(scheduled).getID(),
                eventHandler.events.get(scheduled).getTaskAttemptID());
        scheduled++;
    }

    // The next two requests exceed the limit and are buffered: no new events.
    for (int i = 0; i < 2; i++) {
        TaskAttempt attempt = mock(TaskAttempt.class);
        limitedAttempts.add(attempt);
        when(attempt.getID()).thenReturn(TezTaskAttemptID.getInstance(limitedTaskId, requested++));
        scheduler.scheduleTask(new DAGEventSchedulerUpdate(DAGEventSchedulerUpdate.UpdateType.TA_SCHEDULE, attempt));
        Assert.assertEquals(scheduled, eventHandler.events.size());
    }

    // Each of the next two completions releases one buffered attempt, in order.
    for (int i = 0; i < 2; i++) {
        scheduler.taskCompleted(new DAGEventSchedulerUpdate(DAGEventSchedulerUpdate.UpdateType.TA_COMPLETED, limitedAttempts.get(completed++)));
        Assert.assertEquals(scheduled + 1, eventHandler.events.size());
        // matches order
        Assert.assertEquals(limitedAttempts.get(scheduled).getID(),
                eventHandler.events.get(scheduled).getTaskAttemptID());
        scheduled++;
    }

    // Nothing is buffered any more, so a further completion schedules nothing.
    scheduler.taskCompleted(new DAGEventSchedulerUpdate(DAGEventSchedulerUpdate.UpdateType.TA_COMPLETED, limitedAttempts.get(completed++)));
    Assert.assertEquals(scheduled, eventHandler.events.size());

    // A fresh request is scheduled immediately.
    TaskAttempt lastAttempt = mock(TaskAttempt.class);
    limitedAttempts.add(lastAttempt);
    when(lastAttempt.getID()).thenReturn(TezTaskAttemptID.getInstance(limitedTaskId, requested++));
    scheduler.scheduleTask(new DAGEventSchedulerUpdate(DAGEventSchedulerUpdate.UpdateType.TA_SCHEDULE, lastAttempt));
    Assert.assertEquals(scheduled + 1, eventHandler.events.size());
    // matches order
    Assert.assertEquals(limitedAttempts.get(scheduled).getID(),
            eventHandler.events.get(scheduled).getTaskAttemptID());
    scheduled++;
}
Also used : Vertex(org.apache.tez.dag.app.dag.Vertex) DAG(org.apache.tez.dag.app.dag.DAG) TaskAttempt(org.apache.tez.dag.app.dag.TaskAttempt) DAGScheduler(org.apache.tez.dag.app.dag.DAGScheduler) TezVertexID(org.apache.tez.dag.records.TezVertexID) TezTaskID(org.apache.tez.dag.records.TezTaskID) DAGEventSchedulerUpdate(org.apache.tez.dag.app.dag.event.DAGEventSchedulerUpdate) Test(org.junit.Test)

Aggregations

TaskAttempt (org.apache.tez.dag.app.dag.TaskAttempt)39 TezTaskAttemptID (org.apache.tez.dag.records.TezTaskAttemptID)20 TezVertexID (org.apache.tez.dag.records.TezVertexID)20 Test (org.junit.Test)18 TezConfiguration (org.apache.tez.dag.api.TezConfiguration)13 Configuration (org.apache.hadoop.conf.Configuration)12 LocalResource (org.apache.hadoop.yarn.api.records.LocalResource)12 Priority (org.apache.hadoop.yarn.api.records.Priority)12 Resource (org.apache.hadoop.yarn.api.records.Resource)12 TezDAGID (org.apache.tez.dag.records.TezDAGID)12 AtomicBoolean (java.util.concurrent.atomic.AtomicBoolean)11 Container (org.apache.hadoop.yarn.api.records.Container)11 YarnConfiguration (org.apache.hadoop.yarn.conf.YarnConfiguration)11 AppContext (org.apache.tez.dag.app.AppContext)11 ClusterInfo (org.apache.tez.dag.app.ClusterInfo)11 ContainerHeartbeatHandler (org.apache.tez.dag.app.ContainerHeartbeatHandler)11 TaskCommunicatorManagerInterface (org.apache.tez.dag.app.TaskCommunicatorManagerInterface)11 AMRMClientAsyncForTest (org.apache.tez.dag.app.rm.TestTaskSchedulerHelpers.AMRMClientAsyncForTest)11 AMRMClientForTest (org.apache.tez.dag.app.rm.TestTaskSchedulerHelpers.AMRMClientForTest)11 CapturingEventHandler (org.apache.tez.dag.app.rm.TestTaskSchedulerHelpers.CapturingEventHandler)11