Search in sources :

Example 31 with TaskAttempt

use of org.apache.tez.dag.app.dag.TaskAttempt in project tez by apache.

the class TaskImpl method canCommit.

/**
 * Decides whether the given attempt may commit the task output.
 *
 * <p>The whole decision runs under {@code writeLock}. At most one attempt id is
 * recorded in {@code commitAttempt}; once it is set, only that same attempt
 * receives {@code true} on later calls.
 *
 * @param taskAttemptID id of the attempt asking for permission to commit
 * @return {@code true} if this attempt is (or has just become) the committer,
 *         {@code false} otherwise
 * @throws TezUncheckedException if no committer is recorded yet and
 *         {@code getAttempt} returns {@code null} for {@code taskAttemptID}
 */
@Override
public boolean canCommit(TezTaskAttemptID taskAttemptID) {
    writeLock.lock();
    try {
        if (LOG.isDebugEnabled()) {
            LOG.debug("Commit go/no-go request from " + taskAttemptID);
        }

        final TaskState currentState = getState();

        if (TaskState.SCHEDULED == currentState) {
            // The attempt has already run and wants to commit while this task is still
            // in the scheduled state, which indicates a backlog in event processing.
            // Answering false makes the attempt come back to us later.
            LOG.info("Event processing delay. " + "Attempt committing before state machine transitioned to running : Task {}", taskId);
            return false;
        }

        if (TaskState.RUNNING != currentState) {
            // Any state other than RUNNING: deny the commit and ask for the attempt to be killed.
            LOG.info("Task not running. Issuing kill to bad commit attempt " + taskAttemptID);
            eventHandler.handle(new TaskAttemptEventKillRequest(taskAttemptID, "Task not running. Bad attempt.", TaskAttemptTerminationCause.TERMINATED_ORPHANED));
            return false;
        }

        if (commitAttempt != null) {
            // A committer has already been chosen; only that attempt gets a go.
            final boolean isCurrentCommitter = commitAttempt.equals(taskAttemptID);
            if (LOG.isDebugEnabled()) {
                if (isCurrentCommitter) {
                    LOG.debug(taskAttemptID + " already given a go for committing the task output.");
                } else {
                    LOG.debug(commitAttempt + " is current committer. Commit waiting for:  " + taskAttemptID);
                }
            }
            return isCurrentCommitter;
        }

        // No committer yet: consider the requesting attempt.
        final TaskAttempt attempt = getAttempt(taskAttemptID);
        if (attempt == null) {
            throw new TezUncheckedException("Unknown task for commit: " + taskAttemptID);
        }

        // A non-locked state snapshot is fine here since changes of state are handled
        // in the task attempt; taking the attempt's lock here could deadlock.
        final TaskAttemptState attemptState = attempt.getStateNoLock();
        if (attemptState != TaskAttemptState.RUNNING) {
            LOG.info(taskAttemptID + " with state: " + attemptState + " given a no-go for commit because its not running.");
            return false;
        }

        commitAttempt = taskAttemptID;
        LOG.info(taskAttemptID + " given a go for committing the task output.");
        return true;
    } finally {
        writeLock.unlock();
    }
}
Also used : TezUncheckedException(org.apache.tez.dag.api.TezUncheckedException) TaskAttemptState(org.apache.tez.dag.api.oldrecords.TaskAttemptState) TaskAttempt(org.apache.tez.dag.app.dag.TaskAttempt) TaskState(org.apache.tez.dag.api.oldrecords.TaskState) TaskAttemptEventKillRequest(org.apache.tez.dag.app.dag.event.TaskAttemptEventKillRequest)

Example 32 with TaskAttempt

use of org.apache.tez.dag.app.dag.TaskAttempt in project tez by apache.

the class TaskImpl method getCounters.

/**
 * Builds a new {@link TezCounters} holding this object's own counters plus the
 * counters of the attempt returned by {@code selectBestAttempt()}, when there is one.
 *
 * @return a freshly allocated counters object; the {@code counters} field itself
 *         is not returned
 */
@Override
public TezCounters getCounters() {
    final TezCounters aggregate = new TezCounters();
    aggregate.incrAllCounters(this.counters);

    // Attempt selection and the read of its counters happen under the read lock.
    readLock.lock();
    try {
        final TaskAttempt best = selectBestAttempt();
        if (best == null) {
            return aggregate;
        }
        aggregate.incrAllCounters(best.getCounters());
        return aggregate;
    } finally {
        readLock.unlock();
    }
}
Also used : TaskAttempt(org.apache.tez.dag.app.dag.TaskAttempt) TezCounters(org.apache.tez.common.counters.TezCounters)

Example 33 with TaskAttempt

use of org.apache.tez.dag.app.dag.TaskAttempt in project tez by apache.

the class TaskSchedulerManager method handleTaLaunchRequest.

/**
 * Asks the scheduler selected by {@code event.getSchedulerId()} to allocate a
 * container for the attempt carried by the event.
 *
 * <p>Placement is chosen from the event's location hint:
 * <ul>
 *   <li>no hint: allocate with {@code null} hosts and racks;</li>
 *   <li>hint without a task affinity: allocate with the hint's hosts and racks
 *       (each {@code null} when the hint has none);</li>
 *   <li>hint with a task affinity whose task has a successful attempt: allocate
 *       against that attempt's assigned container id and return;</li>
 *   <li>hint with a task affinity but no successful attempt: log and allocate
 *       with {@code null} hosts and racks.</li>
 * </ul>
 *
 * <p>Any exception thrown by the scheduler is logged and reported as a
 * {@code TASK_SCHEDULER_SERVICE_FATAL_ERROR} event instead of being rethrown.
 *
 * @param event the launch request to process
 */
private void handleTaLaunchRequest(AMSchedulerEventTALaunchRequest event) {
    final TaskAttempt taskAttempt = event.getTaskAttempt();
    final TaskLocationHint locationHint = event.getLocationHint();
    String[] hosts = null;
    String[] racks = null;

    if (locationHint != null) {
        final TaskBasedLocationAffinity taskAffinity = locationHint.getAffinitizedTask();
        if (taskAffinity == null) {
            // Plain host/rack hint: copy whichever collections are present into arrays.
            if (locationHint.getHosts() != null) {
                hosts = locationHint.getHosts().toArray(new String[locationHint.getHosts().size()]);
            }
            if (locationHint.getRacks() != null) {
                racks = locationHint.getRacks().toArray(new String[locationHint.getRacks().size()]);
            }
        } else {
            // Task-based affinity: resolve the referenced vertex and task, validating both.
            final Vertex vertex = appContext.getCurrentDAG().getVertex(taskAffinity.getVertexName());
            Preconditions.checkNotNull(vertex, "Invalid vertex in task based affinity " + taskAffinity + " for attempt: " + taskAttempt.getID());
            final int taskIndex = taskAffinity.getTaskIndex();
            final boolean indexInRange = taskIndex >= 0 && taskIndex < vertex.getTotalTasks();
            Preconditions.checkState(indexInRange, "Invalid taskIndex in task based affinity " + taskAffinity + " for attempt: " + taskAttempt.getID());

            final TaskAttempt affinityAttempt = vertex.getTask(taskIndex).getSuccessfulAttempt();
            if (affinityAttempt == null) {
                LOG.info("No attempt for task affinity to " + taskAffinity + " for attempt " + taskAttempt.getID() + " Ignoring.");
                // Continue below with null hosts/racks.
            } else {
                Preconditions.checkNotNull(affinityAttempt.getAssignedContainerID(), affinityAttempt.getID());
                try {
                    taskSchedulers[event.getSchedulerId()].allocateTask(
                        taskAttempt,
                        event.getCapability(),
                        affinityAttempt.getAssignedContainerID(),
                        Priority.newInstance(event.getPriority()),
                        event.getContainerContext(),
                        event);
                } catch (Exception e) {
                    String msg = "Error in TaskScheduler for handling Task Allocation"
                        + ", eventType=" + event.getType()
                        + ", scheduler=" + Utils.getTaskSchedulerIdentifierString(event.getSchedulerId(), appContext)
                        + ", taskAttemptId=" + taskAttempt.getID();
                    LOG.error(msg, e);
                    sendEvent(new DAGAppMasterEventUserServiceFatalError(DAGAppMasterEventType.TASK_SCHEDULER_SERVICE_FATAL_ERROR, msg, e));
                }
                // The container-affinity request was issued (or its failure reported); done.
                return;
            }
        }
    }

    // Host/rack based request; both arrays may be null.
    try {
        taskSchedulers[event.getSchedulerId()].allocateTask(
            taskAttempt,
            event.getCapability(),
            hosts,
            racks,
            Priority.newInstance(event.getPriority()),
            event.getContainerContext(),
            event);
    } catch (Exception e) {
        String msg = "Error in TaskScheduler for handling Task Allocation"
            + ", eventType=" + event.getType()
            + ", scheduler=" + Utils.getTaskSchedulerIdentifierString(event.getSchedulerId(), appContext)
            + ", taskAttemptId=" + taskAttempt.getID();
        LOG.error(msg, e);
        sendEvent(new DAGAppMasterEventUserServiceFatalError(DAGAppMasterEventType.TASK_SCHEDULER_SERVICE_FATAL_ERROR, msg, e));
    }
}
Also used : TaskLocationHint(org.apache.tez.dag.api.TaskLocationHint) DAGAppMasterEventUserServiceFatalError(org.apache.tez.dag.app.dag.event.DAGAppMasterEventUserServiceFatalError) TaskBasedLocationAffinity(org.apache.tez.dag.api.TaskLocationHint.TaskBasedLocationAffinity) Vertex(org.apache.tez.dag.app.dag.Vertex) TaskAttempt(org.apache.tez.dag.app.dag.TaskAttempt) TaskLocationHint(org.apache.tez.dag.api.TaskLocationHint) TezUncheckedException(org.apache.tez.dag.api.TezUncheckedException) TezException(org.apache.tez.dag.api.TezException)

Example 34 with TaskAttempt

use of org.apache.tez.dag.app.dag.TaskAttempt in project tez by apache.

the class TaskSchedulerManager method handleTAUnsuccessfulEnd.

/**
 * Handles the end of an attempt reported through {@code event}: de-allocates it
 * from the scheduler selected by {@code event.getSchedulerId()} and, when the
 * attempt has an assigned container id, asks that container to stop and sends
 * a task-attempt-ended event for the container's node.
 *
 * <p>If the scheduler throws during de-allocation, the error is logged, a
 * {@code TASK_SCHEDULER_SERVICE_FATAL_ERROR} event is sent, and nothing else is done.
 *
 * @param event the attempt-ended event to process
 */
private void handleTAUnsuccessfulEnd(AMSchedulerEventTAEnded event) {
    final TaskAttempt attempt = event.getAttempt();

    // Pass the end reason and diagnostics along when telling the scheduler about the de-allocation.
    final boolean wasContainerAllocated;
    try {
        wasContainerAllocated = taskSchedulers[event.getSchedulerId()].deallocateTask(
            attempt,
            false,
            event.getTaskAttemptEndReason(),
            event.getDiagnostics());
    } catch (Exception e) {
        String msg = "Error in TaskScheduler for handling Task De-allocation"
            + ", eventType=" + event.getType()
            + ", scheduler=" + Utils.getTaskSchedulerIdentifierString(event.getSchedulerId(), appContext)
            + ", taskAttemptId=" + attempt.getID();
        LOG.error(msg, e);
        sendEvent(new DAGAppMasterEventUserServiceFatalError(DAGAppMasterEventType.TASK_SCHEDULER_SERVICE_FATAL_ERROR, msg, e));
        return;
    }

    // Use the container id stored on the attempt, in case the scheduler has already
    // dropped this assignment because the task was deallocated earlier.
    final ContainerId attemptContainerId = attempt.getAssignedContainerID();

    if (!wasContainerAllocated) {
        LOG.info("Task: " + attempt.getID() + " has no container assignment in the scheduler");
        if (attemptContainerId != null) {
            // Scheduler and attempt disagree about the assignment.
            LOG.error("No container allocated to task: " + attempt.getID() + " according to scheduler. Task reported container id: " + attemptContainerId);
        }
    }

    if (attemptContainerId == null) {
        return;
    }

    // TODO either ways send the necessary events
    // Ask the container to stop.
    sendEvent(new AMContainerEventStopRequest(attemptContainerId));

    // Inform the node that the task has asked to be stopped / has already stopped.
    // The last argument is true only when the event's state is FAILED.
    // AMNodeImpl blacklisting logic does not account for KILLED attempts.
    sendEvent(new AMNodeEventTaskAttemptEnded(
        appContext.getAllContainers().get(attemptContainerId).getContainer().getNodeId(),
        event.getSchedulerId(),
        attemptContainerId,
        attempt.getID(),
        event.getState() == TaskAttemptState.FAILED));
}
Also used : DAGAppMasterEventUserServiceFatalError(org.apache.tez.dag.app.dag.event.DAGAppMasterEventUserServiceFatalError) ContainerId(org.apache.hadoop.yarn.api.records.ContainerId) AMContainerEventStopRequest(org.apache.tez.dag.app.rm.container.AMContainerEventStopRequest) TaskAttempt(org.apache.tez.dag.app.dag.TaskAttempt) TezUncheckedException(org.apache.tez.dag.api.TezUncheckedException) TezException(org.apache.tez.dag.api.TezException) AMNodeEventTaskAttemptEnded(org.apache.tez.dag.app.rm.node.AMNodeEventTaskAttemptEnded)

Example 35 with TaskAttempt

use of org.apache.tez.dag.app.dag.TaskAttempt in project tez by apache.

the class TaskSchedulerManager method taskAllocated.

// TaskSchedulerAppCallback methods with schedulerId, where relevant
/**
 * Callback invoked when a container has been allocated for a task.
 *
 * <p>Registers the container if it is new (also marking its node as seen and
 * sending a container-allocated node event), requests a launch when the
 * container's state is {@code ALLOCATED}, and finally sends the event that
 * assigns the task attempt to the container.
 *
 * @param schedulerId id of the scheduler that performed the allocation
 * @param task the allocated task; asserted to equal the attempt carried by {@code appCookie}
 * @param appCookie the originating request; cast to {@code AMSchedulerEventTALaunchRequest}
 * @param container the allocated container
 */
public synchronized void taskAllocated(int schedulerId, Object task, Object appCookie, Container container) {
    final AMSchedulerEventTALaunchRequest launchRequest = (AMSchedulerEventTALaunchRequest) appCookie;
    final ContainerId containerId = container.getId();

    final boolean newlyRegistered = appContext.getAllContainers().addContainerIfNew(
        container, schedulerId, launchRequest.getLauncherId(), launchRequest.getTaskCommId());
    if (newlyRegistered) {
        appContext.getNodeTracker().nodeSeen(container.getNodeId(), schedulerId);
        sendEvent(new AMNodeEventContainerAllocated(container.getNodeId(), schedulerId, container.getId()));
    }

    final TaskAttempt taskAttempt = launchRequest.getTaskAttempt();
    // The task handed to this upcall is expected to be the attempt from the request.
    assert task.equals(taskAttempt);

    final boolean stillAllocated =
        appContext.getAllContainers().get(containerId).getState() == AMContainerState.ALLOCATED;
    if (stillAllocated) {
        sendEvent(new AMContainerEventLaunchRequest(
            containerId,
            taskAttempt.getVertexID(),
            launchRequest.getContainerContext(),
            launchRequest.getLauncherId(),
            launchRequest.getTaskCommId()));
    }

    sendEvent(new AMContainerEventAssignTA(
        containerId,
        taskAttempt.getID(),
        launchRequest.getRemoteTaskSpec(),
        launchRequest.getContainerContext().getLocalResources(),
        launchRequest.getContainerContext().getCredentials(),
        launchRequest.getPriority()));
}
Also used : AMNodeEventContainerAllocated(org.apache.tez.dag.app.rm.node.AMNodeEventContainerAllocated) AMContainerEventAssignTA(org.apache.tez.dag.app.rm.container.AMContainerEventAssignTA) ContainerId(org.apache.hadoop.yarn.api.records.ContainerId) TaskAttempt(org.apache.tez.dag.app.dag.TaskAttempt) AMContainerEventLaunchRequest(org.apache.tez.dag.app.rm.container.AMContainerEventLaunchRequest)

Aggregations

TaskAttempt (org.apache.tez.dag.app.dag.TaskAttempt)39 TezTaskAttemptID (org.apache.tez.dag.records.TezTaskAttemptID)20 TezVertexID (org.apache.tez.dag.records.TezVertexID)20 Test (org.junit.Test)18 TezConfiguration (org.apache.tez.dag.api.TezConfiguration)13 Configuration (org.apache.hadoop.conf.Configuration)12 LocalResource (org.apache.hadoop.yarn.api.records.LocalResource)12 Priority (org.apache.hadoop.yarn.api.records.Priority)12 Resource (org.apache.hadoop.yarn.api.records.Resource)12 TezDAGID (org.apache.tez.dag.records.TezDAGID)12 AtomicBoolean (java.util.concurrent.atomic.AtomicBoolean)11 Container (org.apache.hadoop.yarn.api.records.Container)11 YarnConfiguration (org.apache.hadoop.yarn.conf.YarnConfiguration)11 AppContext (org.apache.tez.dag.app.AppContext)11 ClusterInfo (org.apache.tez.dag.app.ClusterInfo)11 ContainerHeartbeatHandler (org.apache.tez.dag.app.ContainerHeartbeatHandler)11 TaskCommunicatorManagerInterface (org.apache.tez.dag.app.TaskCommunicatorManagerInterface)11 AMRMClientAsyncForTest (org.apache.tez.dag.app.rm.TestTaskSchedulerHelpers.AMRMClientAsyncForTest)11 AMRMClientForTest (org.apache.tez.dag.app.rm.TestTaskSchedulerHelpers.AMRMClientForTest)11 CapturingEventHandler (org.apache.tez.dag.app.rm.TestTaskSchedulerHelpers.CapturingEventHandler)11