Example 1 with TaskCluster

Use of org.apache.hyracks.control.cc.job.TaskCluster in project asterixdb by apache.

From class JobExecutor, method abortTaskCluster:

private void abortTaskCluster(TaskClusterAttempt tcAttempt, TaskClusterAttempt.TaskClusterStatus failedOrAbortedStatus) {
    LOGGER.fine("Aborting task cluster: " + tcAttempt.getAttempt());
    Set<TaskAttemptId> abortTaskIds = new HashSet<>();
    Map<String, List<TaskAttemptId>> abortTaskAttemptMap = new HashMap<>();
    for (TaskAttempt ta : tcAttempt.getTaskAttempts().values()) {
        TaskAttemptId taId = ta.getTaskAttemptId();
        TaskAttempt.TaskStatus status = ta.getStatus();
        abortTaskIds.add(taId);
        LOGGER.fine("Checking " + taId + ": " + ta.getStatus());
        if (status == TaskAttempt.TaskStatus.RUNNING || status == TaskAttempt.TaskStatus.COMPLETED) {
            ta.setStatus(TaskAttempt.TaskStatus.ABORTED, null);
            ta.setEndTime(System.currentTimeMillis());
            // Only RUNNING attempts need a remote abort; COMPLETED ones are simply re-marked.
            if (status == TaskAttempt.TaskStatus.RUNNING) {
                abortTaskAttemptMap.computeIfAbsent(ta.getNodeId(), k -> new ArrayList<>()).add(taId);
            }
        }
    }
    final JobId jobId = jobRun.getJobId();
    LOGGER.fine("Abort map for job: " + jobId + ": " + abortTaskAttemptMap);
    INodeManager nodeManager = ccs.getNodeManager();
    for (Map.Entry<String, List<TaskAttemptId>> entry : abortTaskAttemptMap.entrySet()) {
        final NodeControllerState node = nodeManager.getNodeControllerState(entry.getKey());
        final List<TaskAttemptId> abortTaskAttempts = entry.getValue();
        if (node != null) {
            if (LOGGER.isLoggable(Level.FINE)) {
                LOGGER.fine("Aborting: " + abortTaskAttempts + " at " + entry.getKey());
            }
            try {
                node.getNodeController().abortTasks(jobId, abortTaskAttempts);
            } catch (Exception e) {
                LOGGER.log(Level.SEVERE, e.getMessage(), e);
            }
        }
    }
    TaskCluster tc = tcAttempt.getTaskCluster();
    inProgressTaskClusters.remove(tc);
    PartitionMatchMaker pmm = jobRun.getPartitionMatchMaker();
    pmm.removeUncommittedPartitions(tc.getProducedPartitions(), abortTaskIds);
    pmm.removePartitionRequests(tc.getRequiredPartitions(), abortTaskIds);
    tcAttempt.setStatus(failedOrAbortedStatus);
    tcAttempt.setEndTime(System.currentTimeMillis());
}
Also used: INodeManager (org.apache.hyracks.control.cc.cluster.INodeManager), HashMap (java.util.HashMap), Map (java.util.Map), TaskAttemptId (org.apache.hyracks.api.dataflow.TaskAttemptId), PartitionMatchMaker (org.apache.hyracks.control.cc.partitions.PartitionMatchMaker), HyracksException (org.apache.hyracks.api.exceptions.HyracksException), TaskCluster (org.apache.hyracks.control.cc.job.TaskCluster), ArrayList (java.util.ArrayList), List (java.util.List), TaskAttempt (org.apache.hyracks.control.cc.job.TaskAttempt), NodeControllerState (org.apache.hyracks.control.cc.NodeControllerState), JobId (org.apache.hyracks.api.job.JobId), HashSet (java.util.HashSet)
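
The heart of the method is the grouping loop: running attempts are collected per node so that a single abortTasks RPC can be issued to each node controller instead of one call per task. Below is a minimal, self-contained sketch of that pattern, assuming a hypothetical AttemptId record and a println in place of the real RPC; it shows the shape of the idea, not AsterixDB code.

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class AbortGroupingSketch {

    // Hypothetical stand-in for TaskAttemptId: the node running the task plus an attempt number.
    record AttemptId(String nodeId, int attempt) {}

    public static void main(String[] args) {
        List<AttemptId> running = List.of(
                new AttemptId("nc1", 0), new AttemptId("nc2", 0), new AttemptId("nc1", 1));

        // Group attempt ids by node; computeIfAbsent creates each per-node list on first use.
        Map<String, List<AttemptId>> abortMap = new HashMap<>();
        for (AttemptId id : running) {
            abortMap.computeIfAbsent(id.nodeId(), k -> new ArrayList<>()).add(id);
        }

        // One "RPC" per node, mirroring node.getNodeController().abortTasks(jobId, ids).
        abortMap.forEach((node, ids) -> System.out.println("abortTasks(" + node + ", " + ids + ")"));
    }
}

The computeIfAbsent call is also why the grouping branch above needs no explicit null check: the map entry is created the first time a node appears.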

Example 2 with TaskCluster

Use of org.apache.hyracks.control.cc.job.TaskCluster in project asterixdb by apache.

From class JobExecutor, method notifyTaskFailure:

/**
 * Indicates that a single task attempt has encountered a failure.
 *
 * @param ta the failed task attempt
 * @param exceptions the exceptions thrown during the failure
 */
public void notifyTaskFailure(TaskAttempt ta, List<Exception> exceptions) {
    try {
        LOGGER.fine("Received failure notification for TaskAttempt " + ta.getTaskAttemptId());
        TaskAttemptId taId = ta.getTaskAttemptId();
        TaskCluster tc = ta.getTask().getTaskCluster();
        TaskClusterAttempt lastAttempt = findLastTaskClusterAttempt(tc);
        if (lastAttempt != null && taId.getAttempt() == lastAttempt.getAttempt()) {
            LOGGER.fine("Marking TaskAttempt " + ta.getTaskAttemptId() + " as failed");
            ta.setStatus(TaskAttempt.TaskStatus.FAILED, exceptions);
            abortTaskCluster(lastAttempt, TaskClusterAttempt.TaskClusterStatus.FAILED);
            abortDoomedTaskClusters();
            if (lastAttempt.getAttempt() >= jobRun.getActivityClusterGraph().getMaxReattempts() || isCancelled()) {
                abortJob(exceptions);
                return;
            }
            startRunnableActivityClusters();
        } else {
            LOGGER.warning("Ignoring task failure notification: " + taId + " -- Current last attempt = " + lastAttempt);
        }
    } catch (Exception e) {
        abortJob(Collections.singletonList(e));
    }
}
Also used: TaskClusterAttempt (org.apache.hyracks.control.cc.job.TaskClusterAttempt), TaskAttemptId (org.apache.hyracks.api.dataflow.TaskAttemptId), TaskCluster (org.apache.hyracks.control.cc.job.TaskCluster), HyracksException (org.apache.hyracks.api.exceptions.HyracksException)
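
What makes this method robust is the attempt-number guard: a failure notification is acted on only if it refers to the latest TaskClusterAttempt; notifications for superseded attempts arrive late and are logged and dropped. A minimal sketch of that stale-notification filter, using a plain int in place of the real attempt objects (hypothetical names, not the Hyracks API):

public class StaleNotificationSketch {

    // The latest attempt number the coordinator knows about.
    static int lastAttempt = 1;

    static void notifyFailure(int notifiedAttempt) {
        if (notifiedAttempt == lastAttempt) {
            // Current attempt: mark it failed and abort its task cluster.
            System.out.println("Marking attempt " + notifiedAttempt + " as FAILED");
        } else {
            // Older attempt: the failure was already handled by a retry; ignore it.
            System.out.println("Ignoring stale failure notification for attempt " + notifiedAttempt);
        }
    }

    public static void main(String[] args) {
        notifyFailure(0); // stale: attempt 0 was superseded by attempt 1
        notifyFailure(1); // current: triggers the failure path
    }
}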

Example 3 with TaskCluster

Use of org.apache.hyracks.control.cc.job.TaskCluster in project asterixdb by apache.

From class JobExecutor, method startRunnableTaskClusters:

private void startRunnableTaskClusters(Set<TaskCluster> tcRoots) throws HyracksException {
    Map<TaskCluster, Runnability> runnabilityMap = new HashMap<>();
    for (TaskCluster tc : tcRoots) {
        assignRunnabilityRank(tc, runnabilityMap);
    }
    PriorityQueue<RankedRunnableTaskCluster> queue = new PriorityQueue<>();
    for (Map.Entry<TaskCluster, Runnability> e : runnabilityMap.entrySet()) {
        TaskCluster tc = e.getKey();
        Runnability runnability = e.getValue();
        if (runnability.getTag() != Runnability.Tag.RUNNABLE) {
            continue;
        }
        int priority = runnability.getPriority();
        if (priority >= 0 && priority < Integer.MAX_VALUE) {
            queue.add(new RankedRunnableTaskCluster(priority, tc));
        }
    }
    if (LOGGER.isLoggable(Level.FINE)) {
        LOGGER.fine("Ranked TCs: " + queue);
    }
    Map<String, List<TaskAttemptDescriptor>> taskAttemptMap = new HashMap<>();
    for (RankedRunnableTaskCluster rrtc : queue) {
        TaskCluster tc = rrtc.getTaskCluster();
        if (LOGGER.isLoggable(Level.FINE)) {
            LOGGER.fine("Found runnable TC: " + tc);
            List<TaskClusterAttempt> attempts = tc.getAttempts();
            LOGGER.fine("Attempts so far:" + attempts.size());
            for (TaskClusterAttempt tcAttempt : attempts) {
                LOGGER.fine("Status: " + tcAttempt.getStatus());
            }
        }
        assignTaskLocations(tc, taskAttemptMap);
    }
    if (taskAttemptMap.isEmpty()) {
        return;
    }
    startTasks(taskAttemptMap);
}
Also used: HashMap (java.util.HashMap), Map (java.util.Map), TaskClusterAttempt (org.apache.hyracks.control.cc.job.TaskClusterAttempt), PriorityQueue (java.util.PriorityQueue), Constraint (org.apache.hyracks.api.constraints.Constraint), TaskCluster (org.apache.hyracks.control.cc.job.TaskCluster), ArrayList (java.util.ArrayList), List (java.util.List)
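
The ranking step builds a PriorityQueue of wrapper objects so that the most runnable task cluster (lowest priority value) is considered first. The sketch below reproduces that pattern with a hypothetical Ranked record in place of RankedRunnableTaskCluster:

import java.util.PriorityQueue;

public class RankedQueueSketch {

    // Hypothetical wrapper pairing a rank with a payload; lower rank is more runnable.
    record Ranked(int rank, String name) implements Comparable<Ranked> {
        @Override
        public int compareTo(Ranked other) {
            return Integer.compare(rank, other.rank);
        }
    }

    public static void main(String[] args) {
        PriorityQueue<Ranked> queue = new PriorityQueue<>();
        queue.add(new Ranked(2, "TC2"));
        queue.add(new Ranked(0, "TC0"));
        queue.add(new Ranked(1, "TC1"));
        // poll() drains in rank order: TC0, TC1, TC2.
        while (!queue.isEmpty()) {
            System.out.println("start " + queue.poll().name());
        }
    }
}

One caveat worth noting: java.util.PriorityQueue's iterator traverses the backing heap in no particular order, so the for-each over the queue in startRunnableTaskClusters does not visit clusters strictly by rank; only poll() guarantees priority order.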

Example 4 with TaskCluster

Use of org.apache.hyracks.control.cc.job.TaskCluster in project asterixdb by apache.

From class JobExecutor, method findDoomedTaskClusters:

private boolean findDoomedTaskClusters(TaskCluster tc, Set<TaskCluster> doomedTaskClusters) {
    if (doomedTaskClusters.contains(tc)) {
        return true;
    }
    TaskClusterAttempt lastAttempt = findLastTaskClusterAttempt(tc);
    if (lastAttempt != null) {
        switch(lastAttempt.getStatus()) {
            case ABORTED:
            case FAILED:
            case COMPLETED:
                return false;
            default:
                break;
        }
    }
    Map<ConnectorDescriptorId, IConnectorPolicy> connectorPolicyMap = jobRun.getConnectorPolicyMap();
    PartitionMatchMaker pmm = jobRun.getPartitionMatchMaker();
    boolean doomed = false;
    for (TaskCluster depTC : tc.getDependencyTaskClusters()) {
        if (findDoomedTaskClusters(depTC, doomedTaskClusters)) {
            doomed = true;
        }
    }
    for (PartitionId pid : tc.getRequiredPartitions()) {
        ConnectorDescriptorId cdId = pid.getConnectorDescriptorId();
        IConnectorPolicy cPolicy = connectorPolicyMap.get(cdId);
        PartitionState maxState = pmm.getMaximumAvailableState(pid);
        if ((maxState == null
                || (cPolicy.consumerWaitsForProducerToFinish() && maxState != PartitionState.COMMITTED))
                && findDoomedTaskClusters(partitionProducingTaskClusterMap.get(pid), doomedTaskClusters)) {
            doomed = true;
        }
    }
    if (doomed) {
        doomedTaskClusters.add(tc);
    }
    return doomed;
}
Also used: TaskClusterAttempt (org.apache.hyracks.control.cc.job.TaskClusterAttempt), IConnectorPolicy (org.apache.hyracks.api.dataflow.connectors.IConnectorPolicy), ConnectorDescriptorId (org.apache.hyracks.api.dataflow.ConnectorDescriptorId), TaskCluster (org.apache.hyracks.control.cc.job.TaskCluster), PartitionState (org.apache.hyracks.control.common.job.PartitionState), PartitionMatchMaker (org.apache.hyracks.control.cc.partitions.PartitionMatchMaker), PartitionId (org.apache.hyracks.api.partitions.PartitionId)
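
The doom check is a recursive walk over the task-cluster dependency graph: a cluster is doomed if it is not already finished and any dependency (or required partition producer) is doomed, and the doomedTaskClusters set doubles as a memo so shared dependencies are only visited once. A minimal sketch of that propagation over a tiny hypothetical DAG (stand-in names, not Hyracks types):

import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

public class DoomedPropagationSketch {

    // C depends on B, B depends on A; A's producer has failed.
    static Map<String, List<String>> deps = Map.of(
            "C", List.of("B"), "B", List.of("A"), "A", List.of());
    static Set<String> failed = Set.of("A");

    static boolean findDoomed(String node, Set<String> doomed) {
        if (doomed.contains(node)) {
            return true; // memoized: already known to be doomed
        }
        boolean isDoomed = failed.contains(node);
        for (String dep : deps.get(node)) {
            if (findDoomed(dep, doomed)) {
                isDoomed = true; // a doomed dependency dooms this node too
            }
        }
        if (isDoomed) {
            doomed.add(node);
        }
        return isDoomed;
    }

    public static void main(String[] args) {
        Set<String> doomed = new HashSet<>();
        findDoomed("C", doomed);
        System.out.println(doomed); // contains A, B, and C
    }
}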

Example 5 with TaskCluster

Use of org.apache.hyracks.control.cc.job.TaskCluster in project asterixdb by apache.

From class JobExecutor, method abortOngoingTaskClusters:

/**
 * Aborts ongoing task clusters.
 *
 * @param taskFilter selects tasks that should be directly marked as failed without the aborting RPC
 * @param exceptionGenerator generates an exception for each task that is directly marked as failed
 */
private void abortOngoingTaskClusters(ITaskFilter taskFilter, IExceptionGenerator exceptionGenerator) throws HyracksException {
    for (ActivityCluster ac : jobRun.getActivityClusterGraph().getActivityClusterMap().values()) {
        if (!isPlanned(ac)) {
            continue;
        }
        TaskCluster[] taskClusters = getActivityClusterPlan(ac).getTaskClusters();
        if (taskClusters == null) {
            continue;
        }
        for (TaskCluster tc : taskClusters) {
            TaskClusterAttempt lastTaskClusterAttempt = findLastTaskClusterAttempt(tc);
            if (lastTaskClusterAttempt == null
                    || !(lastTaskClusterAttempt.getStatus() == TaskClusterAttempt.TaskClusterStatus.COMPLETED
                            || lastTaskClusterAttempt.getStatus() == TaskClusterAttempt.TaskClusterStatus.RUNNING)) {
                continue;
            }
            boolean abort = false;
            for (TaskAttempt ta : lastTaskClusterAttempt.getTaskAttempts().values()) {
                assert ta.getStatus() == TaskAttempt.TaskStatus.COMPLETED || ta.getStatus() == TaskAttempt.TaskStatus.RUNNING;
                if (taskFilter.directlyMarkAsFailed(ta)) {
                    // Directly mark it as failed, without issuing an abort RPC.
                    ta.setStatus(TaskAttempt.TaskStatus.FAILED, Collections.singletonList(exceptionGenerator.getException(ta)));
                    ta.setEndTime(System.currentTimeMillis());
                    abort = true;
                }
            }
            if (abort) {
                abortTaskCluster(lastTaskClusterAttempt, TaskClusterAttempt.TaskClusterStatus.ABORTED);
            }
        }
        abortDoomedTaskClusters();
    }
}
Also used: TaskClusterAttempt (org.apache.hyracks.control.cc.job.TaskClusterAttempt), TaskCluster (org.apache.hyracks.control.cc.job.TaskCluster), TaskAttempt (org.apache.hyracks.control.cc.job.TaskAttempt), ActivityCluster (org.apache.hyracks.api.job.ActivityCluster)
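
The ITaskFilter / IExceptionGenerator pair lets callers describe which ongoing attempts should be failed in place and what exception to attach, without duplicating the abort loop. A sketch of that pairing using the standard java.util.function interfaces (the real interfaces are Hyracks-specific; the names below are hypothetical):

import java.util.function.Function;
import java.util.function.Predicate;

public class FilterGeneratorSketch {

    // Hypothetical stand-in for a task attempt: an id plus the node it runs on.
    record Attempt(String id, String nodeId) {}

    static void abortOngoing(Predicate<Attempt> filter, Function<Attempt, Exception> generator,
            Iterable<Attempt> attempts) {
        for (Attempt ta : attempts) {
            if (filter.test(ta)) {
                // Mark as failed directly, attaching the generated exception.
                System.out.println(ta.id() + " FAILED: " + generator.apply(ta).getMessage());
            }
        }
    }

    public static void main(String[] args) {
        var attempts = java.util.List.of(new Attempt("TA0", "nc1"), new Attempt("TA1", "nc2"));
        // Fail everything on a "dead" node, mimicking a node-failure abort.
        abortOngoing(ta -> "nc2".equals(ta.nodeId()),
                ta -> new Exception("node " + ta.nodeId() + " is dead"), attempts);
    }
}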

Aggregations

TaskCluster (org.apache.hyracks.control.cc.job.TaskCluster): 14 usages
TaskClusterAttempt (org.apache.hyracks.control.cc.job.TaskClusterAttempt): 7 usages
ArrayList (java.util.ArrayList): 5 usages
ActivityId (org.apache.hyracks.api.dataflow.ActivityId): 5 usages
HashSet (java.util.HashSet): 4 usages
List (java.util.List): 4 usages
ActivityPlan (org.apache.hyracks.control.cc.job.ActivityPlan): 4 usages
Task (org.apache.hyracks.control.cc.job.Task): 4 usages
TaskAttempt (org.apache.hyracks.control.cc.job.TaskAttempt): 4 usages
HashMap (java.util.HashMap): 3 usages
Map (java.util.Map): 3 usages
ConnectorDescriptorId (org.apache.hyracks.api.dataflow.ConnectorDescriptorId): 3 usages
TaskAttemptId (org.apache.hyracks.api.dataflow.TaskAttemptId): 3 usages
TaskId (org.apache.hyracks.api.dataflow.TaskId): 3 usages
ActivityCluster (org.apache.hyracks.api.job.ActivityCluster): 3 usages
JobRun (org.apache.hyracks.control.cc.job.JobRun): 3 usages
IConnectorPolicy (org.apache.hyracks.api.dataflow.connectors.IConnectorPolicy): 2 usages
HyracksException (org.apache.hyracks.api.exceptions.HyracksException): 2 usages
PartitionId (org.apache.hyracks.api.partitions.PartitionId): 2 usages
ActivityClusterPlan (org.apache.hyracks.control.cc.job.ActivityClusterPlan): 2 usages