Example 1 with TaskAttempt

use of org.apache.hyracks.control.cc.job.TaskAttempt in project asterixdb by apache.

From the class JobExecutor, the method abortTaskCluster:

private void abortTaskCluster(TaskClusterAttempt tcAttempt, TaskClusterAttempt.TaskClusterStatus failedOrAbortedStatus) {
    LOGGER.fine("Aborting task cluster: " + tcAttempt.getAttempt());
    Set<TaskAttemptId> abortTaskIds = new HashSet<>();
    Map<String, List<TaskAttemptId>> abortTaskAttemptMap = new HashMap<>();
    for (TaskAttempt ta : tcAttempt.getTaskAttempts().values()) {
        TaskAttemptId taId = ta.getTaskAttemptId();
        TaskAttempt.TaskStatus status = ta.getStatus();
        abortTaskIds.add(taId);
        LOGGER.fine("Checking " + taId + ": " + ta.getStatus());
        // Both RUNNING and COMPLETED attempts are marked ABORTED locally,
        // but only RUNNING attempts still need an abort request sent to their node.
        if (status == TaskAttempt.TaskStatus.RUNNING || status == TaskAttempt.TaskStatus.COMPLETED) {
            ta.setStatus(TaskAttempt.TaskStatus.ABORTED, null);
            ta.setEndTime(System.currentTimeMillis());
            if (status == TaskAttempt.TaskStatus.RUNNING) {
                abortTaskAttemptMap.computeIfAbsent(ta.getNodeId(), k -> new ArrayList<>()).add(taId);
            }
        }
    }
    final JobId jobId = jobRun.getJobId();
    LOGGER.fine("Abort map for job: " + jobId + ": " + abortTaskAttemptMap);
    INodeManager nodeManager = ccs.getNodeManager();
    for (Map.Entry<String, List<TaskAttemptId>> entry : abortTaskAttemptMap.entrySet()) {
        final NodeControllerState node = nodeManager.getNodeControllerState(entry.getKey());
        final List<TaskAttemptId> abortTaskAttempts = entry.getValue();
        if (node != null) {
            if (LOGGER.isLoggable(Level.FINE)) {
                LOGGER.fine("Aborting: " + abortTaskAttempts + " at " + entry.getKey());
            }
            try {
                node.getNodeController().abortTasks(jobId, abortTaskAttempts);
            } catch (Exception e) {
                LOGGER.log(Level.SEVERE, e.getMessage(), e);
            }
        }
    }
    inProgressTaskClusters.remove(tcAttempt.getTaskCluster());
    TaskCluster tc = tcAttempt.getTaskCluster();
    PartitionMatchMaker pmm = jobRun.getPartitionMatchMaker();
    pmm.removeUncommittedPartitions(tc.getProducedPartitions(), abortTaskIds);
    pmm.removePartitionRequests(tc.getRequiredPartitions(), abortTaskIds);
    tcAttempt.setStatus(failedOrAbortedStatus);
    tcAttempt.setEndTime(System.currentTimeMillis());
}
Also used : INodeManager(org.apache.hyracks.control.cc.cluster.INodeManager) HashMap(java.util.HashMap) TaskAttemptId(org.apache.hyracks.api.dataflow.TaskAttemptId) PartitionMatchMaker(org.apache.hyracks.control.cc.partitions.PartitionMatchMaker) HyracksException(org.apache.hyracks.api.exceptions.HyracksException) TaskCluster(org.apache.hyracks.control.cc.job.TaskCluster) ArrayList(java.util.ArrayList) List(java.util.List) TaskAttempt(org.apache.hyracks.control.cc.job.TaskAttempt) NodeControllerState(org.apache.hyracks.control.cc.NodeControllerState) Map(java.util.Map) JobId(org.apache.hyracks.api.job.JobId) HashSet(java.util.HashSet)
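
The subtle point in this method is that both RUNNING and COMPLETED attempts are marked ABORTED locally (invalidating their outputs), while only RUNNING attempts are bucketed per node for the remote abort RPC. A minimal standalone sketch of that bookkeeping, using hypothetical Attempt/Status types rather than the Hyracks classes:

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class AbortGroupingSketch {
    enum Status { INITIALIZED, RUNNING, COMPLETED, ABORTED, FAILED }

    // Hypothetical stand-in for TaskAttempt: just an id, a node, and a status.
    static final class Attempt {
        final String id;
        final String nodeId;
        Status status;
        Attempt(String id, String nodeId, Status status) {
            this.id = id; this.nodeId = nodeId; this.status = status;
        }
    }

    // Mark every RUNNING or COMPLETED attempt ABORTED; collect only the
    // RUNNING ones per node, since only those need a remote abort request.
    static Map<String, List<String>> abortLocally(Iterable<Attempt> attempts) {
        Map<String, List<String>> rpcTargets = new HashMap<>();
        for (Attempt a : attempts) {
            if (a.status == Status.RUNNING || a.status == Status.COMPLETED) {
                boolean wasRunning = a.status == Status.RUNNING;
                a.status = Status.ABORTED;
                if (wasRunning) {
                    rpcTargets.computeIfAbsent(a.nodeId, k -> new ArrayList<>()).add(a.id);
                }
            }
        }
        return rpcTargets;
    }

    public static void main(String[] args) {
        List<Attempt> attempts = List.of(
                new Attempt("TAID:0:0", "nc1", Status.RUNNING),
                new Attempt("TAID:0:1", "nc2", Status.COMPLETED),
                new Attempt("TAID:1:0", "nc1", Status.RUNNING));
        // Prints {nc1=[TAID:0:0, TAID:1:0]} -- the COMPLETED attempt is
        // aborted locally but generates no RPC.
        System.out.println(abortLocally(attempts));
    }
}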

Example 2 with TaskAttempt

use of org.apache.hyracks.control.cc.job.TaskAttempt in project asterixdb by apache.

From the class JobExecutor, the method abortOngoingTaskClusters:

/**
 * Aborts ongoing task clusters.
 *
 * @param taskFilter
 *            selects tasks that should be directly marked as failed without doing the aborting RPC.
 * @param exceptionGenerator
 *            generates an exception for tasks that are directly marked as failed.
 */
private void abortOngoingTaskClusters(ITaskFilter taskFilter, IExceptionGenerator exceptionGenerator) throws HyracksException {
    for (ActivityCluster ac : jobRun.getActivityClusterGraph().getActivityClusterMap().values()) {
        if (!isPlanned(ac)) {
            continue;
        }
        TaskCluster[] taskClusters = getActivityClusterPlan(ac).getTaskClusters();
        if (taskClusters == null) {
            continue;
        }
        for (TaskCluster tc : taskClusters) {
            TaskClusterAttempt lastTaskClusterAttempt = findLastTaskClusterAttempt(tc);
            if (lastTaskClusterAttempt == null || !(lastTaskClusterAttempt.getStatus() == TaskClusterAttempt.TaskClusterStatus.COMPLETED || lastTaskClusterAttempt.getStatus() == TaskClusterAttempt.TaskClusterStatus.RUNNING)) {
                continue;
            }
            boolean abort = false;
            for (TaskAttempt ta : lastTaskClusterAttempt.getTaskAttempts().values()) {
                assert ta.getStatus() == TaskAttempt.TaskStatus.COMPLETED || ta.getStatus() == TaskAttempt.TaskStatus.RUNNING;
                if (taskFilter.directlyMarkAsFailed(ta)) {
                    // Directly mark it as failed, without issuing the abort RPC.
                    ta.setStatus(TaskAttempt.TaskStatus.FAILED, Collections.singletonList(exceptionGenerator.getException(ta)));
                    ta.setEndTime(System.currentTimeMillis());
                    abort = true;
                }
            }
            if (abort) {
                abortTaskCluster(lastTaskClusterAttempt, TaskClusterAttempt.TaskClusterStatus.ABORTED);
            }
        }
        abortDoomedTaskClusters();
    }
}
Also used : TaskClusterAttempt(org.apache.hyracks.control.cc.job.TaskClusterAttempt) TaskCluster(org.apache.hyracks.control.cc.job.TaskCluster) TaskAttempt(org.apache.hyracks.control.cc.job.TaskAttempt) ActivityCluster(org.apache.hyracks.api.job.ActivityCluster)
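
Judging from the call sites above, ITaskFilter and IExceptionGenerator behave like single-method interfaces, so callers can pass lambdas. A hedged sketch of a caller reacting to a set of dead nodes; the lambda shapes are inferred from the snippet, and the HyracksException message-string constructor used here is an assumption, not verified Hyracks API:

// Hypothetical caller fragment (e.g. reacting to node failures). Assumes
// ITaskFilter and IExceptionGenerator are functional interfaces matching the
// calls above, and that HyracksException accepts a message string; both are
// assumptions.
Set<String> deadNodes = Set.of("nc2"); // example value
abortOngoingTaskClusters(
        ta -> deadNodes.contains(ta.getNodeId()),
        ta -> new HyracksException("Node " + ta.getNodeId() + " failed"));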

Example 3 with TaskAttempt

use of org.apache.hyracks.control.cc.job.TaskAttempt in project asterixdb by apache.

From the class JobExecutor, the method findTaskLocation:

private String findTaskLocation(TaskId tid) {
    ActivityId aid = tid.getActivityId();
    ActivityCluster ac = jobRun.getActivityClusterGraph().getActivityMap().get(aid);
    Task[] tasks = getActivityClusterPlan(ac).getActivityPlanMap().get(aid).getTasks();
    List<TaskClusterAttempt> tcAttempts = tasks[tid.getPartition()].getTaskCluster().getAttempts();
    if (tcAttempts == null || tcAttempts.isEmpty()) {
        return null;
    }
    TaskClusterAttempt lastTCA = tcAttempts.get(tcAttempts.size() - 1);
    TaskAttempt ta = lastTCA.getTaskAttempts().get(tid);
    return ta == null ? null : ta.getNodeId();
}
Also used : Task(org.apache.hyracks.control.cc.job.Task) TaskClusterAttempt(org.apache.hyracks.control.cc.job.TaskClusterAttempt) ActivityId(org.apache.hyracks.api.dataflow.ActivityId) TaskAttempt(org.apache.hyracks.control.cc.job.TaskAttempt) ActivityCluster(org.apache.hyracks.api.job.ActivityCluster)
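
For context, this lookup is exactly what Example 5 below uses to wire input channels: each producer partition's task is resolved to a node, and the node to a data port. Condensed from that method:

// Condensed from assignTaskLocations (Example 5): resolve the producer's
// node, then fetch that node's data port from the node manager.
TaskId producerTaskId = new TaskId(producerAid, j);
String nodeId = findTaskLocation(producerTaskId);
partitionLocations[i][j] = nodeManager.getNodeControllerState(nodeId).getDataPort();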

Example 4 with TaskAttempt

use of org.apache.hyracks.control.cc.job.TaskAttempt in project asterixdb by apache.

From the class AbstractTaskLifecycleWork, the method runWork:

@Override
public final void runWork() {
    IJobManager jobManager = ccs.getJobManager();
    JobRun run = jobManager.get(jobId);
    if (run != null) {
        TaskId tid = taId.getTaskId();
        Map<ActivityId, ActivityCluster> activityClusterMap = run.getActivityClusterGraph().getActivityMap();
        ActivityCluster ac = activityClusterMap.get(tid.getActivityId());
        if (ac != null) {
            Map<ActivityId, ActivityPlan> taskStateMap = run.getActivityClusterPlanMap().get(ac.getId()).getActivityPlanMap();
            Task[] taskStates = taskStateMap.get(tid.getActivityId()).getTasks();
            if (taskStates != null && taskStates.length > tid.getPartition()) {
                Task ts = taskStates[tid.getPartition()];
                TaskCluster tc = ts.getTaskCluster();
                List<TaskClusterAttempt> taskClusterAttempts = tc.getAttempts();
                if (taskClusterAttempts != null && taskClusterAttempts.size() > taId.getAttempt()) {
                    TaskClusterAttempt tca = taskClusterAttempts.get(taId.getAttempt());
                    TaskAttempt ta = tca.getTaskAttempts().get(tid);
                    if (ta != null) {
                        performEvent(ta);
                    }
                }
            }
        }
    }
}
Also used : Task(org.apache.hyracks.control.cc.job.Task) TaskId(org.apache.hyracks.api.dataflow.TaskId) TaskClusterAttempt(org.apache.hyracks.control.cc.job.TaskClusterAttempt) ActivityId(org.apache.hyracks.api.dataflow.ActivityId) ActivityPlan(org.apache.hyracks.control.cc.job.ActivityPlan) IJobManager(org.apache.hyracks.control.cc.job.IJobManager) ActivityCluster(org.apache.hyracks.api.job.ActivityCluster) TaskCluster(org.apache.hyracks.control.cc.job.TaskCluster) TaskAttempt(org.apache.hyracks.control.cc.job.TaskAttempt) JobRun(org.apache.hyracks.control.cc.job.JobRun)
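
runWork only resolves the TaskAttempt; the actual reaction is delegated to performEvent, which concrete subclasses supply. A hedged sketch of what such a subclass could look like; the constructor shape and performEvent signature are inferred from the fields used above, not verified against the real AbstractTaskLifecycleWork:

// Illustrative subclass only; the constructor parameters and performEvent
// visibility are assumptions inferred from the snippet above.
public class LogTaskEventWork extends AbstractTaskLifecycleWork {
    public LogTaskEventWork(ClusterControllerService ccs, JobId jobId, TaskAttemptId taId) {
        super(ccs, jobId, taId);
    }

    @Override
    protected void performEvent(TaskAttempt ta) {
        // runWork has already navigated job -> activity cluster -> task
        // cluster -> attempt before handing us the resolved TaskAttempt.
        System.out.println("Lifecycle event for " + ta.getTaskAttemptId() + " on " + ta.getNodeId());
    }
}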

Example 5 with TaskAttempt

use of org.apache.hyracks.control.cc.job.TaskAttempt in project asterixdb by apache.

From the class JobExecutor, the method assignTaskLocations:

private void assignTaskLocations(TaskCluster tc, Map<String, List<TaskAttemptDescriptor>> taskAttemptMap) throws HyracksException {
    ActivityClusterGraph acg = jobRun.getActivityClusterGraph();
    Task[] tasks = tc.getTasks();
    List<TaskClusterAttempt> tcAttempts = tc.getAttempts();
    int attempts = tcAttempts.size();
    TaskClusterAttempt tcAttempt = new TaskClusterAttempt(tc, attempts);
    Map<TaskId, TaskAttempt> taskAttempts = new HashMap<>();
    Map<TaskId, LValueConstraintExpression> locationMap = new HashMap<>();
    // Create one TaskAttempt per task and a location-constraint variable for it.
    for (int i = 0; i < tasks.length; ++i) {
        Task ts = tasks[i];
        TaskId tid = ts.getTaskId();
        TaskAttempt taskAttempt = new TaskAttempt(tcAttempt, new TaskAttemptId(new TaskId(tid.getActivityId(), tid.getPartition()), attempts), ts);
        taskAttempt.setStatus(TaskAttempt.TaskStatus.INITIALIZED, null);
        locationMap.put(tid, new PartitionLocationExpression(tid.getActivityId().getOperatorDescriptorId(), tid.getPartition()));
        taskAttempts.put(tid, taskAttempt);
    }
    tcAttempt.setTaskAttempts(taskAttempts);
    // Solve the location constraints to pick a node for each task attempt.
    solver.solve(locationMap.values());
    for (int i = 0; i < tasks.length; ++i) {
        Task ts = tasks[i];
        TaskId tid = ts.getTaskId();
        TaskAttempt taskAttempt = taskAttempts.get(tid);
        String nodeId = assignLocation(acg, locationMap, tid, taskAttempt);
        taskAttempt.setNodeId(nodeId);
        taskAttempt.setStatus(TaskAttempt.TaskStatus.RUNNING, null);
        taskAttempt.setStartTime(System.currentTimeMillis());
        List<TaskAttemptDescriptor> tads = taskAttemptMap.computeIfAbsent(nodeId, k -> new ArrayList<>());
        OperatorDescriptorId opId = tid.getActivityId().getOperatorDescriptorId();
        jobRun.registerOperatorLocation(opId, tid.getPartition(), nodeId);
        ActivityPartitionDetails apd = ts.getActivityPlan().getActivityPartitionDetails();
        TaskAttemptDescriptor tad = new TaskAttemptDescriptor(taskAttempt.getTaskAttemptId(), apd.getPartitionCount(), apd.getInputPartitionCounts(), apd.getOutputPartitionCounts());
        tads.add(tad);
    }
    tcAttempt.initializePendingTaskCounter();
    tcAttempts.add(tcAttempt);
    /**
     * Improvement for reducing master/slave message communications: for each
     * TaskAttemptDescriptor, we set the NetworkAddress[][] partitionLocations,
     * in which each row is for an incoming connector descriptor and each
     * column is for an input channel of the connector.
     */
    INodeManager nodeManager = ccs.getNodeManager();
    for (Map.Entry<String, List<TaskAttemptDescriptor>> e : taskAttemptMap.entrySet()) {
        List<TaskAttemptDescriptor> tads = e.getValue();
        for (TaskAttemptDescriptor tad : tads) {
            TaskAttemptId taid = tad.getTaskAttemptId();
            int attempt = taid.getAttempt();
            TaskId tid = taid.getTaskId();
            ActivityId aid = tid.getActivityId();
            List<IConnectorDescriptor> inConnectors = acg.getActivityInputs(aid);
            int[] inPartitionCounts = tad.getInputPartitionCounts();
            if (inPartitionCounts == null) {
                continue;
            }
            NetworkAddress[][] partitionLocations = new NetworkAddress[inPartitionCounts.length][];
            for (int i = 0; i < inPartitionCounts.length; ++i) {
                ConnectorDescriptorId cdId = inConnectors.get(i).getConnectorId();
                IConnectorPolicy policy = jobRun.getConnectorPolicyMap().get(cdId);
                /**
                 * Carry sender location information into a task, except when
                 * this is a re-attempt and the send side is materialized blocking.
                 */
                if (attempt > 0 && policy.materializeOnSendSide() && policy.consumerWaitsForProducerToFinish()) {
                    continue;
                }
                ActivityId producerAid = acg.getProducerActivity(cdId);
                partitionLocations[i] = new NetworkAddress[inPartitionCounts[i]];
                for (int j = 0; j < inPartitionCounts[i]; ++j) {
                    TaskId producerTaskId = new TaskId(producerAid, j);
                    String nodeId = findTaskLocation(producerTaskId);
                    partitionLocations[i][j] = nodeManager.getNodeControllerState(nodeId).getDataPort();
                }
            }
            tad.setInputPartitionLocations(partitionLocations);
        }
    }
    tcAttempt.setStatus(TaskClusterAttempt.TaskClusterStatus.RUNNING);
    tcAttempt.setStartTime(System.currentTimeMillis());
    inProgressTaskClusters.add(tc);
}
Also used : INodeManager(org.apache.hyracks.control.cc.cluster.INodeManager) Task(org.apache.hyracks.control.cc.job.Task) TaskId(org.apache.hyracks.api.dataflow.TaskId) TaskClusterAttempt(org.apache.hyracks.control.cc.job.TaskClusterAttempt) HashMap(java.util.HashMap) ActivityId(org.apache.hyracks.api.dataflow.ActivityId) ConnectorDescriptorId(org.apache.hyracks.api.dataflow.ConnectorDescriptorId) NetworkAddress(org.apache.hyracks.api.comm.NetworkAddress) ArrayList(java.util.ArrayList) List(java.util.List) TaskAttempt(org.apache.hyracks.control.cc.job.TaskAttempt) PartitionLocationExpression(org.apache.hyracks.api.constraints.expressions.PartitionLocationExpression) IConnectorDescriptor(org.apache.hyracks.api.dataflow.IConnectorDescriptor) OperatorDescriptorId(org.apache.hyracks.api.dataflow.OperatorDescriptorId) TaskAttemptId(org.apache.hyracks.api.dataflow.TaskAttemptId) IConnectorPolicy(org.apache.hyracks.api.dataflow.connectors.IConnectorPolicy) Constraint(org.apache.hyracks.api.constraints.Constraint) LValueConstraintExpression(org.apache.hyracks.api.constraints.expressions.LValueConstraintExpression) TaskAttemptDescriptor(org.apache.hyracks.control.common.job.TaskAttemptDescriptor) ActivityClusterGraph(org.apache.hyracks.api.job.ActivityClusterGraph) Map(java.util.Map)
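
The comment in the method describes the shape of the partitionLocations array; a toy illustration of that layout with plain strings standing in for NetworkAddress values (all values hypothetical):

// Row i = the i-th incoming connector of the activity; column j = the data
// port of the node producing input partition j. Strings stand in for
// NetworkAddress here purely for illustration.
String[][] partitionLocations = {
        { "nc1:33000", "nc2:33000" },  // connector 0 has two producer partitions
        { "nc1:33000" }                // connector 1 has one producer partition
};
// A consumer reading input channel 1 of connector 0 would connect to:
String producer = partitionLocations[0][1]; // "nc2:33000"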

Aggregations

TaskAttempt (org.apache.hyracks.control.cc.job.TaskAttempt) 6
TaskClusterAttempt (org.apache.hyracks.control.cc.job.TaskClusterAttempt) 5
TaskCluster (org.apache.hyracks.control.cc.job.TaskCluster) 4
ActivityId (org.apache.hyracks.api.dataflow.ActivityId) 3
TaskAttemptId (org.apache.hyracks.api.dataflow.TaskAttemptId) 3
ActivityCluster (org.apache.hyracks.api.job.ActivityCluster) 3
Task (org.apache.hyracks.control.cc.job.Task) 3
ArrayList (java.util.ArrayList) 2
HashMap (java.util.HashMap) 2
List (java.util.List) 2
Map (java.util.Map) 2
TaskId (org.apache.hyracks.api.dataflow.TaskId) 2
INodeManager (org.apache.hyracks.control.cc.cluster.INodeManager) 2
HashSet (java.util.HashSet) 1
NetworkAddress (org.apache.hyracks.api.comm.NetworkAddress) 1
Constraint (org.apache.hyracks.api.constraints.Constraint) 1
LValueConstraintExpression (org.apache.hyracks.api.constraints.expressions.LValueConstraintExpression) 1
PartitionLocationExpression (org.apache.hyracks.api.constraints.expressions.PartitionLocationExpression) 1
ConnectorDescriptorId (org.apache.hyracks.api.dataflow.ConnectorDescriptorId) 1
IConnectorDescriptor (org.apache.hyracks.api.dataflow.IConnectorDescriptor) 1