use of org.apache.hyracks.control.cc.job.TaskCluster in project asterixdb by apache.
the class JobExecutor method abortTaskCluster.
private void abortTaskCluster(TaskClusterAttempt tcAttempt,
        TaskClusterAttempt.TaskClusterStatus failedOrAbortedStatus) {
    LOGGER.fine("Aborting task cluster: " + tcAttempt.getAttempt());
    Set<TaskAttemptId> abortTaskIds = new HashSet<>();
    Map<String, List<TaskAttemptId>> abortTaskAttemptMap = new HashMap<>();
    for (TaskAttempt ta : tcAttempt.getTaskAttempts().values()) {
        TaskAttemptId taId = ta.getTaskAttemptId();
        TaskAttempt.TaskStatus status = ta.getStatus();
        abortTaskIds.add(taId);
        LOGGER.fine("Checking " + taId + ": " + ta.getStatus());
        if (status == TaskAttempt.TaskStatus.RUNNING || status == TaskAttempt.TaskStatus.COMPLETED) {
            ta.setStatus(TaskAttempt.TaskStatus.ABORTED, null);
            ta.setEndTime(System.currentTimeMillis());
            List<TaskAttemptId> abortTaskAttempts = abortTaskAttemptMap.get(ta.getNodeId());
            if (status == TaskAttempt.TaskStatus.RUNNING && abortTaskAttempts == null) {
                abortTaskAttempts = new ArrayList<>();
                abortTaskAttemptMap.put(ta.getNodeId(), abortTaskAttempts);
            }
            if (status == TaskAttempt.TaskStatus.RUNNING) {
                abortTaskAttempts.add(taId);
            }
        }
    }
    final JobId jobId = jobRun.getJobId();
    LOGGER.fine("Abort map for job: " + jobId + ": " + abortTaskAttemptMap);
    INodeManager nodeManager = ccs.getNodeManager();
    for (Map.Entry<String, List<TaskAttemptId>> entry : abortTaskAttemptMap.entrySet()) {
        final NodeControllerState node = nodeManager.getNodeControllerState(entry.getKey());
        final List<TaskAttemptId> abortTaskAttempts = entry.getValue();
        if (node != null) {
            if (LOGGER.isLoggable(Level.FINE)) {
                LOGGER.fine("Aborting: " + abortTaskAttempts + " at " + entry.getKey());
            }
            try {
                node.getNodeController().abortTasks(jobId, abortTaskAttempts);
            } catch (Exception e) {
                LOGGER.log(Level.SEVERE, e.getMessage(), e);
            }
        }
    }
    inProgressTaskClusters.remove(tcAttempt.getTaskCluster());
    TaskCluster tc = tcAttempt.getTaskCluster();
    PartitionMatchMaker pmm = jobRun.getPartitionMatchMaker();
    pmm.removeUncommittedPartitions(tc.getProducedPartitions(), abortTaskIds);
    pmm.removePartitionRequests(tc.getRequiredPartitions(), abortTaskIds);
    tcAttempt.setStatus(failedOrAbortedStatus);
    tcAttempt.setEndTime(System.currentTimeMillis());
}
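Note the two collections the method builds: abortTaskIds collects every attempt in the cluster (used afterwards for partition cleanup), while abortTaskAttemptMap holds only the RUNNING attempts, grouped by node, so the cluster controller sends at most one abortTasks RPC per node controller rather than one per attempt. Below is a minimal, self-contained sketch of that grouping pattern; the Attempt record and plain strings are illustrative stand-ins for the Hyracks TaskAttempt, node id, and TaskAttemptId types, not the real API.

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class AbortGrouping {
    // Hypothetical stand-in for a (nodeId, taskAttemptId, running?) triple.
    record Attempt(String nodeId, String taskAttemptId, boolean running) {}

    public static void main(String[] args) {
        List<Attempt> attempts = List.of(
                new Attempt("nc1", "TAID:0", true),
                new Attempt("nc1", "TAID:1", true),
                new Attempt("nc2", "TAID:2", false)); // completed: no RPC needed

        // Group only the still-running attempts by node, as abortTaskCluster does.
        Map<String, List<String>> abortMap = new HashMap<>();
        for (Attempt a : attempts) {
            if (a.running()) {
                abortMap.computeIfAbsent(a.nodeId(), k -> new ArrayList<>()).add(a.taskAttemptId());
            }
        }
        // One abort message per node: {nc1=[TAID:0, TAID:1]}
        abortMap.forEach((node, ids) -> System.out.println("abortTasks -> " + node + ": " + ids));
    }
}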
use of org.apache.hyracks.control.cc.job.TaskCluster in project asterixdb by apache.
the class JobExecutor method notifyTaskFailure.
/**
 * Indicates that a single task attempt has encountered a failure.
 * @param ta the failed task attempt
 * @param exceptions exceptions thrown during the failure
 */
public void notifyTaskFailure(TaskAttempt ta, List<Exception> exceptions) {
    try {
        LOGGER.fine("Received failure notification for TaskAttempt " + ta.getTaskAttemptId());
        TaskAttemptId taId = ta.getTaskAttemptId();
        TaskCluster tc = ta.getTask().getTaskCluster();
        TaskClusterAttempt lastAttempt = findLastTaskClusterAttempt(tc);
        if (lastAttempt != null && taId.getAttempt() == lastAttempt.getAttempt()) {
            LOGGER.fine("Marking TaskAttempt " + ta.getTaskAttemptId() + " as failed");
            ta.setStatus(TaskAttempt.TaskStatus.FAILED, exceptions);
            abortTaskCluster(lastAttempt, TaskClusterAttempt.TaskClusterStatus.FAILED);
            abortDoomedTaskClusters();
            if (lastAttempt.getAttempt() >= jobRun.getActivityClusterGraph().getMaxReattempts()
                    || isCancelled()) {
                abortJob(exceptions);
                return;
            }
            startRunnableActivityClusters();
        } else {
            LOGGER.warning("Ignoring task failure notification: " + taId
                    + " -- Current last attempt = " + lastAttempt);
        }
    } catch (Exception e) {
        abortJob(Collections.singletonList(e));
    }
}
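The core decision here is the give-up-or-retry check: the failed attempt number is compared against the graph's maximum reattempt count, with cancellation short-circuiting to an abort. A hedged sketch of just that control flow, with plain ints and booleans standing in for ActivityClusterGraph.getMaxReattempts() and isCancelled(); the names and free-standing form below are illustrative, not the Hyracks API.

public class RetryDecision {
    // Attempt numbers are zero-based, so attempt N failing means N+1 tries so far.
    static boolean shouldAbortJob(int failedAttemptNumber, int maxReattempts, boolean cancelled) {
        return failedAttemptNumber >= maxReattempts || cancelled;
    }

    public static void main(String[] args) {
        int maxReattempts = 2; // assumed configuration value
        for (int attempt = 0; attempt <= 2; attempt++) {
            System.out.println("attempt " + attempt + " failed -> "
                    + (shouldAbortJob(attempt, maxReattempts, false)
                            ? "abortJob(exceptions)"
                            : "startRunnableActivityClusters()"));
        }
        // Attempts 0 and 1 trigger a retry; attempt 2 (>= maxReattempts) aborts the job.
    }
}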
use of org.apache.hyracks.control.cc.job.TaskCluster in project asterixdb by apache.
the class JobExecutor method startRunnableTaskClusters.
private void startRunnableTaskClusters(Set<TaskCluster> tcRoots) throws HyracksException {
    Map<TaskCluster, Runnability> runnabilityMap = new HashMap<>();
    for (TaskCluster tc : tcRoots) {
        assignRunnabilityRank(tc, runnabilityMap);
    }
    PriorityQueue<RankedRunnableTaskCluster> queue = new PriorityQueue<>();
    for (Map.Entry<TaskCluster, Runnability> e : runnabilityMap.entrySet()) {
        TaskCluster tc = e.getKey();
        Runnability runnability = e.getValue();
        if (runnability.getTag() != Runnability.Tag.RUNNABLE) {
            continue;
        }
        int priority = runnability.getPriority();
        if (priority >= 0 && priority < Integer.MAX_VALUE) {
            queue.add(new RankedRunnableTaskCluster(priority, tc));
        }
    }
    if (LOGGER.isLoggable(Level.FINE)) {
        LOGGER.fine("Ranked TCs: " + queue);
    }
    Map<String, List<TaskAttemptDescriptor>> taskAttemptMap = new HashMap<>();
    for (RankedRunnableTaskCluster rrtc : queue) {
        TaskCluster tc = rrtc.getTaskCluster();
        if (LOGGER.isLoggable(Level.FINE)) {
            LOGGER.fine("Found runnable TC: " + tc);
            List<TaskClusterAttempt> attempts = tc.getAttempts();
            LOGGER.fine("Attempts so far:" + attempts.size());
            for (TaskClusterAttempt tcAttempt : attempts) {
                LOGGER.fine("Status: " + tcAttempt.getStatus());
            }
        }
        assignTaskLocations(tc, taskAttemptMap);
    }
    if (taskAttemptMap.isEmpty()) {
        return;
    }
    startTasks(taskAttemptMap);
}
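RankedRunnableTaskCluster entries are ordered by their runnability priority, so the PriorityQueue keeps the lowest rank at the head. One JDK subtlety worth illustrating: iterating a java.util.PriorityQueue with for-each (as the loop above does) walks the backing heap array, which is not guaranteed to be sorted; only poll() yields ascending order. A self-contained sketch, with a hypothetical Ranked record standing in for RankedRunnableTaskCluster:

import java.util.PriorityQueue;

public class RankingDemo {
    // Hypothetical stand-in for RankedRunnableTaskCluster: ordered by ascending rank.
    record Ranked(int rank, String taskCluster) implements Comparable<Ranked> {
        public int compareTo(Ranked o) {
            return Integer.compare(rank, o.rank);
        }
    }

    public static void main(String[] args) {
        PriorityQueue<Ranked> queue = new PriorityQueue<>();
        queue.add(new Ranked(2, "TC:2"));
        queue.add(new Ranked(0, "TC:0"));
        queue.add(new Ranked(1, "TC:1"));

        // For-each iteration visits heap order, which may not be sorted.
        System.out.println("iteration order: " + queue);

        // poll() is what guarantees ascending rank: TC:0, TC:1, TC:2.
        while (!queue.isEmpty()) {
            System.out.println("next runnable: " + queue.poll().taskCluster());
        }
    }
}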
use of org.apache.hyracks.control.cc.job.TaskCluster in project asterixdb by apache.
the class JobExecutor method findDoomedTaskClusters.
private boolean findDoomedTaskClusters(TaskCluster tc, Set<TaskCluster> doomedTaskClusters) {
    if (doomedTaskClusters.contains(tc)) {
        return true;
    }
    TaskClusterAttempt lastAttempt = findLastTaskClusterAttempt(tc);
    if (lastAttempt != null) {
        switch (lastAttempt.getStatus()) {
            case ABORTED:
            case FAILED:
            case COMPLETED:
                return false;
            default:
                break;
        }
    }
    Map<ConnectorDescriptorId, IConnectorPolicy> connectorPolicyMap = jobRun.getConnectorPolicyMap();
    PartitionMatchMaker pmm = jobRun.getPartitionMatchMaker();
    boolean doomed = false;
    for (TaskCluster depTC : tc.getDependencyTaskClusters()) {
        if (findDoomedTaskClusters(depTC, doomedTaskClusters)) {
            doomed = true;
        }
    }
    for (PartitionId pid : tc.getRequiredPartitions()) {
        ConnectorDescriptorId cdId = pid.getConnectorDescriptorId();
        IConnectorPolicy cPolicy = connectorPolicyMap.get(cdId);
        PartitionState maxState = pmm.getMaximumAvailableState(pid);
        if ((maxState == null
                || (cPolicy.consumerWaitsForProducerToFinish() && maxState != PartitionState.COMMITTED))
                && findDoomedTaskClusters(partitionProducingTaskClusterMap.get(pid), doomedTaskClusters)) {
            doomed = true;
        }
    }
    if (doomed) {
        doomedTaskClusters.add(tc);
    }
    return doomed;
}
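The method propagates doom transitively through the dependency graph, memoizing results in the doomedTaskClusters set so shared producers are examined once. The sketch below shows just that recursion shape over a toy DAG; the TC record is a stand-in for TaskCluster, and the failedLeaves predicate is a placeholder for the real checks on attempt status, connector policy, and partition availability.

import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class DoomPropagation {
    // Toy stand-in for a TaskCluster: a name plus its dependency clusters.
    record TC(String name, List<TC> dependencies) {}

    // Returns true if tc is doomed, accumulating results in 'doomed'
    // (memoization, mirroring the doomedTaskClusters set above).
    static boolean findDoomed(TC tc, Set<TC> doomed, Set<TC> failedLeaves) {
        if (doomed.contains(tc)) {
            return true;
        }
        boolean isDoomed = failedLeaves.contains(tc); // placeholder for the real checks
        for (TC dep : tc.dependencies()) {
            if (findDoomed(dep, doomed, failedLeaves)) {
                isDoomed = true; // keep recursing into all deps, as the real code does
            }
        }
        if (isDoomed) {
            doomed.add(tc);
        }
        return isDoomed;
    }

    public static void main(String[] args) {
        TC producer = new TC("producer", List.of());
        TC middle = new TC("middle", List.of(producer));
        TC sink = new TC("sink", List.of(middle));
        Set<TC> doomed = new HashSet<>();
        findDoomed(sink, doomed, Set.of(producer)); // producer failed
        System.out.println(doomed); // doom propagates to all three clusters
    }
}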
use of org.apache.hyracks.control.cc.job.TaskCluster in project asterixdb by apache.
the class JobExecutor method abortOngoingTaskClusters.
/**
 * Aborts ongoing task clusters.
 *
 * @param taskFilter
 *            selects tasks that should be directly marked as failed without doing the aborting RPC.
 * @param exceptionGenerator
 *            generates an exception for tasks that are directly marked as failed.
 */
private void abortOngoingTaskClusters(ITaskFilter taskFilter, IExceptionGenerator exceptionGenerator)
        throws HyracksException {
    for (ActivityCluster ac : jobRun.getActivityClusterGraph().getActivityClusterMap().values()) {
        if (!isPlanned(ac)) {
            continue;
        }
        TaskCluster[] taskClusters = getActivityClusterPlan(ac).getTaskClusters();
        if (taskClusters == null) {
            continue;
        }
        for (TaskCluster tc : taskClusters) {
            TaskClusterAttempt lastTaskClusterAttempt = findLastTaskClusterAttempt(tc);
            if (lastTaskClusterAttempt == null
                    || !(lastTaskClusterAttempt.getStatus() == TaskClusterAttempt.TaskClusterStatus.COMPLETED
                            || lastTaskClusterAttempt.getStatus() == TaskClusterAttempt.TaskClusterStatus.RUNNING)) {
                continue;
            }
            boolean abort = false;
            for (TaskAttempt ta : lastTaskClusterAttempt.getTaskAttempts().values()) {
                assert ta.getStatus() == TaskAttempt.TaskStatus.COMPLETED
                        || ta.getStatus() == TaskAttempt.TaskStatus.RUNNING;
                if (taskFilter.directlyMarkAsFailed(ta)) {
                    // Directly mark it as failed, without the aborting RPC.
                    ta.setStatus(TaskAttempt.TaskStatus.FAILED,
                            Collections.singletonList(exceptionGenerator.getException(ta)));
                    ta.setEndTime(System.currentTimeMillis());
                    abort = true;
                }
            }
            if (abort) {
                abortTaskCluster(lastTaskClusterAttempt, TaskClusterAttempt.TaskClusterStatus.ABORTED);
            }
        }
        abortDoomedTaskClusters();
    }
}
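The two callbacks let one abort routine serve several failure scenarios: the filter picks attempts that can no longer be aborted over RPC (for example, attempts on a node that just died), and the generator supplies the exception recorded against them. A sketch of the pattern with simplified stand-in interfaces; the real ITaskFilter and IExceptionGenerator operate on TaskAttempt, whereas here a plain record is used for illustration.

import java.util.List;

public class CallbackSketch {
    // Simplified stand-ins for the ITaskFilter / IExceptionGenerator callbacks.
    interface TaskFilter { boolean directlyMarkAsFailed(Task t); }
    interface ExceptionGenerator { Exception getException(Task t); }

    record Task(String attemptId, String nodeId) {}

    static void abortOngoing(List<Task> running, TaskFilter filter, ExceptionGenerator gen) {
        for (Task t : running) {
            if (filter.directlyMarkAsFailed(t)) {
                // No RPC possible; record the failure locally, as the real method does.
                System.out.println(t.attemptId() + " -> FAILED: " + gen.getException(t).getMessage());
            }
        }
    }

    public static void main(String[] args) {
        List<Task> running = List.of(new Task("TAID:0", "nc1"), new Task("TAID:1", "nc2"));
        // E.g. node nc2 just died: fail its attempts directly, no abort RPC.
        abortOngoing(running,
                t -> "nc2".equals(t.nodeId()),
                t -> new Exception("Node nc2 failed before " + t.attemptId() + " completed"));
    }
}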