Use of org.apache.hyracks.control.cc.job.TaskClusterAttempt in project asterixdb by apache.
From the class JobExecutor, method notifyTaskFailure:
/**
 * Indicates that a single task attempt has encountered a failure.
 *
 * @param ta the failed task attempt
 * @param exceptions exceptions thrown during the failure
 */
public void notifyTaskFailure(TaskAttempt ta, List<Exception> exceptions) {
    try {
        LOGGER.fine("Received failure notification for TaskAttempt " + ta.getTaskAttemptId());
        TaskAttemptId taId = ta.getTaskAttemptId();
        TaskCluster tc = ta.getTask().getTaskCluster();
        TaskClusterAttempt lastAttempt = findLastTaskClusterAttempt(tc);
        if (lastAttempt != null && taId.getAttempt() == lastAttempt.getAttempt()) {
            LOGGER.fine("Marking TaskAttempt " + ta.getTaskAttemptId() + " as failed");
            ta.setStatus(TaskAttempt.TaskStatus.FAILED, exceptions);
            abortTaskCluster(lastAttempt, TaskClusterAttempt.TaskClusterStatus.FAILED);
            abortDoomedTaskClusters();
            if (lastAttempt.getAttempt() >= jobRun.getActivityClusterGraph().getMaxReattempts() || isCancelled()) {
                abortJob(exceptions);
                return;
            }
            startRunnableActivityClusters();
        } else {
            LOGGER.warning("Ignoring task failure notification: " + taId + " -- Current last attempt = " + lastAttempt);
        }
    } catch (Exception e) {
        abortJob(Collections.singletonList(e));
    }
}
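
The check against findLastTaskClusterAttempt ensures that only failures reported for the current (latest) attempt of the task cluster are acted upon; stale notifications are logged and ignored. A minimal sketch of such a helper, assuming attempts are stored in creation order by TaskCluster.getAttempts() (the same assumption findTaskLocation below relies on); the actual helper in JobExecutor may differ:

// Hedged sketch: return the most recent attempt of a task cluster, or null if it has none.
private TaskClusterAttempt findLastTaskClusterAttempt(TaskCluster tc) {
    List<TaskClusterAttempt> attempts = tc.getAttempts();
    if (attempts != null && !attempts.isEmpty()) {
        return attempts.get(attempts.size() - 1);
    }
    return null;
}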
Use of org.apache.hyracks.control.cc.job.TaskClusterAttempt in project asterixdb by apache.
From the class JobExecutor, method startRunnableTaskClusters:
private void startRunnableTaskClusters(Set<TaskCluster> tcRoots) throws HyracksException {
    Map<TaskCluster, Runnability> runnabilityMap = new HashMap<>();
    for (TaskCluster tc : tcRoots) {
        assignRunnabilityRank(tc, runnabilityMap);
    }
    PriorityQueue<RankedRunnableTaskCluster> queue = new PriorityQueue<>();
    for (Map.Entry<TaskCluster, Runnability> e : runnabilityMap.entrySet()) {
        TaskCluster tc = e.getKey();
        Runnability runnability = e.getValue();
        if (runnability.getTag() != Runnability.Tag.RUNNABLE) {
            continue;
        }
        int priority = runnability.getPriority();
        if (priority >= 0 && priority < Integer.MAX_VALUE) {
            queue.add(new RankedRunnableTaskCluster(priority, tc));
        }
    }
    if (LOGGER.isLoggable(Level.FINE)) {
        LOGGER.fine("Ranked TCs: " + queue);
    }
    Map<String, List<TaskAttemptDescriptor>> taskAttemptMap = new HashMap<>();
    for (RankedRunnableTaskCluster rrtc : queue) {
        TaskCluster tc = rrtc.getTaskCluster();
        if (LOGGER.isLoggable(Level.FINE)) {
            LOGGER.fine("Found runnable TC: " + tc);
            List<TaskClusterAttempt> attempts = tc.getAttempts();
            LOGGER.fine("Attempts so far: " + attempts.size());
            for (TaskClusterAttempt tcAttempt : attempts) {
                LOGGER.fine("Status: " + tcAttempt.getStatus());
            }
        }
        assignTaskLocations(tc, taskAttemptMap);
    }
    if (taskAttemptMap.isEmpty()) {
        return;
    }
    startTasks(taskAttemptMap);
}
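
The PriorityQueue above orders candidates by the rank derived from Runnability.getPriority(), so RankedRunnableTaskCluster needs a comparable rank. A hedged sketch of what such a wrapper could look like, assuming a lower rank means higher scheduling priority; the real class in Hyracks may carry additional state:

// Illustrative wrapper only; not copied from the Hyracks source.
class RankedRunnableTaskCluster implements Comparable<RankedRunnableTaskCluster> {
    private final int rank;
    private final TaskCluster taskCluster;

    RankedRunnableTaskCluster(int rank, TaskCluster taskCluster) {
        this.rank = rank;
        this.taskCluster = taskCluster;
    }

    public TaskCluster getTaskCluster() {
        return taskCluster;
    }

    @Override
    public int compareTo(RankedRunnableTaskCluster o) {
        // Lower rank is polled first from the PriorityQueue.
        return Integer.compare(rank, o.rank);
    }
}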
Use of org.apache.hyracks.control.cc.job.TaskClusterAttempt in project asterixdb by apache.
From the class JobExecutor, method findDoomedTaskClusters:
private boolean findDoomedTaskClusters(TaskCluster tc, Set<TaskCluster> doomedTaskClusters) {
    if (doomedTaskClusters.contains(tc)) {
        return true;
    }
    TaskClusterAttempt lastAttempt = findLastTaskClusterAttempt(tc);
    if (lastAttempt != null) {
        switch (lastAttempt.getStatus()) {
            case ABORTED:
            case FAILED:
            case COMPLETED:
                return false;
            default:
                break;
        }
    }
    Map<ConnectorDescriptorId, IConnectorPolicy> connectorPolicyMap = jobRun.getConnectorPolicyMap();
    PartitionMatchMaker pmm = jobRun.getPartitionMatchMaker();
    boolean doomed = false;
    for (TaskCluster depTC : tc.getDependencyTaskClusters()) {
        if (findDoomedTaskClusters(depTC, doomedTaskClusters)) {
            doomed = true;
        }
    }
    for (PartitionId pid : tc.getRequiredPartitions()) {
        ConnectorDescriptorId cdId = pid.getConnectorDescriptorId();
        IConnectorPolicy cPolicy = connectorPolicyMap.get(cdId);
        PartitionState maxState = pmm.getMaximumAvailableState(pid);
        if ((maxState == null || (cPolicy.consumerWaitsForProducerToFinish() && maxState != PartitionState.COMMITTED))
                && findDoomedTaskClusters(partitionProducingTaskClusterMap.get(pid), doomedTaskClusters)) {
            doomed = true;
        }
    }
    if (doomed) {
        doomedTaskClusters.add(tc);
    }
    return doomed;
}
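
findDoomedTaskClusters is the recursive core behind abortDoomedTaskClusters, which the failure path above invokes: a task cluster is doomed if any dependency or required partition producer is doomed or can no longer deliver its data. A hedged sketch of a driver that could sit on top of it, assuming some collection of in-progress task clusters (inProgressTaskClusters here is an assumed field, and the real method in JobExecutor may differ):

// Hypothetical driver: collect doomed clusters, then abort the last attempt of each.
private void abortDoomedTaskClusters() throws HyracksException {
    Set<TaskCluster> doomedTaskClusters = new HashSet<>();
    for (TaskCluster tc : inProgressTaskClusters) { // assumed field tracking running TCs
        findDoomedTaskClusters(tc, doomedTaskClusters);
    }
    for (TaskCluster tc : doomedTaskClusters) {
        TaskClusterAttempt tca = findLastTaskClusterAttempt(tc);
        if (tca != null) {
            abortTaskCluster(tca, TaskClusterAttempt.TaskClusterStatus.ABORTED);
        }
    }
}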
Use of org.apache.hyracks.control.cc.job.TaskClusterAttempt in project asterixdb by apache.
From the class JobExecutor, method abortOngoingTaskClusters:
/**
 * Aborts ongoing task clusters.
 *
 * @param taskFilter
 *            selects tasks that should be directly marked as failed without doing the aborting RPC.
 * @param exceptionGenerator
 *            generates an exception for tasks that are directly marked as failed.
 */
private void abortOngoingTaskClusters(ITaskFilter taskFilter, IExceptionGenerator exceptionGenerator)
        throws HyracksException {
    for (ActivityCluster ac : jobRun.getActivityClusterGraph().getActivityClusterMap().values()) {
        if (!isPlanned(ac)) {
            continue;
        }
        TaskCluster[] taskClusters = getActivityClusterPlan(ac).getTaskClusters();
        if (taskClusters == null) {
            continue;
        }
        for (TaskCluster tc : taskClusters) {
            TaskClusterAttempt lastTaskClusterAttempt = findLastTaskClusterAttempt(tc);
            if (lastTaskClusterAttempt == null
                    || !(lastTaskClusterAttempt.getStatus() == TaskClusterAttempt.TaskClusterStatus.COMPLETED
                            || lastTaskClusterAttempt.getStatus() == TaskClusterAttempt.TaskClusterStatus.RUNNING)) {
                continue;
            }
            boolean abort = false;
            for (TaskAttempt ta : lastTaskClusterAttempt.getTaskAttempts().values()) {
                assert ta.getStatus() == TaskAttempt.TaskStatus.COMPLETED
                        || ta.getStatus() == TaskAttempt.TaskStatus.RUNNING;
                if (taskFilter.directlyMarkAsFailed(ta)) {
                    // Directly mark it as failed, without further aborting.
                    ta.setStatus(TaskAttempt.TaskStatus.FAILED,
                            Collections.singletonList(exceptionGenerator.getException(ta)));
                    ta.setEndTime(System.currentTimeMillis());
                    abort = true;
                }
            }
            if (abort) {
                abortTaskCluster(lastTaskClusterAttempt, TaskClusterAttempt.TaskClusterStatus.ABORTED);
            }
        }
        abortDoomedTaskClusters();
    }
}
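
The taskFilter and exceptionGenerator parameters let a caller choose which running tasks to fail in place (with no abort RPC) and what exception to attach to them. A hedged example of a call site, assuming ITaskFilter and IExceptionGenerator are single-method interfaces usable as lambdas and that deadNodeId is a hypothetical identifier of a lost node; the actual call sites in JobExecutor may look different:

// Illustrative only: fail every task attempt that was running on the lost node.
String deadNodeId = "nc1"; // hypothetical node id
abortOngoingTaskClusters(
        ta -> deadNodeId.equals(ta.getNodeId()),
        ta -> new Exception("Node " + deadNodeId + " left the cluster"));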
Use of org.apache.hyracks.control.cc.job.TaskClusterAttempt in project asterixdb by apache.
From the class JobExecutor, method findTaskLocation:
private String findTaskLocation(TaskId tid) {
    ActivityId aid = tid.getActivityId();
    ActivityCluster ac = jobRun.getActivityClusterGraph().getActivityMap().get(aid);
    Task[] tasks = getActivityClusterPlan(ac).getActivityPlanMap().get(aid).getTasks();
    List<TaskClusterAttempt> tcAttempts = tasks[tid.getPartition()].getTaskCluster().getAttempts();
    if (tcAttempts == null || tcAttempts.isEmpty()) {
        return null;
    }
    TaskClusterAttempt lastTCA = tcAttempts.get(tcAttempts.size() - 1);
    TaskAttempt ta = lastTCA.getTaskAttempts().get(tid);
    return ta == null ? null : ta.getNodeId();
}
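
findTaskLocation resolves the node on which a given task last ran, returning null when its task cluster has never been attempted. A small hedged usage sketch; the TaskId constructor arguments shown here are illustrative:

// Hypothetical lookup: where did partition 0 of activity aid last run?
TaskId tid = new TaskId(aid, 0);
String nodeId = findTaskLocation(tid);
if (nodeId == null) {
    LOGGER.fine("No location recorded yet for task " + tid);
}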