Use of org.apache.hyracks.api.dataflow.TaskAttemptId in project asterixdb by apache.
The class CCNCFunctions, method readTaskAttemptId.
private static TaskAttemptId readTaskAttemptId(DataInputStream dis) throws IOException {
    // The wire format is four ints, in order: operator descriptor id,
    // activity id, partition, and attempt number.
    int odid = dis.readInt();
    int aid = dis.readInt();
    int partition = dis.readInt();
    int attempt = dis.readInt();
    return new TaskAttemptId(new TaskId(new ActivityId(new OperatorDescriptorId(odid), aid), partition), attempt);
}
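For symmetry, here is a minimal sketch of the write side emitting the same four-int wire format. It assumes the standard Hyracks accessors (getTaskId(), getActivityId(), getOperatorDescriptorId(), getId(), getLocalId(), getPartition(), getAttempt()); the actual serializer in CCNCFunctions may differ.

// Hypothetical counterpart to readTaskAttemptId above; a sketch, not the
// actual CCNCFunctions code.
private static void writeTaskAttemptId(DataOutputStream dos, TaskAttemptId taId) throws IOException {
    TaskId tid = taId.getTaskId();
    ActivityId aid = tid.getActivityId();
    dos.writeInt(aid.getOperatorDescriptorId().getId()); // operator descriptor id
    dos.writeInt(aid.getLocalId()); // activity id
    dos.writeInt(tid.getPartition()); // partition
    dos.writeInt(taId.getAttempt()); // attempt number
}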
Use of org.apache.hyracks.api.dataflow.TaskAttemptId in project asterixdb by apache.
The class AbortTasksWork, method run.
@Override
public void run() {
    if (LOGGER.isLoggable(Level.INFO)) {
        LOGGER.info("Aborting Tasks: " + jobId + ":" + tasks);
    }
    IDatasetPartitionManager dpm = ncs.getDatasetPartitionManager();
    if (dpm != null) {
        // Abort any result readers for this job before aborting its tasks.
        dpm.abortReader(jobId);
    }
    Joblet ji = ncs.getJobletMap().get(jobId);
    if (ji != null) {
        Map<TaskAttemptId, Task> taskMap = ji.getTaskMap();
        for (TaskAttemptId taId : tasks) {
            Task task = taskMap.get(taId);
            if (task != null) {
                task.abort();
            }
        }
    } else {
        LOGGER.log(Level.WARNING,
                "Joblet couldn't be found. Tasks of job " + jobId + " have all either completed or failed");
    }
}
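The task-map lookup is deliberately tolerant: an attempt id with no live task is skipped, because that task has already completed or failed. A self-contained sketch of the same pattern, with hypothetical stand-in types instead of Hyracks' Task and TaskAttemptId:

import java.util.HashMap;
import java.util.List;
import java.util.Map;

public final class AbortSketch {
    interface AbortableTask {
        void abort();
    }

    // Aborts every task still registered under one of the given ids; ids with
    // no live task are ignored because their work already finished.
    static void abortAll(Map<String, AbortableTask> liveTasks, List<String> ids) {
        for (String id : ids) {
            AbortableTask task = liveTasks.get(id);
            if (task != null) {
                task.abort();
            }
        }
    }

    public static void main(String[] args) {
        Map<String, AbortableTask> live = new HashMap<>();
        live.put("ta-1", () -> System.out.println("aborted ta-1"));
        // "ta-2" has no live task, so it is silently skipped.
        abortAll(live, List.of("ta-1", "ta-2"));
    }
}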
Use of org.apache.hyracks.api.dataflow.TaskAttemptId in project asterixdb by apache.
The class JobExecutor, method abortTaskCluster.
private void abortTaskCluster(TaskClusterAttempt tcAttempt,
        TaskClusterAttempt.TaskClusterStatus failedOrAbortedStatus) {
    LOGGER.fine("Aborting task cluster: " + tcAttempt.getAttempt());
    Set<TaskAttemptId> abortTaskIds = new HashSet<>();
    // Maps node id -> attempts on that node that must be actively aborted.
    Map<String, List<TaskAttemptId>> abortTaskAttemptMap = new HashMap<>();
    for (TaskAttempt ta : tcAttempt.getTaskAttempts().values()) {
        TaskAttemptId taId = ta.getTaskAttemptId();
        TaskAttempt.TaskStatus status = ta.getStatus();
        abortTaskIds.add(taId);
        LOGGER.fine("Checking " + taId + ": " + status);
        if (status == TaskAttempt.TaskStatus.RUNNING || status == TaskAttempt.TaskStatus.COMPLETED) {
            ta.setStatus(TaskAttempt.TaskStatus.ABORTED, null);
            ta.setEndTime(System.currentTimeMillis());
            List<TaskAttemptId> abortTaskAttempts = abortTaskAttemptMap.get(ta.getNodeId());
            // Only RUNNING attempts need an abort message sent to their node;
            // COMPLETED attempts are merely marked ABORTED locally.
            if (status == TaskAttempt.TaskStatus.RUNNING && abortTaskAttempts == null) {
                abortTaskAttempts = new ArrayList<>();
                abortTaskAttemptMap.put(ta.getNodeId(), abortTaskAttempts);
            }
            if (status == TaskAttempt.TaskStatus.RUNNING) {
                abortTaskAttempts.add(taId);
            }
        }
    }
    final JobId jobId = jobRun.getJobId();
    LOGGER.fine("Abort map for job: " + jobId + ": " + abortTaskAttemptMap);
    INodeManager nodeManager = ccs.getNodeManager();
    for (Map.Entry<String, List<TaskAttemptId>> entry : abortTaskAttemptMap.entrySet()) {
        final NodeControllerState node = nodeManager.getNodeControllerState(entry.getKey());
        final List<TaskAttemptId> abortTaskAttempts = entry.getValue();
        if (node != null) {
            if (LOGGER.isLoggable(Level.FINE)) {
                LOGGER.fine("Aborting: " + abortTaskAttempts + " at " + entry.getKey());
            }
            try {
                node.getNodeController().abortTasks(jobId, abortTaskAttempts);
            } catch (Exception e) {
                LOGGER.log(Level.SEVERE, e.getMessage(), e);
            }
        }
    }
    inProgressTaskClusters.remove(tcAttempt.getTaskCluster());
    TaskCluster tc = tcAttempt.getTaskCluster();
    PartitionMatchMaker pmm = jobRun.getPartitionMatchMaker();
    // Clean up partition state produced or requested by the aborted attempts.
    pmm.removeUncommittedPartitions(tc.getProducedPartitions(), abortTaskIds);
    pmm.removePartitionRequests(tc.getRequiredPartitions(), abortTaskIds);
    tcAttempt.setStatus(failedOrAbortedStatus);
    tcAttempt.setEndTime(System.currentTimeMillis());
}
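The get/null-check/put sequence that populates abortTaskAttemptMap is the pre-Java-8 grouping idiom; since Java 8 the same grouping can be written with Map.computeIfAbsent. A self-contained sketch with plain strings standing in for node ids and attempt ids:

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public final class GroupByNodeSketch {
    public static void main(String[] args) {
        // (nodeId, attemptId) pairs standing in for running TaskAttempts.
        String[][] running = { { "nc1", "ta-0" }, { "nc2", "ta-1" }, { "nc1", "ta-2" } };
        Map<String, List<String>> abortMap = new HashMap<>();
        for (String[] ta : running) {
            // Equivalent to the get/null-check/put sequence in abortTaskCluster.
            abortMap.computeIfAbsent(ta[0], k -> new ArrayList<>()).add(ta[1]);
        }
        // e.g. {nc1=[ta-0, ta-2], nc2=[ta-1]}; HashMap iteration order is unspecified.
        System.out.println(abortMap);
    }
}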
Use of org.apache.hyracks.api.dataflow.TaskAttemptId in project asterixdb by apache.
The class JobExecutor, method notifyTaskFailure.
/**
 * Indicates that a single task attempt has encountered a failure.
 *
 * @param ta the failed task attempt
 * @param exceptions exceptions thrown during the failure
 */
public void notifyTaskFailure(TaskAttempt ta, List<Exception> exceptions) {
    try {
        LOGGER.fine("Received failure notification for TaskAttempt " + ta.getTaskAttemptId());
        TaskAttemptId taId = ta.getTaskAttemptId();
        TaskCluster tc = ta.getTask().getTaskCluster();
        TaskClusterAttempt lastAttempt = findLastTaskClusterAttempt(tc);
        // Only act if the notification belongs to the latest attempt of the
        // task cluster; failures from superseded attempts are stale and ignored.
        if (lastAttempt != null && taId.getAttempt() == lastAttempt.getAttempt()) {
            LOGGER.fine("Marking TaskAttempt " + ta.getTaskAttemptId() + " as failed");
            ta.setStatus(TaskAttempt.TaskStatus.FAILED, exceptions);
            abortTaskCluster(lastAttempt, TaskClusterAttempt.TaskClusterStatus.FAILED);
            abortDoomedTaskClusters();
            // Give up once the retry budget is exhausted or the job was cancelled.
            if (lastAttempt.getAttempt() >= jobRun.getActivityClusterGraph().getMaxReattempts() || isCancelled()) {
                abortJob(exceptions);
                return;
            }
            startRunnableActivityClusters();
        } else {
            LOGGER.warning("Ignoring task failure notification: " + taId + " -- Current last attempt = " + lastAttempt);
        }
    } catch (Exception e) {
        abortJob(Collections.singletonList(e));
    }
}
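The guard on attempt numbers above is what keeps a stale failure notification from aborting a healthy re-execution. A self-contained sketch of the same decision logic, with a hypothetical retry budget in place of getMaxReattempts():

public final class TaskFailureSketch {
    enum Action { IGNORE, RETRY, ABORT_JOB }

    static final int MAX_REATTEMPTS = 3; // hypothetical stand-in for getMaxReattempts()

    static Action onTaskFailure(int failedAttempt, int currentAttempt, boolean cancelled) {
        if (failedAttempt != currentAttempt) {
            return Action.IGNORE; // stale: a newer attempt superseded the failed one
        }
        if (failedAttempt >= MAX_REATTEMPTS || cancelled) {
            return Action.ABORT_JOB; // retry budget exhausted or job cancelled
        }
        return Action.RETRY; // abort the cluster attempt, then reschedule
    }

    public static void main(String[] args) {
        System.out.println(onTaskFailure(1, 2, false)); // IGNORE
        System.out.println(onTaskFailure(2, 2, false)); // RETRY
        System.out.println(onTaskFailure(3, 3, false)); // ABORT_JOB
    }
}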
Use of org.apache.hyracks.api.dataflow.TaskAttemptId in project asterixdb by apache.
The class StartTasksWork, method run.
@Override
public void run() {
    Task task = null;
    try {
        NCServiceContext serviceCtx = ncs.getContext();
        Joblet joblet = getOrCreateLocalJoblet(deploymentId, jobId, serviceCtx, acgBytes);
        final ActivityClusterGraph acg = joblet.getActivityClusterGraph();
        // Resolves record descriptors for an activity's inputs and outputs
        // from the connector map of its activity cluster.
        IRecordDescriptorProvider rdp = new IRecordDescriptorProvider() {
            @Override
            public RecordDescriptor getOutputRecordDescriptor(ActivityId aid, int outputIndex) {
                ActivityCluster ac = acg.getActivityMap().get(aid);
                IConnectorDescriptor conn = ac.getActivityOutputMap().get(aid).get(outputIndex);
                return ac.getConnectorRecordDescriptorMap().get(conn.getConnectorId());
            }

            @Override
            public RecordDescriptor getInputRecordDescriptor(ActivityId aid, int inputIndex) {
                ActivityCluster ac = acg.getActivityMap().get(aid);
                IConnectorDescriptor conn = ac.getActivityInputMap().get(aid).get(inputIndex);
                return ac.getConnectorRecordDescriptorMap().get(conn.getConnectorId());
            }
        };
        for (TaskAttemptDescriptor td : taskDescriptors) {
            // Unwrap the attempt id down to the activity it belongs to.
            TaskAttemptId taId = td.getTaskAttemptId();
            TaskId tid = taId.getTaskId();
            ActivityId aid = tid.getActivityId();
            ActivityCluster ac = acg.getActivityMap().get(aid);
            IActivity han = ac.getActivityMap().get(aid);
            if (LOGGER.isLoggable(Level.INFO)) {
                LOGGER.info("Initializing " + taId + " -> " + han);
            }
            final int partition = tid.getPartition();
            List<IConnectorDescriptor> inputs = ac.getActivityInputMap().get(aid);
            task = new Task(joblet, taId, han.getClass().getName(), ncs.getExecutor(), ncs,
                    createInputChannels(td, inputs));
            IOperatorNodePushable operator = han.createPushRuntime(task, rdp, partition, td.getPartitionCount());
            // One partition collector per input connector.
            List<IPartitionCollector> collectors = new ArrayList<>();
            if (inputs != null) {
                for (int i = 0; i < inputs.size(); ++i) {
                    IConnectorDescriptor conn = inputs.get(i);
                    IConnectorPolicy cPolicy = connectorPoliciesMap.get(conn.getConnectorId());
                    if (LOGGER.isLoggable(Level.INFO)) {
                        LOGGER.info("input: " + i + ": " + conn.getConnectorId());
                    }
                    RecordDescriptor recordDesc = ac.getConnectorRecordDescriptorMap().get(conn.getConnectorId());
                    IPartitionCollector collector =
                            createPartitionCollector(td, partition, task, i, conn, recordDesc, cPolicy);
                    collectors.add(collector);
                }
            }
            // One partitioning frame writer per output connector.
            List<IConnectorDescriptor> outputs = ac.getActivityOutputMap().get(aid);
            if (outputs != null) {
                for (int i = 0; i < outputs.size(); ++i) {
                    final IConnectorDescriptor conn = outputs.get(i);
                    RecordDescriptor recordDesc = ac.getConnectorRecordDescriptorMap().get(conn.getConnectorId());
                    IConnectorPolicy cPolicy = connectorPoliciesMap.get(conn.getConnectorId());
                    IPartitionWriterFactory pwFactory =
                            createPartitionWriterFactory(task, cPolicy, jobId, conn, partition, taId, flags);
                    if (LOGGER.isLoggable(Level.INFO)) {
                        LOGGER.info("output: " + i + ": " + conn.getConnectorId());
                    }
                    IFrameWriter writer = conn.createPartitioner(task, recordDesc, pwFactory, partition,
                            td.getPartitionCount(), td.getOutputPartitionCounts()[i]);
                    operator.setOutputFrameWriter(i, writer, recordDesc);
                }
            }
            task.setTaskRuntime(collectors.toArray(new IPartitionCollector[collectors.size()]), operator);
            joblet.addTask(task);
            task.start();
        }
    } catch (Exception e) {
        LOGGER.log(Level.WARNING, "Failure starting a task", e);
        // Notify the CC of the start-task failure.
        List<Exception> exceptions = new ArrayList<>();
        exceptions.add(e); // carry the root cause to the CC
        ExceptionUtils.setNodeIds(exceptions, ncs.getId());
        ncs.getWorkQueue().schedule(new NotifyTaskFailureWork(ncs, task, exceptions));
    }
}
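The loop above keys everything off the id hierarchy: a TaskAttemptId wraps a TaskId, which wraps an ActivityId, which wraps an OperatorDescriptorId. A minimal sketch of building and unwrapping that chain, assuming the constructors and accessors already visible in readTaskAttemptId and StartTasksWork above:

import org.apache.hyracks.api.dataflow.ActivityId;
import org.apache.hyracks.api.dataflow.OperatorDescriptorId;
import org.apache.hyracks.api.dataflow.TaskAttemptId;
import org.apache.hyracks.api.dataflow.TaskId;

public final class IdHierarchySketch {
    public static void main(String[] args) {
        // Build: operator 3, activity 0 within it, partition 2, attempt 1.
        TaskAttemptId taId = new TaskAttemptId(
                new TaskId(new ActivityId(new OperatorDescriptorId(3), 0), 2), 1);
        // Unwrap, as StartTasksWork.run() does to locate the IActivity.
        TaskId tid = taId.getTaskId();
        ActivityId aid = tid.getActivityId();
        System.out.println(aid + " partition=" + tid.getPartition() + " attempt=" + taId.getAttempt());
    }
}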