use of org.apache.tez.dag.app.dag.event.DAGAppMasterEventUserServiceFatalError in project tez by apache.
the class TaskSchedulerManager method getProgress.
// Not synchronized to avoid deadlocks from TaskScheduler callbacks.
// TaskScheduler uses a separate thread for its callbacks. Since this method
// returns a value which is required, the TaskScheduler waits for the call to
// complete, which can lead to a deadlock if called from within a TSEH lock.
/**
 * Progress callback invoked by a task scheduler; also used to refresh the cached
 * cluster node count.
 *
 * <p>When the freshly read node count differs from {@code cachedNodeCount}, the cache is
 * updated and an {@link AMNodeEventNodeCountUpdated} tagged with {@code schedulerId} is sent.
 *
 * @param schedulerId identifier of the calling scheduler; used in the error message and in
 *                    the node-count event
 * @return the value of {@code dagAppMaster.getProgress()}
 * @throws RuntimeException wrapping any exception thrown while reading the node count; a
 *                          fatal-error event is sent to the DAGAppMaster before rethrowing
 */
public float getProgress(int schedulerId) {
  // By the time this is called, allocate has already been invoked, so the node count must
  // be available (may change after YARN-1722).
  // This heartbeat from the scheduler into the app is also used to piggy-back node updates
  // from the cluster, i.e. it doubles as a periodic node-count refresh. Hence schedulerId
  // is required.

  // TODO Handle this in TEZ-2124. Need a way to know which scheduler is calling in.
  // NOTE(review): the count is always read from scheduler index 0, while the event and the
  // error message below are attributed to schedulerId - confirm this is intended.
  int latestNodeCount;
  try {
    latestNodeCount = taskSchedulers[0].getClusterNodeCount();
  } catch (Exception e) {
    String schedulerName = Utils.getTaskSchedulerIdentifierString(schedulerId, appContext);
    String msg = "Error in TaskScheduler while getting node count" + ", scheduler=" + schedulerName;
    LOG.error(msg, e);
    DAGAppMasterEventUserServiceFatalError fatalError = new DAGAppMasterEventUserServiceFatalError(
        DAGAppMasterEventType.TASK_SCHEDULER_SERVICE_FATAL_ERROR, msg, e);
    sendEvent(fatalError);
    throw new RuntimeException(e);
  }

  boolean nodeCountChanged = latestNodeCount != cachedNodeCount;
  if (nodeCountChanged) {
    cachedNodeCount = latestNodeCount;
    sendEvent(new AMNodeEventNodeCountUpdated(cachedNodeCount, schedulerId));
  }

  return dagAppMaster.getProgress();
}
use of org.apache.tez.dag.app.dag.event.DAGAppMasterEventUserServiceFatalError in project tez by apache.
the class TaskSchedulerManager method preemptContainer.
/**
 * Handles an internal preemption request for a container.
 *
 * <p>Looks up the {@link AMContainer} for {@code containerId}, asks the scheduler recorded on
 * that container to deallocate it, and then sends an {@link AMContainerEventCompleted} for
 * the container. If deallocation throws, the error is logged and a fatal-error event is sent
 * to the DAGAppMaster; the completion event is still sent afterwards.
 *
 * @param schedulerId identifier of the calling scheduler (not read by this method; the
 *                    scheduler is taken from the container instead)
 * @param containerId the container being preempted
 */
public void preemptContainer(int schedulerId, ContainerId containerId) {
  // TODO Why is this making a call back into the scheduler, when the call is originating from there.
  // An AMContainer instance should already exist if an attempt is being made to preempt it
  // NOTE(review): if the lookup returns null, the NullPointerException raised in the try
  // block is caught and then raised again while building the error message - confirm a
  // missing container really cannot happen here.
  AMContainer amContainer = appContext.getAllContainers().get(containerId);

  try {
    int owningSchedulerId = amContainer.getTaskSchedulerIdentifier();
    taskSchedulers[owningSchedulerId].deallocateContainer(containerId);
  } catch (Exception e) {
    String schedulerName =
        Utils.getTaskSchedulerIdentifierString(amContainer.getTaskSchedulerIdentifier(), appContext);
    String msg = "Error in TaskScheduler when preempting container" + ", scheduler=" + schedulerName
        + ", containerId=" + containerId;
    LOG.error(msg, e);
    DAGAppMasterEventUserServiceFatalError fatalError = new DAGAppMasterEventUserServiceFatalError(
        DAGAppMasterEventType.TASK_SCHEDULER_SERVICE_FATAL_ERROR, msg, e);
    sendEvent(fatalError);
  }

  // Inform the Containers about completion.
  AMContainerEventCompleted completed = new AMContainerEventCompleted(
      containerId,
      ContainerExitStatus.INVALID,
      "Container preempted internally",
      TaskAttemptTerminationCause.INTERNAL_PREEMPTION);
  sendEvent(completed);
}
use of org.apache.tez.dag.app.dag.event.DAGAppMasterEventUserServiceFatalError in project tez by apache.
the class TaskSchedulerManager method handleTAUnsuccessfulEnd.
/**
 * Handles a task attempt that ended without succeeding.
 *
 * <p>Asks the scheduler identified by the event to deallocate the attempt, forwarding the
 * end reason and diagnostics. If that call throws, the error is logged, a fatal-error event
 * is sent to the DAGAppMaster and processing stops. Otherwise, when the attempt reports an
 * assigned container, a stop request is sent for that container and the node is informed
 * that the attempt ended.
 *
 * @param event the task-attempt-ended scheduler event
 */
private void handleTAUnsuccessfulEnd(AMSchedulerEventTAEnded event) {
  TaskAttempt attempt = event.getAttempt();

  // Propagate state and failure cause (if any) when informing the scheduler about the de-allocation.
  boolean wasContainerAllocated;
  try {
    wasContainerAllocated = taskSchedulers[event.getSchedulerId()].deallocateTask(
        attempt, false, event.getTaskAttemptEndReason(), event.getDiagnostics());
  } catch (Exception e) {
    String msg = "Error in TaskScheduler for handling Task De-allocation"
        + ", eventType=" + event.getType()
        + ", scheduler=" + Utils.getTaskSchedulerIdentifierString(event.getSchedulerId(), appContext)
        + ", taskAttemptId=" + attempt.getID();
    LOG.error(msg, e);
    sendEvent(new DAGAppMasterEventUserServiceFatalError(
        DAGAppMasterEventType.TASK_SCHEDULER_SERVICE_FATAL_ERROR, msg, e));
    return;
  }

  // Use the container id stored on the attempt: the scheduler may already have dropped
  // this assignment because the task was deallocated earlier (retroactive case).
  ContainerId attemptContainerId = attempt.getAssignedContainerID();
  boolean attemptHasContainer = attemptContainerId != null;

  if (!wasContainerAllocated) {
    LOG.info("Task: " + attempt.getID() + " has no container assignment in the scheduler");
    if (attemptHasContainer) {
      // Scheduler and attempt disagree about the assignment.
      LOG.error("No container allocated to task: " + attempt.getID() + " according to scheduler. Task reported container id: " + attemptContainerId);
    }
  }

  if (!attemptHasContainer) {
    return;
  }

  // TODO either ways send the necessary events
  // Ask the container to stop.
  sendEvent(new AMContainerEventStopRequest(attemptContainerId));

  // Inform the Node - the task has asked to be STOPPED / has already stopped.
  // AMNodeImpl blacklisting logic does not account for KILLED attempts.
  AMNodeEventTaskAttemptEnded attemptEnded = new AMNodeEventTaskAttemptEnded(
      appContext.getAllContainers().get(attemptContainerId).getContainer().getNodeId(),
      event.getSchedulerId(),
      attemptContainerId,
      attempt.getID(),
      event.getState() == TaskAttemptState.FAILED);
  sendEvent(attemptEnded);
}
use of org.apache.tez.dag.app.dag.event.DAGAppMasterEventUserServiceFatalError in project tez by apache.
the class TaskSchedulerManager method reportError.
/**
 * Handles an error reported by a task scheduler plugin.
 *
 * <ul>
 *   <li>{@code RESOURCEMANAGER_ERROR}: logged; additionally forwarded to the DAGAppMaster as
 *       a scheduling-service error when the reporting scheduler's class name equals
 *       {@code yarnSchedulerClassName}.</li>
 *   <li>Any other error whose type is {@code PERMANENT}: logged and sent to the DAGAppMaster
 *       as a fatal task-scheduler service error.</li>
 *   <li>Everything else: delegated to {@code Utils.processNonFatalServiceErrorReport}.</li>
 * </ul>
 *
 * @param taskSchedulerIndex index of the reporting scheduler
 * @param servicePluginError the error being reported
 * @param diagnostics        diagnostic text; may be {@code null} (rendered as an empty string
 *                           in the fatal-error message)
 * @param dagInfo            DAG information passed through to the non-fatal error handler
 */
public void reportError(int taskSchedulerIndex, ServicePluginError servicePluginError, String diagnostics, DagInfo dagInfo) {
  if (servicePluginError == YarnTaskSchedulerServiceError.RESOURCEMANAGER_ERROR) {
    // Pass the scheduler identifier and the diagnostics as two separate arguments so that
    // both "{}" placeholders in the format string are filled in.
    LOG.info("Error reported by scheduler {} - {}",
        Utils.getTaskSchedulerIdentifierString(taskSchedulerIndex, appContext), diagnostics);
    if (taskSchedulerDescriptors[taskSchedulerIndex].getClassName().equals(yarnSchedulerClassName)) {
      LOG.warn("Reporting a SchedulerServiceError to the DAGAppMaster since the error" + " was reported by the YARN task scheduler");
      sendEvent(new DAGAppMasterEventSchedulingServiceError(diagnostics));
    }
  } else if (servicePluginError.getErrorType() == ServicePluginError.ErrorType.PERMANENT) {
    String msg = "Fatal error reported by TaskScheduler"
        + ", scheduler=" + Utils.getTaskSchedulerIdentifierString(taskSchedulerIndex, appContext)
        + ", servicePluginError=" + servicePluginError
        + ", diagnostics= " + (diagnostics == null ? "" : diagnostics);
    LOG.error(msg);
    sendEvent(new DAGAppMasterEventUserServiceFatalError(DAGAppMasterEventType.TASK_SCHEDULER_SERVICE_FATAL_ERROR, msg, null));
  } else {
    Utils.processNonFatalServiceErrorReport(Utils.getTaskSchedulerIdentifierString(taskSchedulerIndex, appContext), servicePluginError, diagnostics, dagInfo, appContext, "TaskScheduler");
  }
}
use of org.apache.tez.dag.app.dag.event.DAGAppMasterEventUserServiceFatalError in project tez by apache.
the class TaskCommunicatorManager method reportError.
/**
 * Handles an error reported by a task communicator plugin.
 *
 * <p>An error whose type is {@code PERMANENT} is logged and sent to the DAGAppMaster as a
 * fatal task-communicator service error. Any other error is delegated to
 * {@code Utils.processNonFatalServiceErrorReport}.
 *
 * @param taskCommIndex      index of the reporting task communicator
 * @param servicePluginError the error being reported
 * @param diagnostics        diagnostic text; may be {@code null} (rendered as an empty string
 *                           in the fatal-error message)
 * @param dagInfo            DAG information passed through to the non-fatal error handler
 */
@Override
public void reportError(int taskCommIndex, ServicePluginError servicePluginError, String diagnostics, DagInfo dagInfo) {
  if (servicePluginError.getErrorType() == ServicePluginError.ErrorType.PERMANENT) {
    String msg = "Fatal Error reported by TaskCommunicator"
        + ", communicator=" + Utils.getTaskCommIdentifierString(taskCommIndex, context)
        + ", servicePluginError=" + servicePluginError
        + ", diagnostics= " + (diagnostics == null ? "" : diagnostics);
    // msg already carries the (null-safe) diagnostics, so log it as-is instead of
    // appending the diagnostics a second time.
    LOG.error(msg);
    sendEvent(new DAGAppMasterEventUserServiceFatalError(DAGAppMasterEventType.TASK_COMMUNICATOR_SERVICE_FATAL_ERROR, msg, null));
  } else {
    Utils.processNonFatalServiceErrorReport(Utils.getTaskCommIdentifierString(taskCommIndex, context), servicePluginError, diagnostics, dagInfo, context, "TaskCommunicator");
  }
}
Aggregations