use of org.apache.tez.dag.records.TezVertexID in project tez by apache.
the class TaskCommunicatorManager method heartbeat.
public TaskHeartbeatResponse heartbeat(TaskHeartbeatRequest request) throws IOException, TezException {
ContainerId containerId = ConverterUtils.toContainerId(request.getContainerIdentifier());
if (LOG.isDebugEnabled()) {
LOG.debug("Received heartbeat from container" + ", request=" + request);
}
if (!registeredContainers.containsKey(containerId)) {
LOG.warn("Received task heartbeat from unknown container with id: " + containerId + ", asking it to die");
return RESPONSE_SHOULD_DIE;
}
// A heartbeat can come in anytime. The AM may have made a decision to kill a running task/container
// meanwhile. If the decision is processed through the pipeline before the heartbeat is processed,
// the heartbeat will be dropped. Otherwise the heartbeat will be processed - and the system
// know how to handle this - via FailedInputEvents for example (relevant only if the heartbeat has events).
// So - avoiding synchronization.
pingContainerHeartbeatHandler(containerId);
TaskAttemptEventInfo eventInfo = new TaskAttemptEventInfo(0, null, 0);
TezTaskAttemptID taskAttemptID = request.getTaskAttemptId();
if (taskAttemptID != null) {
ContainerId containerIdFromMap = registeredAttempts.get(taskAttemptID);
if (containerIdFromMap == null || !containerIdFromMap.equals(containerId)) {
// This can happen when a task heartbeats. Meanwhile the container is unregistered.
// The information will eventually make it through to the plugin via a corresponding unregister.
// There's a race in that case between the unregister making it through, and this method returning.
// TODO TEZ-2003 (post) TEZ-2666. An exception back is likely a better approach than sending a shouldDie = true,
// so that the plugin can handle the scenario. Alternately augment the response with error codes.
// Error codes would be better than exceptions.
LOG.info("Attempt: " + taskAttemptID + " is not recognized for heartbeats");
return RESPONSE_SHOULD_DIE;
}
List<TezEvent> inEvents = request.getEvents();
if (LOG.isDebugEnabled()) {
LOG.debug("Ping from " + taskAttemptID.toString() + " events: " + (inEvents != null ? inEvents.size() : -1));
}
long currTime = context.getClock().getTime();
// taFinishedEvents - means the TaskAttemptFinishedEvent
// taGeneratedEvents - for recovery, means the events generated by this task attempt and is needed by its downstream vertices
// eventsForVertex - including all the taGeneratedEvents and other events such as INPUT_READ_ERROR_EVENT/INPUT_FAILED_EVENT
// taGeneratedEvents is routed both to TaskAttempt & Vertex. Route to Vertex is for performance consideration
// taFinishedEvents must be routed before taGeneratedEvents
List<TezEvent> taFinishedEvents = new ArrayList<TezEvent>();
List<TezEvent> taGeneratedEvents = new ArrayList<TezEvent>();
List<TezEvent> eventsForVertex = new ArrayList<TezEvent>();
TaskAttemptEventStatusUpdate taskAttemptEvent = null;
boolean readErrorReported = false;
for (TezEvent tezEvent : ListUtils.emptyIfNull(inEvents)) {
// for now, set the event time on the AM when it is received.
// this avoids any time disparity between machines.
tezEvent.setEventReceivedTime(currTime);
final EventType eventType = tezEvent.getEventType();
if (eventType == EventType.TASK_STATUS_UPDATE_EVENT) {
// send TA_STATUS_UPDATE before TA_DONE/TA_FAILED/TA_KILLED otherwise Status may be missed
taskAttemptEvent = new TaskAttemptEventStatusUpdate(taskAttemptID, (TaskStatusUpdateEvent) tezEvent.getEvent());
} else if (eventType == EventType.TASK_ATTEMPT_COMPLETED_EVENT || eventType == EventType.TASK_ATTEMPT_FAILED_EVENT || eventType == EventType.TASK_ATTEMPT_KILLED_EVENT) {
taFinishedEvents.add(tezEvent);
} else {
if (eventType == EventType.INPUT_READ_ERROR_EVENT) {
readErrorReported = true;
}
if (eventType == EventType.DATA_MOVEMENT_EVENT || eventType == EventType.COMPOSITE_DATA_MOVEMENT_EVENT || eventType == EventType.ROOT_INPUT_INITIALIZER_EVENT || eventType == EventType.VERTEX_MANAGER_EVENT) {
taGeneratedEvents.add(tezEvent);
}
eventsForVertex.add(tezEvent);
}
}
if (taskAttemptEvent != null) {
taskAttemptEvent.setReadErrorReported(readErrorReported);
sendEvent(taskAttemptEvent);
}
// route taGeneratedEvents to TaskAttempt
if (!taGeneratedEvents.isEmpty()) {
sendEvent(new TaskAttemptEventTezEventUpdate(taskAttemptID, taGeneratedEvents));
}
// route events to TaskAttempt
Preconditions.checkArgument(taFinishedEvents.size() <= 1, "Multiple TaskAttemptFinishedEvent");
for (TezEvent e : taFinishedEvents) {
EventMetaData sourceMeta = e.getSourceInfo();
switch(e.getEventType()) {
case TASK_ATTEMPT_FAILED_EVENT:
case TASK_ATTEMPT_KILLED_EVENT:
TaskAttemptTerminationCause errCause = null;
switch(sourceMeta.getEventGenerator()) {
case INPUT:
errCause = TaskAttemptTerminationCause.INPUT_READ_ERROR;
break;
case PROCESSOR:
errCause = TaskAttemptTerminationCause.APPLICATION_ERROR;
break;
case OUTPUT:
errCause = TaskAttemptTerminationCause.OUTPUT_WRITE_ERROR;
break;
case SYSTEM:
errCause = TaskAttemptTerminationCause.FRAMEWORK_ERROR;
break;
default:
throw new TezUncheckedException("Unknown EventProducerConsumerType: " + sourceMeta.getEventGenerator());
}
if (e.getEventType() == EventType.TASK_ATTEMPT_FAILED_EVENT) {
TaskAttemptFailedEvent taskFailedEvent = (TaskAttemptFailedEvent) e.getEvent();
sendEvent(new TaskAttemptEventAttemptFailed(sourceMeta.getTaskAttemptID(), TaskAttemptEventType.TA_FAILED, taskFailedEvent.getTaskFailureType(), "Error: " + taskFailedEvent.getDiagnostics(), errCause));
} else {
// Killed
TaskAttemptKilledEvent taskKilledEvent = (TaskAttemptKilledEvent) e.getEvent();
sendEvent(new TaskAttemptEventAttemptKilled(sourceMeta.getTaskAttemptID(), "Error: " + taskKilledEvent.getDiagnostics(), errCause));
}
break;
case TASK_ATTEMPT_COMPLETED_EVENT:
sendEvent(new TaskAttemptEvent(sourceMeta.getTaskAttemptID(), TaskAttemptEventType.TA_DONE));
break;
default:
throw new TezUncheckedException("Unhandled tez event type: " + e.getEventType());
}
}
if (!eventsForVertex.isEmpty()) {
TezVertexID vertexId = taskAttemptID.getTaskID().getVertexID();
sendEvent(new VertexEventRouteEvent(vertexId, Collections.unmodifiableList(eventsForVertex)));
}
taskHeartbeatHandler.pinged(taskAttemptID);
eventInfo = context.getCurrentDAG().getVertex(taskAttemptID.getTaskID().getVertexID()).getTaskAttemptTezEvents(taskAttemptID, request.getStartIndex(), request.getPreRoutedStartIndex(), request.getMaxEvents());
}
return new TaskHeartbeatResponse(false, eventInfo.getEvents(), eventInfo.getNextFromEventId(), eventInfo.getNextPreRoutedFromEventId());
}
use of org.apache.tez.dag.records.TezVertexID in project tez by apache.
the class DAGImpl method vertexSucceeded.
private boolean vertexSucceeded(Vertex vertex) {
numSuccessfulVertices++;
boolean recoveryFailed = false;
if (!commitAllOutputsOnSuccess && isCommittable()) {
// committing successful outputs immediately. check for shared outputs
List<VertexGroupInfo> groupsList = vertexGroupInfo.get(vertex.getName());
if (groupsList != null) {
List<VertexGroupInfo> commitList = Lists.newArrayListWithCapacity(groupsList.size());
for (VertexGroupInfo groupInfo : groupsList) {
groupInfo.successfulMembers++;
if (groupInfo.groupMembers.size() == groupInfo.successfulMembers && !groupInfo.outputs.isEmpty()) {
// group has outputs and all vertex members are done
LOG.info("All members of group: " + groupInfo.groupName + " are succeeded. Commiting outputs");
commitList.add(groupInfo);
}
}
for (VertexGroupInfo groupInfo : commitList) {
if (recoveryData != null && recoveryData.isVertexGroupCommitted(groupInfo.groupName)) {
LOG.info("VertexGroup was already committed as per recovery" + " data, groupName=" + groupInfo.groupName);
for (String vertexName : groupInfo.groupMembers) {
VertexRecoveryData vertexRecoveryData = recoveryData.getVertexRecoveryData(getVertex(vertexName).getVertexId());
Preconditions.checkArgument(vertexRecoveryData != null, "Vertex Group has been committed" + ", but no VertexRecoveryData found for its vertex " + vertexName);
VertexFinishedEvent vertexFinishedEvent = vertexRecoveryData.getVertexFinishedEvent();
Preconditions.checkArgument(vertexFinishedEvent != null, "Vertex Group has been committed" + ", but no VertexFinishedEvent found in its vertex " + vertexName);
Preconditions.checkArgument(vertexFinishedEvent.getState() == VertexState.SUCCEEDED, "Vertex Group has been committed, but unexpected vertex state of its vertex " + vertexName + ", vertexstate=" + vertexFinishedEvent.getState());
}
continue;
}
groupInfo.commitStarted = true;
final Vertex v = getVertex(groupInfo.groupMembers.iterator().next());
try {
Collection<TezVertexID> vertexIds = getVertexIds(groupInfo.groupMembers);
appContext.getHistoryHandler().handleCriticalEvent(new DAGHistoryEvent(getID(), new VertexGroupCommitStartedEvent(dagId, groupInfo.groupName, vertexIds, clock.getTime())));
} catch (IOException e) {
LOG.error("Failed to send commit recovery event to handler", e);
recoveryFailed = true;
}
if (!recoveryFailed) {
for (final String outputName : groupInfo.outputs) {
OutputKey outputKey = new OutputKey(outputName, groupInfo.groupName, true);
CommitCallback groupCommitCallback = new CommitCallback(outputKey);
CallableEvent groupCommitCallableEvent = new CallableEvent(groupCommitCallback) {
public Void call() throws Exception {
OutputCommitter committer = v.getOutputCommitters().get(outputName);
LOG.info("Committing output: " + outputName);
commitOutput(committer);
return null;
}
};
ListenableFuture<Void> groupCommitFuture = appContext.getExecService().submit(groupCommitCallableEvent);
Futures.addCallback(groupCommitFuture, groupCommitCallableEvent.getCallback());
commitFutures.put(outputKey, groupCommitFuture);
}
}
}
}
}
if (recoveryFailed) {
LOG.info("Recovery failure occurred during commit");
enactKill(DAGTerminationCause.RECOVERY_FAILURE, VertexTerminationCause.COMMIT_FAILURE);
}
return !recoveryFailed;
}
use of org.apache.tez.dag.records.TezVertexID in project tez by apache.
the class DAGImpl method commitCompleted.
private boolean commitCompleted(DAGEventCommitCompleted commitCompletedEvent) {
Preconditions.checkState(commitFutures.remove(commitCompletedEvent.getOutputKey()) != null, "Unknown commit:" + commitCompletedEvent.getOutputKey());
boolean commitFailed = false;
boolean recoveryFailed = false;
if (commitCompletedEvent.isSucceeded()) {
LOG.info("Commit succeeded for output:" + commitCompletedEvent.getOutputKey());
OutputKey outputKey = commitCompletedEvent.getOutputKey();
if (outputKey.isVertexGroupOutput) {
VertexGroupInfo vertexGroup = vertexGroups.get(outputKey.getEntityName());
vertexGroup.successfulCommits++;
if (vertexGroup.isCommitted()) {
if (!commitAllOutputsOnSuccess) {
try {
Collection<TezVertexID> vertexIds = getVertexIds(vertexGroup.groupMembers);
appContext.getHistoryHandler().handleCriticalEvent(new DAGHistoryEvent(getID(), new VertexGroupCommitFinishedEvent(getID(), commitCompletedEvent.getOutputKey().getEntityName(), vertexIds, clock.getTime())));
} catch (IOException e) {
String diag = "Failed to send commit recovery event to handler, " + ExceptionUtils.getStackTrace(e);
addDiagnostic(diag);
LOG.error(diag);
recoveryFailed = true;
}
}
}
}
} else {
String diag = "Commit failed for output: " + commitCompletedEvent.getOutputKey() + ", " + ExceptionUtils.getStackTrace(commitCompletedEvent.getException());
addDiagnostic(diag);
LOG.error(diag);
commitFailed = true;
}
if (commitFailed) {
enactKill(DAGTerminationCause.COMMIT_FAILURE, VertexTerminationCause.OTHER_VERTEX_FAILURE);
cancelCommits();
}
if (recoveryFailed) {
enactKill(DAGTerminationCause.RECOVERY_FAILURE, VertexTerminationCause.OTHER_VERTEX_FAILURE);
cancelCommits();
}
return !commitFailed && !recoveryFailed;
}
use of org.apache.tez.dag.records.TezVertexID in project tez by apache.
the class StateChangeNotifier method unregisterForTaskSuccessUpdates.
public void unregisterForTaskSuccessUpdates(String vertexName, TaskStateUpdateListener listener) {
TezVertexID vertexId = validateAndGetVertexId(vertexName);
Preconditions.checkNotNull(listener, "listener cannot be null");
taskWriteLock.lock();
try {
taskListeners.remove(vertexId, listener);
} finally {
taskWriteLock.unlock();
}
}
use of org.apache.tez.dag.records.TezVertexID in project tez by apache.
the class StateChangeNotifier method registerForVertexUpdates.
// -------------- VERTEX STATE CHANGE SECTION ---------------
public void registerForVertexUpdates(String vertexName, Set<org.apache.tez.dag.api.event.VertexState> stateSet, VertexStateUpdateListener listener) {
TezVertexID vertexId = validateAndGetVertexId(vertexName);
writeLock.lock();
// Read within the lock, to ensure a consistent view is seen.
try {
List<VertexStateUpdate> previousUpdates = lastKnowStatesMap.get(vertexId);
ListenerContainer listenerContainer = new ListenerContainer(listener, stateSet);
Set<ListenerContainer> listenerContainers = vertexListeners.get(vertexId);
if (listenerContainers == null || !listenerContainers.contains(listenerContainer)) {
vertexListeners.put(vertexId, listenerContainer);
// Sent from within the lock to avoid duplicate events, and out of order events.
if (previousUpdates != null && !previousUpdates.isEmpty()) {
for (VertexStateUpdate update : previousUpdates) {
listenerContainer.sendStateUpdate(update);
}
}
} else {
// Disallow multiple register calls.
throw new TezUncheckedException("Only allowed to register once for a listener. CurrentContext: vertexName=" + vertexName + ", Listener: " + listener);
}
} finally {
writeLock.unlock();
}
}
Aggregations