use of org.apache.tez.dag.app.dag.event.VertexEventRouteEvent in project tez by apache.
the class VertexImpl method checkTasksForCompletion.
// triggered by task_complete
static VertexState checkTasksForCompletion(final VertexImpl vertex) {
// this log helps quickly count the completion count for a vertex.
// grepping and counting for attempts and handling re-tries is time consuming
LOG.info("Task Completion: " + constructCheckTasksForCompletionLog(vertex));
// check for vertex failure first
if (vertex.completedTaskCount > vertex.tasks.size()) {
LOG.error("task completion accounting issue: completedTaskCount > nTasks:" + constructCheckTasksForCompletionLog(vertex));
}
if (vertex.completedTaskCount == vertex.tasks.size()) {
// finished - gather stats
vertex.finalStatistics = vertex.constructStatistics();
// Only succeed if tasks complete successfully and no terminationCause is registered or if failures are below configured threshold.
boolean vertexSucceeded = vertex.succeededTaskCount == vertex.numTasks;
boolean vertexFailuresBelowThreshold = (vertex.succeededTaskCount + vertex.failedTaskCount == vertex.numTasks) && (vertex.failedTaskCount * 100 <= vertex.maxFailuresPercent * vertex.numTasks);
if ((vertexSucceeded || vertexFailuresBelowThreshold) && vertex.terminationCause == null) {
if (vertexSucceeded) {
LOG.info("All tasks have succeeded, vertex:" + vertex.logIdentifier);
} else {
LOG.info("All tasks in the vertex " + vertex.logIdentifier + " have completed and the percentage of failed tasks (failed/total) (" + vertex.failedTaskCount + "/" + vertex.numTasks + ") is less that the threshold of " + vertex.maxFailuresPercent);
vertex.addDiagnostic("Vertex succeeded as percentage of failed tasks (failed/total) (" + vertex.failedTaskCount + "/" + vertex.numTasks + ") is less that the threshold of " + vertex.maxFailuresPercent);
vertex.logSuccessDiagnostics = true;
for (Task task : vertex.tasks.values()) {
if (!task.getState().equals(TaskState.FAILED)) {
continue;
}
// Find the last attempt and mark that as successful
Iterator<TezTaskAttemptID> attempts = task.getAttempts().keySet().iterator();
TezTaskAttemptID lastAttempt = null;
while (attempts.hasNext()) {
TezTaskAttemptID attempt = attempts.next();
if (lastAttempt == null || attempt.getId() > lastAttempt.getId()) {
lastAttempt = attempt;
}
}
LOG.info("Succeeding failed task attempt:" + lastAttempt);
for (Map.Entry<Vertex, Edge> vertexEdge : vertex.targetVertices.entrySet()) {
Vertex destVertex = vertexEdge.getKey();
Edge edge = vertexEdge.getValue();
try {
List<TezEvent> tezEvents = edge.generateEmptyEventsForAttempt(lastAttempt);
// Downstream vertices need to receive a SUCCEEDED completion event for each failed task to ensure num bipartite count is correct
VertexEventTaskAttemptCompleted completionEvent = new VertexEventTaskAttemptCompleted(lastAttempt, TaskAttemptStateInternal.SUCCEEDED);
// Notify all target vertices
vertex.eventHandler.handle(new VertexEventSourceTaskAttemptCompleted(destVertex.getVertexId(), completionEvent));
vertex.eventHandler.handle(new VertexEventRouteEvent(destVertex.getVertexId(), tezEvents));
} catch (Exception e) {
throw new TezUncheckedException(e);
}
}
}
}
if (vertex.commitVertexOutputs && !vertex.committed.getAndSet(true)) {
// start commit if there're commits or just finish if no commits
return commitOrFinish(vertex);
} else {
// just finish because no vertex committing needed
return vertex.finished(VertexState.SUCCEEDED);
}
}
return finishWithTerminationCause(vertex);
}
// return the current state, Vertex not finished yet
return vertex.getInternalState();
}
use of org.apache.tez.dag.app.dag.event.VertexEventRouteEvent in project tez by apache.
the class TaskCommunicatorManager method heartbeat.
public TaskHeartbeatResponse heartbeat(TaskHeartbeatRequest request) throws IOException, TezException {
ContainerId containerId = ConverterUtils.toContainerId(request.getContainerIdentifier());
if (LOG.isDebugEnabled()) {
LOG.debug("Received heartbeat from container" + ", request=" + request);
}
if (!registeredContainers.containsKey(containerId)) {
LOG.warn("Received task heartbeat from unknown container with id: " + containerId + ", asking it to die");
return RESPONSE_SHOULD_DIE;
}
// A heartbeat can come in anytime. The AM may have made a decision to kill a running task/container
// meanwhile. If the decision is processed through the pipeline before the heartbeat is processed,
// the heartbeat will be dropped. Otherwise the heartbeat will be processed - and the system
// know how to handle this - via FailedInputEvents for example (relevant only if the heartbeat has events).
// So - avoiding synchronization.
pingContainerHeartbeatHandler(containerId);
TaskAttemptEventInfo eventInfo = new TaskAttemptEventInfo(0, null, 0);
TezTaskAttemptID taskAttemptID = request.getTaskAttemptId();
if (taskAttemptID != null) {
ContainerId containerIdFromMap = registeredAttempts.get(taskAttemptID);
if (containerIdFromMap == null || !containerIdFromMap.equals(containerId)) {
// This can happen when a task heartbeats. Meanwhile the container is unregistered.
// The information will eventually make it through to the plugin via a corresponding unregister.
// There's a race in that case between the unregister making it through, and this method returning.
// TODO TEZ-2003 (post) TEZ-2666. An exception back is likely a better approach than sending a shouldDie = true,
// so that the plugin can handle the scenario. Alternately augment the response with error codes.
// Error codes would be better than exceptions.
LOG.info("Attempt: " + taskAttemptID + " is not recognized for heartbeats");
return RESPONSE_SHOULD_DIE;
}
List<TezEvent> inEvents = request.getEvents();
if (LOG.isDebugEnabled()) {
LOG.debug("Ping from " + taskAttemptID.toString() + " events: " + (inEvents != null ? inEvents.size() : -1));
}
long currTime = context.getClock().getTime();
// taFinishedEvents - means the TaskAttemptFinishedEvent
// taGeneratedEvents - for recovery, means the events generated by this task attempt and is needed by its downstream vertices
// eventsForVertex - including all the taGeneratedEvents and other events such as INPUT_READ_ERROR_EVENT/INPUT_FAILED_EVENT
// taGeneratedEvents is routed both to TaskAttempt & Vertex. Route to Vertex is for performance consideration
// taFinishedEvents must be routed before taGeneratedEvents
List<TezEvent> taFinishedEvents = new ArrayList<TezEvent>();
List<TezEvent> taGeneratedEvents = new ArrayList<TezEvent>();
List<TezEvent> eventsForVertex = new ArrayList<TezEvent>();
TaskAttemptEventStatusUpdate taskAttemptEvent = null;
boolean readErrorReported = false;
for (TezEvent tezEvent : ListUtils.emptyIfNull(inEvents)) {
// for now, set the event time on the AM when it is received.
// this avoids any time disparity between machines.
tezEvent.setEventReceivedTime(currTime);
final EventType eventType = tezEvent.getEventType();
if (eventType == EventType.TASK_STATUS_UPDATE_EVENT) {
// send TA_STATUS_UPDATE before TA_DONE/TA_FAILED/TA_KILLED otherwise Status may be missed
taskAttemptEvent = new TaskAttemptEventStatusUpdate(taskAttemptID, (TaskStatusUpdateEvent) tezEvent.getEvent());
} else if (eventType == EventType.TASK_ATTEMPT_COMPLETED_EVENT || eventType == EventType.TASK_ATTEMPT_FAILED_EVENT || eventType == EventType.TASK_ATTEMPT_KILLED_EVENT) {
taFinishedEvents.add(tezEvent);
} else {
if (eventType == EventType.INPUT_READ_ERROR_EVENT) {
readErrorReported = true;
}
if (eventType == EventType.DATA_MOVEMENT_EVENT || eventType == EventType.COMPOSITE_DATA_MOVEMENT_EVENT || eventType == EventType.ROOT_INPUT_INITIALIZER_EVENT || eventType == EventType.VERTEX_MANAGER_EVENT) {
taGeneratedEvents.add(tezEvent);
}
eventsForVertex.add(tezEvent);
}
}
if (taskAttemptEvent != null) {
taskAttemptEvent.setReadErrorReported(readErrorReported);
sendEvent(taskAttemptEvent);
}
// route taGeneratedEvents to TaskAttempt
if (!taGeneratedEvents.isEmpty()) {
sendEvent(new TaskAttemptEventTezEventUpdate(taskAttemptID, taGeneratedEvents));
}
// route events to TaskAttempt
Preconditions.checkArgument(taFinishedEvents.size() <= 1, "Multiple TaskAttemptFinishedEvent");
for (TezEvent e : taFinishedEvents) {
EventMetaData sourceMeta = e.getSourceInfo();
switch(e.getEventType()) {
case TASK_ATTEMPT_FAILED_EVENT:
case TASK_ATTEMPT_KILLED_EVENT:
TaskAttemptTerminationCause errCause = null;
switch(sourceMeta.getEventGenerator()) {
case INPUT:
errCause = TaskAttemptTerminationCause.INPUT_READ_ERROR;
break;
case PROCESSOR:
errCause = TaskAttemptTerminationCause.APPLICATION_ERROR;
break;
case OUTPUT:
errCause = TaskAttemptTerminationCause.OUTPUT_WRITE_ERROR;
break;
case SYSTEM:
errCause = TaskAttemptTerminationCause.FRAMEWORK_ERROR;
break;
default:
throw new TezUncheckedException("Unknown EventProducerConsumerType: " + sourceMeta.getEventGenerator());
}
if (e.getEventType() == EventType.TASK_ATTEMPT_FAILED_EVENT) {
TaskAttemptFailedEvent taskFailedEvent = (TaskAttemptFailedEvent) e.getEvent();
sendEvent(new TaskAttemptEventAttemptFailed(sourceMeta.getTaskAttemptID(), TaskAttemptEventType.TA_FAILED, taskFailedEvent.getTaskFailureType(), "Error: " + taskFailedEvent.getDiagnostics(), errCause));
} else {
// Killed
TaskAttemptKilledEvent taskKilledEvent = (TaskAttemptKilledEvent) e.getEvent();
sendEvent(new TaskAttemptEventAttemptKilled(sourceMeta.getTaskAttemptID(), "Error: " + taskKilledEvent.getDiagnostics(), errCause));
}
break;
case TASK_ATTEMPT_COMPLETED_EVENT:
sendEvent(new TaskAttemptEvent(sourceMeta.getTaskAttemptID(), TaskAttemptEventType.TA_DONE));
break;
default:
throw new TezUncheckedException("Unhandled tez event type: " + e.getEventType());
}
}
if (!eventsForVertex.isEmpty()) {
TezVertexID vertexId = taskAttemptID.getTaskID().getVertexID();
sendEvent(new VertexEventRouteEvent(vertexId, Collections.unmodifiableList(eventsForVertex)));
}
taskHeartbeatHandler.pinged(taskAttemptID);
eventInfo = context.getCurrentDAG().getVertex(taskAttemptID.getTaskID().getVertexID()).getTaskAttemptTezEvents(taskAttemptID, request.getStartIndex(), request.getPreRoutedStartIndex(), request.getMaxEvents());
}
return new TaskHeartbeatResponse(false, eventInfo.getEvents(), eventInfo.getNextFromEventId(), eventInfo.getNextPreRoutedFromEventId());
}
use of org.apache.tez.dag.app.dag.event.VertexEventRouteEvent in project tez by apache.
the class TestVertexImpl method testExceptionFromII_HandleInputInitializerEvent.
@Test(timeout = 5000)
public void testExceptionFromII_HandleInputInitializerEvent() throws Exception {
useCustomInitializer = true;
customInitializer = new EventHandlingRootInputInitializer(null, IIExceptionLocation.HandleInputInitializerEvent);
EventHandlingRootInputInitializer initializer = (EventHandlingRootInputInitializer) customInitializer;
setupPreDagCreation();
dagPlan = createDAGPlanWithRunningInitializer();
setupPostDagCreation();
VertexImplWithRunningInputInitializer v1 = (VertexImplWithRunningInputInitializer) vertices.get("vertex1");
VertexImplWithRunningInputInitializer v2 = (VertexImplWithRunningInputInitializer) vertices.get("vertex2");
initVertex(v1);
startVertex(v1);
Assert.assertEquals(VertexState.RUNNING, v1.getState());
Assert.assertEquals(VertexState.INITIALIZING, v2.getState());
dispatcher.await();
// Wait for the initializer to be invoked - which may be a separate thread.
while (!initializer.initStarted.get()) {
Thread.sleep(10);
}
Assert.assertFalse(initializer.eventReceived.get());
Assert.assertFalse(initializer.initComplete.get());
// Signal the initializer by sending an event - via vertex1
InputInitializerEvent event = InputInitializerEvent.create("vertex2", "input1", null);
// Create taskId and taskAttemptId for the single task that exists in vertex1
TezTaskID t0_v1 = TezTaskID.getInstance(v1.getVertexId(), 0);
TezTaskAttemptID ta0_t0_v1 = TezTaskAttemptID.getInstance(t0_v1, 0);
TezEvent tezEvent = new TezEvent(event, new EventMetaData(EventProducerConsumerType.OUTPUT, "vertex1", "vertex2", ta0_t0_v1));
// at least one task attempt is succeed, otherwise input initialize events won't been handled.
dispatcher.getEventHandler().handle(new TaskEvent(t0_v1, TaskEventType.T_ATTEMPT_LAUNCHED));
dispatcher.getEventHandler().handle(new TaskEventTASucceeded(ta0_t0_v1));
dispatcher.getEventHandler().handle(new VertexEventRouteEvent(v1.getVertexId(), Collections.singletonList(tezEvent)));
dispatcher.await();
// it would cause v2 fail as its II throw exception in handleInputInitializerEvent
String diagnostics = StringUtils.join(v2.getDiagnostics(), ",");
assertTrue(diagnostics.contains(IIExceptionLocation.HandleInputInitializerEvent.name()));
Assert.assertEquals(VertexState.FAILED, v2.getState());
Assert.assertEquals(VertexTerminationCause.ROOT_INPUT_INIT_FAILURE, v2.getTerminationCause());
}
use of org.apache.tez.dag.app.dag.event.VertexEventRouteEvent in project tez by apache.
the class TestVertexImpl method sendVertexEventRouteEvent.
private void sendVertexEventRouteEvent(Vertex sourceVertex, TezEvent... tezEvents) {
dispatcher.getEventHandler().handle(new VertexEventRouteEvent(sourceVertex.getVertexId(), Arrays.asList(tezEvents)));
dispatcher.await();
}
use of org.apache.tez.dag.app.dag.event.VertexEventRouteEvent in project tez by apache.
the class TestVertexImpl method testVertexGetTAAttemptsObsoletionWithPendingRoutes.
@Test(timeout = 5000)
public void testVertexGetTAAttemptsObsoletionWithPendingRoutes() throws Exception {
initAllVertices(VertexState.INITED);
VertexImpl v1 = vertices.get("vertex1");
startVertex(v1);
VertexImpl v2 = vertices.get("vertex2");
startVertex(v2);
VertexImpl v3 = vertices.get("vertex3");
VertexImpl v4 = vertices.get("vertex4");
List<ScheduleTaskRequest> taskList = new LinkedList<VertexManagerPluginContext.ScheduleTaskRequest>();
// scheduling start to trigger edge routing to begin
for (int i = 0; i < v4.getTotalTasks(); ++i) {
taskList.add(ScheduleTaskRequest.create(i, null));
}
v4.scheduleTasks(taskList);
Assert.assertEquals(VertexState.RUNNING, v4.getState());
Assert.assertEquals(1, v4.sourceVertices.size());
Edge e = v4.sourceVertices.get(v3);
TezTaskAttemptID v3TaId = TezTaskAttemptID.getInstance(TezTaskID.getInstance(v3.getVertexId(), 0), 0);
TezTaskAttemptID v4TaId = TezTaskAttemptID.getInstance(TezTaskID.getInstance(v4.getVertexId(), 0), 0);
for (int i = 0; i < 11; ++i) {
v4.handle(new VertexEventRouteEvent(v4.getVertexId(), Collections.singletonList(new TezEvent(DataMovementEvent.create(0, null), new EventMetaData(EventProducerConsumerType.OUTPUT, v3.getName(), v3.getName(), v3TaId)))));
}
dispatcher.await();
// verify all events have been are in taskEvents
Assert.assertEquals(11, v4.getOnDemandRouteEvents().size());
TaskAttemptEventInfo eventInfo;
EdgeManagerPluginOnDemand mockPlugin = mock(EdgeManagerPluginOnDemand.class);
EventRouteMetadata mockFailedRoute = EventRouteMetadata.create(1, new int[] { 0 });
e.edgeManager = mockPlugin;
when(mockPlugin.routeInputSourceTaskFailedEventToDestination(anyInt(), anyInt())).thenReturn(mockFailedRoute);
// return more events that dont evenly fit in max size
EventRouteMetadata mockRoute = EventRouteMetadata.create(2, new int[] { 0, 0 });
when(mockPlugin.routeDataMovementEventToDestination(anyInt(), anyInt(), anyInt())).thenReturn(mockRoute);
int fromEventId = 0;
eventInfo = v4.getTaskAttemptTezEvents(v4TaId, fromEventId, 0, 5);
fromEventId = eventInfo.getNextFromEventId();
// 0-1 events expanded and fit, 2nd event has pending routes
Assert.assertEquals(2, fromEventId);
Assert.assertEquals(5, eventInfo.getEvents().size());
// send an input failed event
v4.handle(new VertexEventRouteEvent(v4.getVertexId(), Collections.singletonList(new TezEvent(InputFailedEvent.create(0, 0), new EventMetaData(EventProducerConsumerType.OUTPUT, v3.getName(), v3.getName(), v3TaId)))));
// get only input failed event. all DM events obsoleted
eventInfo = v4.getTaskAttemptTezEvents(v4TaId, fromEventId, 0, 5);
fromEventId = eventInfo.getNextFromEventId();
Assert.assertEquals(12, fromEventId);
Assert.assertEquals(1, eventInfo.getEvents().size());
Assert.assertEquals(EventType.INPUT_FAILED_EVENT, eventInfo.getEvents().get(0).getEventType());
}
Aggregations