use of org.apache.tez.runtime.api.events.TaskAttemptFailedEvent in project tez by apache.
the class TaskCommunicatorManager method heartbeat.
public TaskHeartbeatResponse heartbeat(TaskHeartbeatRequest request) throws IOException, TezException {
ContainerId containerId = ConverterUtils.toContainerId(request.getContainerIdentifier());
if (LOG.isDebugEnabled()) {
LOG.debug("Received heartbeat from container" + ", request=" + request);
}
if (!registeredContainers.containsKey(containerId)) {
LOG.warn("Received task heartbeat from unknown container with id: " + containerId + ", asking it to die");
return RESPONSE_SHOULD_DIE;
}
// A heartbeat can come in anytime. The AM may have made a decision to kill a running task/container
// meanwhile. If the decision is processed through the pipeline before the heartbeat is processed,
// the heartbeat will be dropped. Otherwise the heartbeat will be processed - and the system
// know how to handle this - via FailedInputEvents for example (relevant only if the heartbeat has events).
// So - avoiding synchronization.
pingContainerHeartbeatHandler(containerId);
TaskAttemptEventInfo eventInfo = new TaskAttemptEventInfo(0, null, 0);
TezTaskAttemptID taskAttemptID = request.getTaskAttemptId();
if (taskAttemptID != null) {
ContainerId containerIdFromMap = registeredAttempts.get(taskAttemptID);
if (containerIdFromMap == null || !containerIdFromMap.equals(containerId)) {
// This can happen when a task heartbeats. Meanwhile the container is unregistered.
// The information will eventually make it through to the plugin via a corresponding unregister.
// There's a race in that case between the unregister making it through, and this method returning.
// TODO TEZ-2003 (post) TEZ-2666. An exception back is likely a better approach than sending a shouldDie = true,
// so that the plugin can handle the scenario. Alternately augment the response with error codes.
// Error codes would be better than exceptions.
LOG.info("Attempt: " + taskAttemptID + " is not recognized for heartbeats");
return RESPONSE_SHOULD_DIE;
}
List<TezEvent> inEvents = request.getEvents();
if (LOG.isDebugEnabled()) {
LOG.debug("Ping from " + taskAttemptID.toString() + " events: " + (inEvents != null ? inEvents.size() : -1));
}
long currTime = context.getClock().getTime();
// taFinishedEvents - means the TaskAttemptFinishedEvent
// taGeneratedEvents - for recovery, means the events generated by this task attempt and is needed by its downstream vertices
// eventsForVertex - including all the taGeneratedEvents and other events such as INPUT_READ_ERROR_EVENT/INPUT_FAILED_EVENT
// taGeneratedEvents is routed both to TaskAttempt & Vertex. Route to Vertex is for performance consideration
// taFinishedEvents must be routed before taGeneratedEvents
List<TezEvent> taFinishedEvents = new ArrayList<TezEvent>();
List<TezEvent> taGeneratedEvents = new ArrayList<TezEvent>();
List<TezEvent> eventsForVertex = new ArrayList<TezEvent>();
TaskAttemptEventStatusUpdate taskAttemptEvent = null;
boolean readErrorReported = false;
for (TezEvent tezEvent : ListUtils.emptyIfNull(inEvents)) {
// for now, set the event time on the AM when it is received.
// this avoids any time disparity between machines.
tezEvent.setEventReceivedTime(currTime);
final EventType eventType = tezEvent.getEventType();
if (eventType == EventType.TASK_STATUS_UPDATE_EVENT) {
// send TA_STATUS_UPDATE before TA_DONE/TA_FAILED/TA_KILLED otherwise Status may be missed
taskAttemptEvent = new TaskAttemptEventStatusUpdate(taskAttemptID, (TaskStatusUpdateEvent) tezEvent.getEvent());
} else if (eventType == EventType.TASK_ATTEMPT_COMPLETED_EVENT || eventType == EventType.TASK_ATTEMPT_FAILED_EVENT || eventType == EventType.TASK_ATTEMPT_KILLED_EVENT) {
taFinishedEvents.add(tezEvent);
} else {
if (eventType == EventType.INPUT_READ_ERROR_EVENT) {
readErrorReported = true;
}
if (eventType == EventType.DATA_MOVEMENT_EVENT || eventType == EventType.COMPOSITE_DATA_MOVEMENT_EVENT || eventType == EventType.ROOT_INPUT_INITIALIZER_EVENT || eventType == EventType.VERTEX_MANAGER_EVENT) {
taGeneratedEvents.add(tezEvent);
}
eventsForVertex.add(tezEvent);
}
}
if (taskAttemptEvent != null) {
taskAttemptEvent.setReadErrorReported(readErrorReported);
sendEvent(taskAttemptEvent);
}
// route taGeneratedEvents to TaskAttempt
if (!taGeneratedEvents.isEmpty()) {
sendEvent(new TaskAttemptEventTezEventUpdate(taskAttemptID, taGeneratedEvents));
}
// route events to TaskAttempt
Preconditions.checkArgument(taFinishedEvents.size() <= 1, "Multiple TaskAttemptFinishedEvent");
for (TezEvent e : taFinishedEvents) {
EventMetaData sourceMeta = e.getSourceInfo();
switch(e.getEventType()) {
case TASK_ATTEMPT_FAILED_EVENT:
case TASK_ATTEMPT_KILLED_EVENT:
TaskAttemptTerminationCause errCause = null;
switch(sourceMeta.getEventGenerator()) {
case INPUT:
errCause = TaskAttemptTerminationCause.INPUT_READ_ERROR;
break;
case PROCESSOR:
errCause = TaskAttemptTerminationCause.APPLICATION_ERROR;
break;
case OUTPUT:
errCause = TaskAttemptTerminationCause.OUTPUT_WRITE_ERROR;
break;
case SYSTEM:
errCause = TaskAttemptTerminationCause.FRAMEWORK_ERROR;
break;
default:
throw new TezUncheckedException("Unknown EventProducerConsumerType: " + sourceMeta.getEventGenerator());
}
if (e.getEventType() == EventType.TASK_ATTEMPT_FAILED_EVENT) {
TaskAttemptFailedEvent taskFailedEvent = (TaskAttemptFailedEvent) e.getEvent();
sendEvent(new TaskAttemptEventAttemptFailed(sourceMeta.getTaskAttemptID(), TaskAttemptEventType.TA_FAILED, taskFailedEvent.getTaskFailureType(), "Error: " + taskFailedEvent.getDiagnostics(), errCause));
} else {
// Killed
TaskAttemptKilledEvent taskKilledEvent = (TaskAttemptKilledEvent) e.getEvent();
sendEvent(new TaskAttemptEventAttemptKilled(sourceMeta.getTaskAttemptID(), "Error: " + taskKilledEvent.getDiagnostics(), errCause));
}
break;
case TASK_ATTEMPT_COMPLETED_EVENT:
sendEvent(new TaskAttemptEvent(sourceMeta.getTaskAttemptID(), TaskAttemptEventType.TA_DONE));
break;
default:
throw new TezUncheckedException("Unhandled tez event type: " + e.getEventType());
}
}
if (!eventsForVertex.isEmpty()) {
TezVertexID vertexId = taskAttemptID.getTaskID().getVertexID();
sendEvent(new VertexEventRouteEvent(vertexId, Collections.unmodifiableList(eventsForVertex)));
}
taskHeartbeatHandler.pinged(taskAttemptID);
eventInfo = context.getCurrentDAG().getVertex(taskAttemptID.getTaskID().getVertexID()).getTaskAttemptTezEvents(taskAttemptID, request.getStartIndex(), request.getPreRoutedStartIndex(), request.getMaxEvents());
}
return new TaskHeartbeatResponse(false, eventInfo.getEvents(), eventInfo.getNextFromEventId(), eventInfo.getNextPreRoutedFromEventId());
}
use of org.apache.tez.runtime.api.events.TaskAttemptFailedEvent in project tez by apache.
the class TestTaskCommunicatorManager2 method testTaskAttemptFailureViaHeartbeat.
// Tests fatal and non fatal
@SuppressWarnings("unchecked")
@Test(timeout = 5000)
public void testTaskAttemptFailureViaHeartbeat() throws IOException, TezException {
TaskCommunicatorManagerWrapperForTest wrapper = new TaskCommunicatorManagerWrapperForTest();
TaskSpec taskSpec1 = wrapper.createTaskSpec();
AMContainerTask amContainerTask1 = new AMContainerTask(taskSpec1, null, null, false, 10);
TaskSpec taskSpec2 = wrapper.createTaskSpec();
AMContainerTask amContainerTask2 = new AMContainerTask(taskSpec2, null, null, false, 10);
ContainerId containerId1 = wrapper.createContainerId(1);
wrapper.registerRunningContainer(containerId1);
wrapper.registerTaskAttempt(containerId1, amContainerTask1);
ContainerId containerId2 = wrapper.createContainerId(2);
wrapper.registerRunningContainer(containerId2);
wrapper.registerTaskAttempt(containerId2, amContainerTask2);
List<TezEvent> events = new LinkedList<>();
EventMetaData sourceInfo1 = new EventMetaData(EventMetaData.EventProducerConsumerType.PROCESSOR, "testVertex", null, taskSpec1.getTaskAttemptID());
TaskAttemptFailedEvent failedEvent1 = new TaskAttemptFailedEvent("non-fatal test error", TaskFailureType.NON_FATAL);
TezEvent failedEventT1 = new TezEvent(failedEvent1, sourceInfo1);
events.add(failedEventT1);
TaskHeartbeatRequest taskHeartbeatRequest1 = new TaskHeartbeatRequest(containerId1.toString(), taskSpec1.getTaskAttemptID(), events, 0, 0, 0);
wrapper.getTaskCommunicatorManager().heartbeat(taskHeartbeatRequest1);
ArgumentCaptor<Event> argumentCaptor = ArgumentCaptor.forClass(Event.class);
verify(wrapper.getEventHandler(), times(1)).handle(argumentCaptor.capture());
assertTrue(argumentCaptor.getAllValues().get(0) instanceof TaskAttemptEventAttemptFailed);
TaskAttemptEventAttemptFailed failedEvent = (TaskAttemptEventAttemptFailed) argumentCaptor.getAllValues().get(0);
assertEquals(TaskFailureType.NON_FATAL, failedEvent.getTaskFailureType());
assertTrue(failedEvent.getDiagnosticInfo().contains("non-fatal"));
events.clear();
reset(wrapper.getEventHandler());
EventMetaData sourceInfo2 = new EventMetaData(EventMetaData.EventProducerConsumerType.PROCESSOR, "testVertex", null, taskSpec2.getTaskAttemptID());
TaskAttemptFailedEvent failedEvent2 = new TaskAttemptFailedEvent("-fatal- test error", TaskFailureType.FATAL);
TezEvent failedEventT2 = new TezEvent(failedEvent2, sourceInfo2);
events.add(failedEventT2);
TaskHeartbeatRequest taskHeartbeatRequest2 = new TaskHeartbeatRequest(containerId2.toString(), taskSpec2.getTaskAttemptID(), events, 0, 0, 0);
wrapper.getTaskCommunicatorManager().heartbeat(taskHeartbeatRequest2);
argumentCaptor = ArgumentCaptor.forClass(Event.class);
verify(wrapper.getEventHandler(), times(1)).handle(argumentCaptor.capture());
assertTrue(argumentCaptor.getAllValues().get(0) instanceof TaskAttemptEventAttemptFailed);
failedEvent = (TaskAttemptEventAttemptFailed) argumentCaptor.getAllValues().get(0);
assertEquals(TaskFailureType.FATAL, failedEvent.getTaskFailureType());
assertTrue(failedEvent.getDiagnosticInfo().contains("-fatal-"));
}
use of org.apache.tez.runtime.api.events.TaskAttemptFailedEvent in project tez by apache.
the class TezEvent method deserializeEvent.
private void deserializeEvent(DataInput in) throws IOException {
if (!in.readBoolean()) {
event = null;
return;
}
eventType = EventType.values()[in.readInt()];
eventReceivedTime = in.readLong();
if (eventType.equals(EventType.TASK_STATUS_UPDATE_EVENT)) {
// TODO NEWTEZ convert to PB
event = new TaskStatusUpdateEvent();
((TaskStatusUpdateEvent) event).readFields(in);
} else {
int eventBytesLen = in.readInt();
byte[] eventBytes;
CodedInputStream input;
int startOffset = 0;
if (in instanceof DataInputBuffer) {
eventBytes = ((DataInputBuffer) in).getData();
startOffset = ((DataInputBuffer) in).getPosition();
} else {
eventBytes = new byte[eventBytesLen];
in.readFully(eventBytes);
}
input = CodedInputStream.newInstance(eventBytes, startOffset, eventBytesLen);
switch(eventType) {
case CUSTOM_PROCESSOR_EVENT:
CustomProcessorEventProto cpProto = CustomProcessorEventProto.parseFrom(input);
event = ProtoConverters.convertCustomProcessorEventFromProto(cpProto);
break;
case DATA_MOVEMENT_EVENT:
DataMovementEventProto dmProto = DataMovementEventProto.parseFrom(input);
event = ProtoConverters.convertDataMovementEventFromProto(dmProto);
break;
case COMPOSITE_ROUTED_DATA_MOVEMENT_EVENT:
CompositeRoutedDataMovementEventProto edmProto = CompositeRoutedDataMovementEventProto.parseFrom(eventBytes);
event = ProtoConverters.convertCompositeRoutedDataMovementEventFromProto(edmProto);
break;
case COMPOSITE_DATA_MOVEMENT_EVENT:
CompositeEventProto cProto = CompositeEventProto.parseFrom(input);
event = ProtoConverters.convertCompositeDataMovementEventFromProto(cProto);
break;
case VERTEX_MANAGER_EVENT:
VertexManagerEventProto vmProto = VertexManagerEventProto.parseFrom(input);
event = ProtoConverters.convertVertexManagerEventFromProto(vmProto);
break;
case INPUT_READ_ERROR_EVENT:
InputReadErrorEventProto ideProto = InputReadErrorEventProto.parseFrom(input);
event = InputReadErrorEvent.create(ideProto.getDiagnostics(), ideProto.getIndex(), ideProto.getVersion());
break;
case TASK_ATTEMPT_FAILED_EVENT:
TaskAttemptFailedEventProto tfProto = TaskAttemptFailedEventProto.parseFrom(input);
event = new TaskAttemptFailedEvent(tfProto.getDiagnostics(), TezConverterUtils.failureTypeFromProto(tfProto.getTaskFailureType()));
break;
case TASK_ATTEMPT_KILLED_EVENT:
TaskAttemptKilledEventProto tkProto = TaskAttemptKilledEventProto.parseFrom(input);
event = new TaskAttemptKilledEvent(tkProto.getDiagnostics());
break;
case TASK_ATTEMPT_COMPLETED_EVENT:
event = new TaskAttemptCompletedEvent();
break;
case INPUT_FAILED_EVENT:
InputFailedEventProto ifProto = InputFailedEventProto.parseFrom(input);
event = InputFailedEvent.create(ifProto.getTargetIndex(), ifProto.getVersion());
break;
case ROOT_INPUT_DATA_INFORMATION_EVENT:
RootInputDataInformationEventProto difProto = RootInputDataInformationEventProto.parseFrom(input);
event = ProtoConverters.convertRootInputDataInformationEventFromProto(difProto);
break;
case ROOT_INPUT_INITIALIZER_EVENT:
EventProtos.RootInputInitializerEventProto riiProto = EventProtos.RootInputInitializerEventProto.parseFrom(input);
event = ProtoConverters.convertRootInputInitializerEventFromProto(riiProto);
break;
default:
// RootInputUpdatePayload event not wrapped in a TezEvent.
throw new TezUncheckedException("Unexpected TezEvent" + ", type=" + eventType);
}
if (in instanceof DataInputBuffer) {
// Skip so that position is updated
int skipped = in.skipBytes(eventBytesLen);
if (skipped != eventBytesLen) {
throw new TezUncheckedException("Expected to skip " + eventBytesLen + " bytes. Actually skipped = " + skipped);
}
}
}
}
use of org.apache.tez.runtime.api.events.TaskAttemptFailedEvent in project tez by apache.
the class TezEvent method serializeEvent.
private void serializeEvent(DataOutput out) throws IOException {
if (event == null) {
out.writeBoolean(false);
return;
}
out.writeBoolean(true);
out.writeInt(eventType.ordinal());
out.writeLong(eventReceivedTime);
if (eventType.equals(EventType.TASK_STATUS_UPDATE_EVENT)) {
// TODO NEWTEZ convert to PB
TaskStatusUpdateEvent sEvt = (TaskStatusUpdateEvent) event;
sEvt.write(out);
} else {
AbstractMessage message;
switch(eventType) {
case CUSTOM_PROCESSOR_EVENT:
message = ProtoConverters.convertCustomProcessorEventToProto((CustomProcessorEvent) event);
break;
case DATA_MOVEMENT_EVENT:
message = ProtoConverters.convertDataMovementEventToProto((DataMovementEvent) event);
break;
case COMPOSITE_ROUTED_DATA_MOVEMENT_EVENT:
message = ProtoConverters.convertCompositeRoutedDataMovementEventToProto((CompositeRoutedDataMovementEvent) event);
break;
case COMPOSITE_DATA_MOVEMENT_EVENT:
message = ProtoConverters.convertCompositeDataMovementEventToProto((CompositeDataMovementEvent) event);
break;
case VERTEX_MANAGER_EVENT:
message = ProtoConverters.convertVertexManagerEventToProto((VertexManagerEvent) event);
break;
case INPUT_READ_ERROR_EVENT:
InputReadErrorEvent ideEvt = (InputReadErrorEvent) event;
message = InputReadErrorEventProto.newBuilder().setIndex(ideEvt.getIndex()).setDiagnostics(ideEvt.getDiagnostics()).setVersion(ideEvt.getVersion()).build();
break;
case TASK_ATTEMPT_FAILED_EVENT:
TaskAttemptFailedEvent tfEvt = (TaskAttemptFailedEvent) event;
message = TaskAttemptFailedEventProto.newBuilder().setDiagnostics(tfEvt.getDiagnostics()).setTaskFailureType(TezConverterUtils.failureTypeToProto(tfEvt.getTaskFailureType())).build();
break;
case TASK_ATTEMPT_KILLED_EVENT:
TaskAttemptKilledEvent tkEvent = (TaskAttemptKilledEvent) event;
message = TaskAttemptKilledEventProto.newBuilder().setDiagnostics(tkEvent.getDiagnostics()).build();
break;
case TASK_ATTEMPT_COMPLETED_EVENT:
message = TaskAttemptCompletedEventProto.newBuilder().build();
break;
case INPUT_FAILED_EVENT:
InputFailedEvent ifEvt = (InputFailedEvent) event;
message = InputFailedEventProto.newBuilder().setTargetIndex(ifEvt.getTargetIndex()).setVersion(ifEvt.getVersion()).build();
break;
case ROOT_INPUT_DATA_INFORMATION_EVENT:
message = ProtoConverters.convertRootInputDataInformationEventToProto((InputDataInformationEvent) event);
break;
case ROOT_INPUT_INITIALIZER_EVENT:
message = ProtoConverters.convertRootInputInitializerEventToProto((InputInitializerEvent) event);
break;
default:
throw new TezUncheckedException("Unknown TezEvent" + ", type=" + eventType);
}
if (out instanceof OutputStream) {
// DataOutputBuffer extends DataOutputStream
int serializedSize = message.getSerializedSize();
out.writeInt(serializedSize);
int buffersize = serializedSize < CodedOutputStream.DEFAULT_BUFFER_SIZE ? serializedSize : CodedOutputStream.DEFAULT_BUFFER_SIZE;
CodedOutputStream codedOut = CodedOutputStream.newInstance((OutputStream) out, buffersize);
message.writeTo(codedOut);
codedOut.flush();
} else {
byte[] eventBytes = message.toByteArray();
out.writeInt(eventBytes.length);
out.write(eventBytes);
}
}
}
Aggregations