use of org.apache.tez.history.parser.datamodel.TaskAttemptInfo in project tez by apache.
the class CriticalPathAnalyzer method analyzeAllocationOverhead.
private void analyzeAllocationOverhead(DagInfo dag) {
List<TaskAttemptInfo> preemptedAttempts = Lists.newArrayList();
for (VertexInfo v : dag.getVertices()) {
for (TaskInfo t : v.getTasks()) {
for (TaskAttemptInfo a : t.getTaskAttempts()) {
if (a.getTerminationCause().equals(TaskAttemptTerminationCause.INTERNAL_PREEMPTION.name())) {
LOG.debug("Found preempted attempt " + a.getTaskAttemptId());
preemptedAttempts.add(a);
}
}
}
}
for (int i = 0; i < criticalPath.size(); ++i) {
CriticalPathStep step = criticalPath.get(i);
TaskAttemptInfo attempt = step.attempt;
if (step.getType() != EntityType.ATTEMPT) {
continue;
}
long creationTime = attempt.getCreationTime();
long allocationTime = attempt.getAllocationTime();
long finishTime = attempt.getFinishTime();
if (allocationTime < step.startCriticalPathTime) {
// allocated before it became critical
continue;
}
// the attempt is critical before allocation. So allocation overhead needs analysis
Container container = attempt.getContainer();
if (container != null) {
Collection<TaskAttemptInfo> attempts = dag.getContainerMapping().get(container);
if (attempts != null && !attempts.isEmpty()) {
// arrange attempts by allocation time
List<TaskAttemptInfo> attemptsList = Lists.newArrayList(attempts);
Collections.sort(attemptsList, TaskAttemptInfo.orderingOnAllocationTime());
// walk the list to record allocation time before the current attempt
long containerPreviousAllocatedTime = 0;
int reUsesForVertex = 1;
for (TaskAttemptInfo containerAttempt : attemptsList) {
if (containerAttempt.getTaskAttemptId().equals(attempt.getTaskAttemptId())) {
break;
}
if (containerAttempt.getTaskInfo().getVertexInfo().getVertexId().equals(attempt.getTaskInfo().getVertexInfo().getVertexId())) {
// another task from the same vertex ran in this container. So there are multiple
// waves for this vertex on this container.
reUsesForVertex++;
}
long cAllocTime = containerAttempt.getAllocationTime();
long cFinishTime = containerAttempt.getFinishTime();
if (cFinishTime > creationTime) {
// for containerAttempts that used the container while this attempt was waiting
// add up time container was allocated to containerAttempt. Account for allocations
// that started before this attempt was created.
containerPreviousAllocatedTime += (cFinishTime - (cAllocTime > creationTime ? cAllocTime : creationTime));
}
}
int numVertexTasks = attempt.getTaskInfo().getVertexInfo().getNumTasks();
int intervalMaxConcurrency = getIntervalMaxConcurrency(creationTime, finishTime);
double numWaves = getWaves(numVertexTasks, intervalMaxConcurrency);
if (reUsesForVertex > 1) {
step.notes.add("Container ran multiple tasks for this vertex. ");
if (numWaves < 1) {
// less than 1 wave total but still ran more than 1 on this container
step.notes.add("Vertex potentially seeing contention from other branches in the DAG. ");
}
}
if (containerPreviousAllocatedTime == 0) {
step.notes.add("Container newly allocated.");
} else {
if (containerPreviousAllocatedTime >= attempt.getCreationToAllocationTimeInterval()) {
step.notes.add("Container was fully allocated");
} else {
step.notes.add("Container in use for " + SVGUtils.getTimeStr(containerPreviousAllocatedTime) + " out of " + SVGUtils.getTimeStr(attempt.getCreationToAllocationTimeInterval()) + " of allocation wait time");
}
}
}
// look for internal preemptions while attempt was waiting for allocation
for (TaskAttemptInfo a : preemptedAttempts) {
if (a.getTaskInfo().getVertexInfo().getVertexId().equals(attempt.getTaskInfo().getVertexInfo().getVertexId())) {
// dont preempt same vertex task. ideally this should look at priority but we dont have it
continue;
}
if (a.getFinishTime() > creationTime && a.getFinishTime() < allocationTime) {
// found an attempt that was preempted within this time interval
step.notes.add("Potentially waited for preemption of " + a.getShortName());
}
}
}
}
}
use of org.apache.tez.history.parser.datamodel.TaskAttemptInfo in project tez by apache.
the class SpillAnalyzerImpl method analyze.
@Override
public void analyze(DagInfo dagInfo) throws TezException {
for (VertexInfo vertexInfo : dagInfo.getVertices()) {
String vertexName = vertexInfo.getVertexName();
for (TaskAttemptInfo attemptInfo : vertexInfo.getTaskAttempts()) {
// Get ADDITIONAL_SPILL_COUNT, OUTPUT_BYTES for every source
Map<String, TezCounter> spillCountMap = attemptInfo.getCounter(TaskCounter.ADDITIONAL_SPILL_COUNT.name());
Map<String, TezCounter> spilledRecordsMap = attemptInfo.getCounter(TaskCounter.SPILLED_RECORDS.name());
Map<String, TezCounter> outputRecordsMap = attemptInfo.getCounter(TaskCounter.OUTPUT_RECORDS.name());
Map<String, TezCounter> outputBytesMap = attemptInfo.getCounter(TaskCounter.OUTPUT_BYTES.name());
for (Map.Entry<String, TezCounter> entry : spillCountMap.entrySet()) {
String source = entry.getKey();
long spillCount = entry.getValue().getValue();
long outBytes = outputBytesMap.get(source).getValue();
long outputRecords = outputRecordsMap.get(source).getValue();
long spilledRecords = spilledRecordsMap.get(source).getValue();
if (spillCount > 1 && outBytes > minOutputBytesPerTask) {
List<String> recorList = Lists.newLinkedList();
recorList.add(vertexName);
recorList.add(attemptInfo.getTaskAttemptId());
recorList.add(attemptInfo.getNodeId());
recorList.add(source);
recorList.add(spillCount + "");
recorList.add(attemptInfo.getTimeTaken() + "");
recorList.add(outBytes + "");
recorList.add(outputRecords + "");
recorList.add(spilledRecords + "");
recorList.add("Consider increasing " + TezRuntimeConfiguration.TEZ_RUNTIME_IO_SORT_MB + ". Try increasing container size.");
csvResult.addRecord(recorList.toArray(new String[recorList.size()]));
}
}
}
}
}
use of org.apache.tez.history.parser.datamodel.TaskAttemptInfo in project tez by apache.
the class SVGUtils method drawStep.
public void drawStep(CriticalPathStep step, long dagStartTime, int yOffset) {
if (step.getType() != EntityType.ATTEMPT) {
// draw initial vertex or final commit overhead
StringBuilder title = new StringBuilder();
String text = null;
if (step.getType() == EntityType.VERTEX_INIT) {
String vertex = step.getAttempt().getTaskInfo().getVertexInfo().getVertexName();
text = vertex + " : Init";
title.append(text).append(TITLE_BR);
} else {
text = "Output Commit";
title.append(text).append(TITLE_BR);
}
title.append("Critical Path Dependency: " + step.getReason()).append(TITLE_BR);
title.append("Critical Time: " + getTimeStr(step.getStopCriticalTime() - step.getStartCriticalTime())).append("");
title.append(Joiner.on(TITLE_BR).join(step.getNotes()));
String titleStr = title.toString();
int stopTimeInterval = (int) (step.getStopCriticalTime() - dagStartTime);
int startTimeInterval = (int) (step.getStartCriticalTime() - dagStartTime);
addRectStr(startTimeInterval, (stopTimeInterval - startTimeInterval), yOffset * STEP_GAP, STEP_GAP, VERTEX_INIT_COMMIT_COLOR, BORDER_COLOR, RECT_OPACITY, titleStr);
addTextStr((stopTimeInterval + startTimeInterval) / 2, (yOffset * STEP_GAP + STEP_GAP / 2), text, "middle", TEXT_SIZE, titleStr, false);
} else {
TaskAttemptInfo attempt = step.getAttempt();
int startCriticalTimeInterval = (int) (step.getStartCriticalTime() - dagStartTime);
int stopCriticalTimeInterval = (int) (step.getStopCriticalTime() - dagStartTime);
int creationTimeInterval = (int) (attempt.getCreationTime() - dagStartTime);
int allocationTimeInterval = attempt.getAllocationTime() > 0 ? (int) (attempt.getAllocationTime() - dagStartTime) : 0;
int launchTimeInterval = attempt.getStartTime() > 0 ? (int) (attempt.getStartTime() - dagStartTime) : 0;
int finishTimeInterval = (int) (attempt.getFinishTime() - dagStartTime);
LOG.debug(attempt.getTaskAttemptId() + " " + creationTimeInterval + " " + allocationTimeInterval + " " + launchTimeInterval + " " + finishTimeInterval);
StringBuilder title = new StringBuilder();
title.append("Attempt: " + attempt.getTaskAttemptId()).append(TITLE_BR);
title.append("Critical Path Dependency: " + step.getReason()).append(TITLE_BR);
title.append("Completion Status: " + attempt.getDetailedStatus()).append(TITLE_BR);
title.append("Critical Time Contribution: " + getTimeStr(step.getStopCriticalTime() - step.getStartCriticalTime())).append(TITLE_BR);
title.append("Critical start at: " + getTimeStr(startCriticalTimeInterval)).append(TITLE_BR);
title.append("Critical stop at: " + getTimeStr(stopCriticalTimeInterval)).append(TITLE_BR);
title.append("Created at: " + getTimeStr(creationTimeInterval)).append(TITLE_BR);
if (allocationTimeInterval > 0) {
title.append("Allocated at: " + getTimeStr(allocationTimeInterval)).append(TITLE_BR);
}
if (launchTimeInterval > 0) {
title.append("Launched at: " + getTimeStr(launchTimeInterval)).append(TITLE_BR);
}
title.append("Finished at: " + getTimeStr(finishTimeInterval)).append(TITLE_BR);
title.append(Joiner.on(TITLE_BR).join(step.getNotes()));
String titleStr = title.toString();
// handle cases when attempt fails before allocation or launch
if (allocationTimeInterval > 0) {
addRectStr(creationTimeInterval, allocationTimeInterval - creationTimeInterval, yOffset * STEP_GAP, STEP_GAP, ALLOCATION_OVERHEAD_COLOR, BORDER_COLOR, RECT_OPACITY, titleStr);
if (launchTimeInterval > 0) {
addRectStr(allocationTimeInterval, launchTimeInterval - allocationTimeInterval, yOffset * STEP_GAP, STEP_GAP, LAUNCH_OVERHEAD_COLOR, BORDER_COLOR, RECT_OPACITY, titleStr);
addRectStr(launchTimeInterval, finishTimeInterval - launchTimeInterval, yOffset * STEP_GAP, STEP_GAP, RUNTIME_COLOR, BORDER_COLOR, RECT_OPACITY, titleStr);
} else {
// no launch - so allocate to finish drawn - ended while launching
addRectStr(allocationTimeInterval, finishTimeInterval - allocationTimeInterval, yOffset * STEP_GAP, STEP_GAP, LAUNCH_OVERHEAD_COLOR, BORDER_COLOR, RECT_OPACITY, titleStr);
}
} else {
// no allocation - so create to finish drawn - ended while allocating
addRectStr(creationTimeInterval, finishTimeInterval - creationTimeInterval, yOffset * STEP_GAP, STEP_GAP, ALLOCATION_OVERHEAD_COLOR, BORDER_COLOR, RECT_OPACITY, titleStr);
}
addTextStr((finishTimeInterval + creationTimeInterval) / 2, (yOffset * STEP_GAP + STEP_GAP / 2), attempt.getShortName(), "middle", TEXT_SIZE, titleStr, !attempt.isSucceeded());
}
}
use of org.apache.tez.history.parser.datamodel.TaskAttemptInfo in project tez by apache.
the class LocalityAnalyzer method computeAverages.
/**
* Compute counter averages for specific vertex
*
* @param vertexInfo
* @param counter
* @return task attempt details
*/
private TaskAttemptDetails computeAverages(VertexInfo vertexInfo, DAGCounter counter) {
long totalTime = 0;
long totalTasks = 0;
long totalHDFSBytesRead = 0;
TaskAttemptDetails result = new TaskAttemptDetails();
for (TaskAttemptInfo attemptInfo : vertexInfo.getTaskAttempts()) {
Map<String, TezCounter> localityCounter = attemptInfo.getCounter(DAGCounter.class.getName(), counter.toString());
if (!localityCounter.isEmpty() && localityCounter.get(DAGCounter.class.getName()).getValue() > 0) {
totalTime += attemptInfo.getTimeTaken();
totalTasks++;
// get HDFSBytes read counter
Map<String, TezCounter> hdfsBytesReadCounter = attemptInfo.getCounter(FileSystemCounter.class.getName(), FileSystemCounter.HDFS_BYTES_READ.name());
for (Map.Entry<String, TezCounter> entry : hdfsBytesReadCounter.entrySet()) {
totalHDFSBytesRead += entry.getValue().getValue();
}
}
}
if (totalTasks > 0) {
result.avgRuntime = (totalTime * 1.0f / totalTasks);
result.avgHDFSBytesRead = (totalHDFSBytesRead * 1.0f / totalTasks);
}
return result;
}
use of org.apache.tez.history.parser.datamodel.TaskAttemptInfo in project tez by apache.
the class ShuffleTimeAnalyzer method analyze.
@Override
public void analyze(DagInfo dagInfo) throws TezException {
for (VertexInfo vertexInfo : dagInfo.getVertices()) {
for (TaskAttemptInfo attemptInfo : vertexInfo.getTaskAttempts()) {
// counter_group (basically source) --> counter
Map<String, TezCounter> reduceInputGroups = attemptInfo.getCounter(TaskCounter.REDUCE_INPUT_GROUPS.toString());
Map<String, TezCounter> reduceInputRecords = attemptInfo.getCounter(TaskCounter.REDUCE_INPUT_RECORDS.toString());
if (reduceInputGroups == null) {
continue;
}
for (Map.Entry<String, TezCounter> entry : reduceInputGroups.entrySet()) {
String counterGroupName = entry.getKey();
long reduceInputGroupsVal = entry.getValue().getValue();
long reduceInputRecordsVal = (reduceInputRecords.get(counterGroupName) != null) ? reduceInputRecords.get(counterGroupName).getValue() : 0;
if (reduceInputRecordsVal <= 0) {
continue;
}
float ratio = (reduceInputGroupsVal * 1.0f / reduceInputRecordsVal);
if (ratio > 0 && reduceInputRecordsVal > minShuffleRecords) {
List<String> result = Lists.newLinkedList();
result.add(vertexInfo.getVertexName());
result.add(attemptInfo.getTaskAttemptId());
result.add(attemptInfo.getNodeId());
result.add(counterGroupName);
// Real work done in the task
String comments = "";
String mergePhaseTime = getCounterValue(TaskCounter.MERGE_PHASE_TIME, counterGroupName, attemptInfo);
String timeTakenForRealWork = "";
if (!Strings.isNullOrEmpty(mergePhaseTime)) {
long realWorkDone = attemptInfo.getTimeTaken() - Long.parseLong(mergePhaseTime);
if ((realWorkDone * 1.0f / attemptInfo.getTimeTaken()) < realWorkDoneRatio) {
comments = "Time taken in shuffle is more than the actual work being done in task. " + " Check if source/destination machine is a slow node. Check if merge phase " + "time is more to understand disk bottlenecks in this node. Check for skew";
}
timeTakenForRealWork = Long.toString(realWorkDone);
}
result.add(comments);
result.add(reduceInputGroupsVal + "");
result.add(reduceInputRecordsVal + "");
result.add("" + (1.0f * reduceInputGroupsVal / reduceInputRecordsVal));
result.add(getCounterValue(TaskCounter.SHUFFLE_BYTES, counterGroupName, attemptInfo));
result.add(Long.toString(attemptInfo.getTimeTaken()));
// Total time taken for receiving all events from source tasks
result.add(getOverheadFromSourceTasks(counterGroupName, attemptInfo));
result.add(getCounterValue(TaskCounter.MERGE_PHASE_TIME, counterGroupName, attemptInfo));
result.add(getCounterValue(TaskCounter.SHUFFLE_PHASE_TIME, counterGroupName, attemptInfo));
result.add(timeTakenForRealWork);
result.add(getCounterValue(TaskCounter.FIRST_EVENT_RECEIVED, counterGroupName, attemptInfo));
result.add(getCounterValue(TaskCounter.LAST_EVENT_RECEIVED, counterGroupName, attemptInfo));
result.add(getCounterValue(TaskCounter.SHUFFLE_BYTES_DISK_DIRECT, counterGroupName, attemptInfo));
csvResult.addRecord(result.toArray(new String[result.size()]));
}
}
}
}
}
Aggregations