Search in sources :

Example 16 with TaskAttemptInfo

use of org.apache.tez.history.parser.datamodel.TaskAttemptInfo in project tez by apache.

the class CriticalPathAnalyzer method analyzeAllocationOverhead.

private void analyzeAllocationOverhead(DagInfo dag) {
    List<TaskAttemptInfo> preemptedAttempts = Lists.newArrayList();
    for (VertexInfo v : dag.getVertices()) {
        for (TaskInfo t : v.getTasks()) {
            for (TaskAttemptInfo a : t.getTaskAttempts()) {
                if (a.getTerminationCause().equals(TaskAttemptTerminationCause.INTERNAL_PREEMPTION.name())) {
                    LOG.debug("Found preempted attempt " + a.getTaskAttemptId());
                    preemptedAttempts.add(a);
                }
            }
        }
    }
    for (int i = 0; i < criticalPath.size(); ++i) {
        CriticalPathStep step = criticalPath.get(i);
        TaskAttemptInfo attempt = step.attempt;
        if (step.getType() != EntityType.ATTEMPT) {
            continue;
        }
        long creationTime = attempt.getCreationTime();
        long allocationTime = attempt.getAllocationTime();
        long finishTime = attempt.getFinishTime();
        if (allocationTime < step.startCriticalPathTime) {
            // allocated before it became critical
            continue;
        }
        // the attempt is critical before allocation. So allocation overhead needs analysis
        Container container = attempt.getContainer();
        if (container != null) {
            Collection<TaskAttemptInfo> attempts = dag.getContainerMapping().get(container);
            if (attempts != null && !attempts.isEmpty()) {
                // arrange attempts by allocation time
                List<TaskAttemptInfo> attemptsList = Lists.newArrayList(attempts);
                Collections.sort(attemptsList, TaskAttemptInfo.orderingOnAllocationTime());
                // walk the list to record allocation time before the current attempt
                long containerPreviousAllocatedTime = 0;
                int reUsesForVertex = 1;
                for (TaskAttemptInfo containerAttempt : attemptsList) {
                    if (containerAttempt.getTaskAttemptId().equals(attempt.getTaskAttemptId())) {
                        break;
                    }
                    if (containerAttempt.getTaskInfo().getVertexInfo().getVertexId().equals(attempt.getTaskInfo().getVertexInfo().getVertexId())) {
                        // another task from the same vertex ran in this container. So there are multiple
                        // waves for this vertex on this container.
                        reUsesForVertex++;
                    }
                    long cAllocTime = containerAttempt.getAllocationTime();
                    long cFinishTime = containerAttempt.getFinishTime();
                    if (cFinishTime > creationTime) {
                        // for containerAttempts that used the container while this attempt was waiting
                        // add up time container was allocated to containerAttempt. Account for allocations
                        // that started before this attempt was created.
                        containerPreviousAllocatedTime += (cFinishTime - (cAllocTime > creationTime ? cAllocTime : creationTime));
                    }
                }
                int numVertexTasks = attempt.getTaskInfo().getVertexInfo().getNumTasks();
                int intervalMaxConcurrency = getIntervalMaxConcurrency(creationTime, finishTime);
                double numWaves = getWaves(numVertexTasks, intervalMaxConcurrency);
                if (reUsesForVertex > 1) {
                    step.notes.add("Container ran multiple tasks for this vertex. ");
                    if (numWaves < 1) {
                        // less than 1 wave total but still ran more than 1 on this container
                        step.notes.add("Vertex potentially seeing contention from other branches in the DAG. ");
                    }
                }
                if (containerPreviousAllocatedTime == 0) {
                    step.notes.add("Container newly allocated.");
                } else {
                    if (containerPreviousAllocatedTime >= attempt.getCreationToAllocationTimeInterval()) {
                        step.notes.add("Container was fully allocated");
                    } else {
                        step.notes.add("Container in use for " + SVGUtils.getTimeStr(containerPreviousAllocatedTime) + " out of " + SVGUtils.getTimeStr(attempt.getCreationToAllocationTimeInterval()) + " of allocation wait time");
                    }
                }
            }
            // look for internal preemptions while attempt was waiting for allocation
            for (TaskAttemptInfo a : preemptedAttempts) {
                if (a.getTaskInfo().getVertexInfo().getVertexId().equals(attempt.getTaskInfo().getVertexInfo().getVertexId())) {
                    // dont preempt same vertex task. ideally this should look at priority but we dont have it
                    continue;
                }
                if (a.getFinishTime() > creationTime && a.getFinishTime() < allocationTime) {
                    // found an attempt that was preempted within this time interval
                    step.notes.add("Potentially waited for preemption of " + a.getShortName());
                }
            }
        }
    }
}
Also used : TaskInfo(org.apache.tez.history.parser.datamodel.TaskInfo) Container(org.apache.tez.history.parser.datamodel.Container) TaskAttemptInfo(org.apache.tez.history.parser.datamodel.TaskAttemptInfo) VertexInfo(org.apache.tez.history.parser.datamodel.VertexInfo)

Example 17 with TaskAttemptInfo

use of org.apache.tez.history.parser.datamodel.TaskAttemptInfo in project tez by apache.

the class SpillAnalyzerImpl method analyze.

@Override
public void analyze(DagInfo dagInfo) throws TezException {
    for (VertexInfo vertexInfo : dagInfo.getVertices()) {
        String vertexName = vertexInfo.getVertexName();
        for (TaskAttemptInfo attemptInfo : vertexInfo.getTaskAttempts()) {
            // Get ADDITIONAL_SPILL_COUNT, OUTPUT_BYTES for every source
            Map<String, TezCounter> spillCountMap = attemptInfo.getCounter(TaskCounter.ADDITIONAL_SPILL_COUNT.name());
            Map<String, TezCounter> spilledRecordsMap = attemptInfo.getCounter(TaskCounter.SPILLED_RECORDS.name());
            Map<String, TezCounter> outputRecordsMap = attemptInfo.getCounter(TaskCounter.OUTPUT_RECORDS.name());
            Map<String, TezCounter> outputBytesMap = attemptInfo.getCounter(TaskCounter.OUTPUT_BYTES.name());
            for (Map.Entry<String, TezCounter> entry : spillCountMap.entrySet()) {
                String source = entry.getKey();
                long spillCount = entry.getValue().getValue();
                long outBytes = outputBytesMap.get(source).getValue();
                long outputRecords = outputRecordsMap.get(source).getValue();
                long spilledRecords = spilledRecordsMap.get(source).getValue();
                if (spillCount > 1 && outBytes > minOutputBytesPerTask) {
                    List<String> recorList = Lists.newLinkedList();
                    recorList.add(vertexName);
                    recorList.add(attemptInfo.getTaskAttemptId());
                    recorList.add(attemptInfo.getNodeId());
                    recorList.add(source);
                    recorList.add(spillCount + "");
                    recorList.add(attemptInfo.getTimeTaken() + "");
                    recorList.add(outBytes + "");
                    recorList.add(outputRecords + "");
                    recorList.add(spilledRecords + "");
                    recorList.add("Consider increasing " + TezRuntimeConfiguration.TEZ_RUNTIME_IO_SORT_MB + ". Try increasing container size.");
                    csvResult.addRecord(recorList.toArray(new String[recorList.size()]));
                }
            }
        }
    }
}
Also used : TaskAttemptInfo(org.apache.tez.history.parser.datamodel.TaskAttemptInfo) VertexInfo(org.apache.tez.history.parser.datamodel.VertexInfo) TezCounter(org.apache.tez.common.counters.TezCounter) Map(java.util.Map)

Example 18 with TaskAttemptInfo

use of org.apache.tez.history.parser.datamodel.TaskAttemptInfo in project tez by apache.

the class SVGUtils method drawStep.

public void drawStep(CriticalPathStep step, long dagStartTime, int yOffset) {
    if (step.getType() != EntityType.ATTEMPT) {
        // draw initial vertex or final commit overhead
        StringBuilder title = new StringBuilder();
        String text = null;
        if (step.getType() == EntityType.VERTEX_INIT) {
            String vertex = step.getAttempt().getTaskInfo().getVertexInfo().getVertexName();
            text = vertex + " : Init";
            title.append(text).append(TITLE_BR);
        } else {
            text = "Output Commit";
            title.append(text).append(TITLE_BR);
        }
        title.append("Critical Path Dependency: " + step.getReason()).append(TITLE_BR);
        title.append("Critical Time: " + getTimeStr(step.getStopCriticalTime() - step.getStartCriticalTime())).append("");
        title.append(Joiner.on(TITLE_BR).join(step.getNotes()));
        String titleStr = title.toString();
        int stopTimeInterval = (int) (step.getStopCriticalTime() - dagStartTime);
        int startTimeInterval = (int) (step.getStartCriticalTime() - dagStartTime);
        addRectStr(startTimeInterval, (stopTimeInterval - startTimeInterval), yOffset * STEP_GAP, STEP_GAP, VERTEX_INIT_COMMIT_COLOR, BORDER_COLOR, RECT_OPACITY, titleStr);
        addTextStr((stopTimeInterval + startTimeInterval) / 2, (yOffset * STEP_GAP + STEP_GAP / 2), text, "middle", TEXT_SIZE, titleStr, false);
    } else {
        TaskAttemptInfo attempt = step.getAttempt();
        int startCriticalTimeInterval = (int) (step.getStartCriticalTime() - dagStartTime);
        int stopCriticalTimeInterval = (int) (step.getStopCriticalTime() - dagStartTime);
        int creationTimeInterval = (int) (attempt.getCreationTime() - dagStartTime);
        int allocationTimeInterval = attempt.getAllocationTime() > 0 ? (int) (attempt.getAllocationTime() - dagStartTime) : 0;
        int launchTimeInterval = attempt.getStartTime() > 0 ? (int) (attempt.getStartTime() - dagStartTime) : 0;
        int finishTimeInterval = (int) (attempt.getFinishTime() - dagStartTime);
        LOG.debug(attempt.getTaskAttemptId() + " " + creationTimeInterval + " " + allocationTimeInterval + " " + launchTimeInterval + " " + finishTimeInterval);
        StringBuilder title = new StringBuilder();
        title.append("Attempt: " + attempt.getTaskAttemptId()).append(TITLE_BR);
        title.append("Critical Path Dependency: " + step.getReason()).append(TITLE_BR);
        title.append("Completion Status: " + attempt.getDetailedStatus()).append(TITLE_BR);
        title.append("Critical Time Contribution: " + getTimeStr(step.getStopCriticalTime() - step.getStartCriticalTime())).append(TITLE_BR);
        title.append("Critical start at: " + getTimeStr(startCriticalTimeInterval)).append(TITLE_BR);
        title.append("Critical stop at: " + getTimeStr(stopCriticalTimeInterval)).append(TITLE_BR);
        title.append("Created at: " + getTimeStr(creationTimeInterval)).append(TITLE_BR);
        if (allocationTimeInterval > 0) {
            title.append("Allocated at: " + getTimeStr(allocationTimeInterval)).append(TITLE_BR);
        }
        if (launchTimeInterval > 0) {
            title.append("Launched at: " + getTimeStr(launchTimeInterval)).append(TITLE_BR);
        }
        title.append("Finished at: " + getTimeStr(finishTimeInterval)).append(TITLE_BR);
        title.append(Joiner.on(TITLE_BR).join(step.getNotes()));
        String titleStr = title.toString();
        // handle cases when attempt fails before allocation or launch
        if (allocationTimeInterval > 0) {
            addRectStr(creationTimeInterval, allocationTimeInterval - creationTimeInterval, yOffset * STEP_GAP, STEP_GAP, ALLOCATION_OVERHEAD_COLOR, BORDER_COLOR, RECT_OPACITY, titleStr);
            if (launchTimeInterval > 0) {
                addRectStr(allocationTimeInterval, launchTimeInterval - allocationTimeInterval, yOffset * STEP_GAP, STEP_GAP, LAUNCH_OVERHEAD_COLOR, BORDER_COLOR, RECT_OPACITY, titleStr);
                addRectStr(launchTimeInterval, finishTimeInterval - launchTimeInterval, yOffset * STEP_GAP, STEP_GAP, RUNTIME_COLOR, BORDER_COLOR, RECT_OPACITY, titleStr);
            } else {
                // no launch - so allocate to finish drawn - ended while launching
                addRectStr(allocationTimeInterval, finishTimeInterval - allocationTimeInterval, yOffset * STEP_GAP, STEP_GAP, LAUNCH_OVERHEAD_COLOR, BORDER_COLOR, RECT_OPACITY, titleStr);
            }
        } else {
            // no allocation - so create to finish drawn - ended while allocating
            addRectStr(creationTimeInterval, finishTimeInterval - creationTimeInterval, yOffset * STEP_GAP, STEP_GAP, ALLOCATION_OVERHEAD_COLOR, BORDER_COLOR, RECT_OPACITY, titleStr);
        }
        addTextStr((finishTimeInterval + creationTimeInterval) / 2, (yOffset * STEP_GAP + STEP_GAP / 2), attempt.getShortName(), "middle", TEXT_SIZE, titleStr, !attempt.isSucceeded());
    }
}
Also used : TaskAttemptInfo(org.apache.tez.history.parser.datamodel.TaskAttemptInfo)

Example 19 with TaskAttemptInfo

use of org.apache.tez.history.parser.datamodel.TaskAttemptInfo in project tez by apache.

the class LocalityAnalyzer method computeAverages.

/**
 * Compute counter averages for specific vertex
 *
 * @param vertexInfo
 * @param counter
 * @return task attempt details
 */
private TaskAttemptDetails computeAverages(VertexInfo vertexInfo, DAGCounter counter) {
    long totalTime = 0;
    long totalTasks = 0;
    long totalHDFSBytesRead = 0;
    TaskAttemptDetails result = new TaskAttemptDetails();
    for (TaskAttemptInfo attemptInfo : vertexInfo.getTaskAttempts()) {
        Map<String, TezCounter> localityCounter = attemptInfo.getCounter(DAGCounter.class.getName(), counter.toString());
        if (!localityCounter.isEmpty() && localityCounter.get(DAGCounter.class.getName()).getValue() > 0) {
            totalTime += attemptInfo.getTimeTaken();
            totalTasks++;
            // get HDFSBytes read counter
            Map<String, TezCounter> hdfsBytesReadCounter = attemptInfo.getCounter(FileSystemCounter.class.getName(), FileSystemCounter.HDFS_BYTES_READ.name());
            for (Map.Entry<String, TezCounter> entry : hdfsBytesReadCounter.entrySet()) {
                totalHDFSBytesRead += entry.getValue().getValue();
            }
        }
    }
    if (totalTasks > 0) {
        result.avgRuntime = (totalTime * 1.0f / totalTasks);
        result.avgHDFSBytesRead = (totalHDFSBytesRead * 1.0f / totalTasks);
    }
    return result;
}
Also used : TaskAttemptInfo(org.apache.tez.history.parser.datamodel.TaskAttemptInfo) DAGCounter(org.apache.tez.common.counters.DAGCounter) TezCounter(org.apache.tez.common.counters.TezCounter) FileSystemCounter(org.apache.tez.common.counters.FileSystemCounter) Map(java.util.Map)

Example 20 with TaskAttemptInfo

use of org.apache.tez.history.parser.datamodel.TaskAttemptInfo in project tez by apache.

the class ShuffleTimeAnalyzer method analyze.

@Override
public void analyze(DagInfo dagInfo) throws TezException {
    for (VertexInfo vertexInfo : dagInfo.getVertices()) {
        for (TaskAttemptInfo attemptInfo : vertexInfo.getTaskAttempts()) {
            // counter_group (basically source) --> counter
            Map<String, TezCounter> reduceInputGroups = attemptInfo.getCounter(TaskCounter.REDUCE_INPUT_GROUPS.toString());
            Map<String, TezCounter> reduceInputRecords = attemptInfo.getCounter(TaskCounter.REDUCE_INPUT_RECORDS.toString());
            if (reduceInputGroups == null) {
                continue;
            }
            for (Map.Entry<String, TezCounter> entry : reduceInputGroups.entrySet()) {
                String counterGroupName = entry.getKey();
                long reduceInputGroupsVal = entry.getValue().getValue();
                long reduceInputRecordsVal = (reduceInputRecords.get(counterGroupName) != null) ? reduceInputRecords.get(counterGroupName).getValue() : 0;
                if (reduceInputRecordsVal <= 0) {
                    continue;
                }
                float ratio = (reduceInputGroupsVal * 1.0f / reduceInputRecordsVal);
                if (ratio > 0 && reduceInputRecordsVal > minShuffleRecords) {
                    List<String> result = Lists.newLinkedList();
                    result.add(vertexInfo.getVertexName());
                    result.add(attemptInfo.getTaskAttemptId());
                    result.add(attemptInfo.getNodeId());
                    result.add(counterGroupName);
                    // Real work done in the task
                    String comments = "";
                    String mergePhaseTime = getCounterValue(TaskCounter.MERGE_PHASE_TIME, counterGroupName, attemptInfo);
                    String timeTakenForRealWork = "";
                    if (!Strings.isNullOrEmpty(mergePhaseTime)) {
                        long realWorkDone = attemptInfo.getTimeTaken() - Long.parseLong(mergePhaseTime);
                        if ((realWorkDone * 1.0f / attemptInfo.getTimeTaken()) < realWorkDoneRatio) {
                            comments = "Time taken in shuffle is more than the actual work being done in task. " + " Check if source/destination machine is a slow node. Check if merge phase " + "time is more to understand disk bottlenecks in this node.  Check for skew";
                        }
                        timeTakenForRealWork = Long.toString(realWorkDone);
                    }
                    result.add(comments);
                    result.add(reduceInputGroupsVal + "");
                    result.add(reduceInputRecordsVal + "");
                    result.add("" + (1.0f * reduceInputGroupsVal / reduceInputRecordsVal));
                    result.add(getCounterValue(TaskCounter.SHUFFLE_BYTES, counterGroupName, attemptInfo));
                    result.add(Long.toString(attemptInfo.getTimeTaken()));
                    // Total time taken for receiving all events from source tasks
                    result.add(getOverheadFromSourceTasks(counterGroupName, attemptInfo));
                    result.add(getCounterValue(TaskCounter.MERGE_PHASE_TIME, counterGroupName, attemptInfo));
                    result.add(getCounterValue(TaskCounter.SHUFFLE_PHASE_TIME, counterGroupName, attemptInfo));
                    result.add(timeTakenForRealWork);
                    result.add(getCounterValue(TaskCounter.FIRST_EVENT_RECEIVED, counterGroupName, attemptInfo));
                    result.add(getCounterValue(TaskCounter.LAST_EVENT_RECEIVED, counterGroupName, attemptInfo));
                    result.add(getCounterValue(TaskCounter.SHUFFLE_BYTES_DISK_DIRECT, counterGroupName, attemptInfo));
                    csvResult.addRecord(result.toArray(new String[result.size()]));
                }
            }
        }
    }
}
Also used : TaskAttemptInfo(org.apache.tez.history.parser.datamodel.TaskAttemptInfo) VertexInfo(org.apache.tez.history.parser.datamodel.VertexInfo) TezCounter(org.apache.tez.common.counters.TezCounter) Map(java.util.Map)

Aggregations

TaskAttemptInfo (org.apache.tez.history.parser.datamodel.TaskAttemptInfo)23 VertexInfo (org.apache.tez.history.parser.datamodel.VertexInfo)15 TaskInfo (org.apache.tez.history.parser.datamodel.TaskInfo)6 Map (java.util.Map)5 TezCounter (org.apache.tez.common.counters.TezCounter)5 Container (org.apache.tez.history.parser.datamodel.Container)2 DataDependencyEvent (org.apache.tez.history.parser.datamodel.TaskAttemptInfo.DataDependencyEvent)2 HashMap (java.util.HashMap)1 Scanner (java.util.Scanner)1 DAGCounter (org.apache.tez.common.counters.DAGCounter)1 FileSystemCounter (org.apache.tez.common.counters.FileSystemCounter)1 TaskCounter (org.apache.tez.common.counters.TaskCounter)1 TezCounters (org.apache.tez.common.counters.TezCounters)1 TezException (org.apache.tez.dag.api.TezException)1 TezDAGID (org.apache.tez.dag.records.TezDAGID)1 TezTaskAttemptID (org.apache.tez.dag.records.TezTaskAttemptID)1 TezTaskID (org.apache.tez.dag.records.TezTaskID)1 TezVertexID (org.apache.tez.dag.records.TezVertexID)1 WordCount (org.apache.tez.examples.WordCount)1 DagInfo (org.apache.tez.history.parser.datamodel.DagInfo)1