Search in sources :

Example 6 with TaskAttemptInfo

use of org.apache.tez.history.parser.datamodel.TaskAttemptInfo in project tez by apache.

the class SlowestVertexAnalyzer method analyze.

@Override
public void analyze(DagInfo dagInfo) throws TezException {
    for (VertexInfo vertexInfo : dagInfo.getVertices()) {
        String vertexName = vertexInfo.getVertexName();
        if (vertexInfo.getFirstTaskToStart() == null || vertexInfo.getLastTaskToFinish() == null) {
            continue;
        }
        long totalTime = getTaskRuntime(vertexInfo);
        long slowestLastEventTime = Long.MIN_VALUE;
        String maxSourceName = "";
        taskAttemptRuntimeHistorgram = metrics.histogram(vertexName);
        for (TaskAttemptInfo attemptInfo : vertexInfo.getTaskAttempts()) {
            taskAttemptRuntimeHistorgram.update(attemptInfo.getTimeTaken());
            // Get the last event received from the incoming vertices
            Map<String, TezCounter> lastEventReceivedMap = attemptInfo.getCounter(TaskCounter.LAST_EVENT_RECEIVED.toString());
            for (Map.Entry<String, TezCounter> entry : lastEventReceivedMap.entrySet()) {
                if (entry.getKey().equals(TaskCounter.class.getName())) {
                    // getting TaskCounter details as well.
                    continue;
                }
                // Find the slowest last event received
                if (entry.getValue().getValue() > slowestLastEventTime) {
                    slowestLastEventTime = entry.getValue().getValue();
                    maxSourceName = entry.getKey();
                }
            }
        }
        long shuffleMax = Long.MIN_VALUE;
        String shuffleMaxSource = "";
        for (TaskAttemptInfo attemptInfo : vertexInfo.getTaskAttempts()) {
            // Get the last event received from the incoming vertices
            Map<String, TezCounter> lastEventReceivedMap = attemptInfo.getCounter(TaskCounter.SHUFFLE_PHASE_TIME.toString());
            for (Map.Entry<String, TezCounter> entry : lastEventReceivedMap.entrySet()) {
                if (entry.getKey().equals(TaskCounter.class.getName())) {
                    // ignore. TODO: hack for taskcounter issue
                    continue;
                }
                // Find the slowest last event received
                if (entry.getValue().getValue() > shuffleMax) {
                    shuffleMax = entry.getValue().getValue();
                    shuffleMaxSource = entry.getKey();
                }
            }
        }
        String comments = "";
        List<String> record = Lists.newLinkedList();
        record.add(vertexName);
        record.add(vertexInfo.getTaskAttempts().size() + "");
        record.add(totalTime + "");
        record.add(Math.max(0, shuffleMax) + "");
        record.add(shuffleMaxSource);
        record.add(Math.max(0, slowestLastEventTime) + "");
        record.add(maxSourceName);
        // Finding out real_work done at vertex level might be meaningless (as it is quite posisble
        // that it went to starvation).
        StringBuilder sb = new StringBuilder();
        double percentile75 = taskAttemptRuntimeHistorgram.getSnapshot().get75thPercentile();
        double percentile95 = taskAttemptRuntimeHistorgram.getSnapshot().get95thPercentile();
        double percentile98 = taskAttemptRuntimeHistorgram.getSnapshot().get98thPercentile();
        double percentile99 = taskAttemptRuntimeHistorgram.getSnapshot().get99thPercentile();
        double medianAttemptRuntime = taskAttemptRuntimeHistorgram.getSnapshot().getMedian();
        record.add("75th=" + percentile75);
        record.add("95th=" + percentile95);
        record.add("98th=" + percentile98);
        record.add("median=" + medianAttemptRuntime);
        if (percentile75 / percentile99 < 0.5) {
            // looks like some straggler task is there.
            sb.append("Looks like some straggler task is there");
        }
        record.add(sb.toString());
        if (totalTime > 0 && vertexInfo.getTaskAttempts().size() > 0) {
            if ((shuffleMax * 1.0f / totalTime) > 0.5) {
                if ((slowestLastEventTime * 1.0f / totalTime) > 0.5) {
                    comments = "This vertex is slow due to its dependency on parent. Got a lot delayed last" + " event received";
                } else {
                    comments = "Spending too much time on shuffle. Check shuffle bytes from previous vertex";
                }
            } else {
                if (totalTime > vertexRuntimeThreshold) {
                    // greater than X seconds.
                    comments = "Concentrate on this vertex (totalTime > " + vertexRuntimeThreshold + " seconds)";
                }
            }
        }
        record.add(comments);
        csvResult.addRecord(record.toArray(new String[record.size()]));
    }
}
Also used : TaskAttemptInfo(org.apache.tez.history.parser.datamodel.TaskAttemptInfo) VertexInfo(org.apache.tez.history.parser.datamodel.VertexInfo) TezCounter(org.apache.tez.common.counters.TezCounter) Map(java.util.Map) TaskCounter(org.apache.tez.common.counters.TaskCounter)

Example 7 with TaskAttemptInfo

use of org.apache.tez.history.parser.datamodel.TaskAttemptInfo in project tez by apache.

the class CriticalPathAnalyzer method analyze.

@Override
public void analyze(DagInfo dagInfo) throws TezException {
    // get all attempts in the dag and find the last failed/succeeded attempt.
    // ignore killed attempt to handle kills that happen upon dag completion
    TaskAttemptInfo lastAttempt = null;
    long lastAttemptFinishTime = 0;
    for (VertexInfo vertex : dagInfo.getVertices()) {
        for (TaskInfo task : vertex.getTasks()) {
            for (TaskAttemptInfo attempt : task.getTaskAttempts()) {
                attempts.put(attempt.getTaskAttemptId(), attempt);
                if (attempt.getStatus().equals(succeededState) || attempt.getStatus().equals(failedState)) {
                    if (lastAttemptFinishTime < attempt.getFinishTime()) {
                        lastAttempt = attempt;
                        lastAttemptFinishTime = attempt.getFinishTime();
                    }
                }
            }
        }
    }
    if (lastAttempt == null) {
        LOG.info("Cannot find last attempt to finish in DAG " + dagInfo.getDagId());
        return;
    }
    createCriticalPath(dagInfo, lastAttempt, lastAttemptFinishTime, attempts);
    analyzeCriticalPath(dagInfo);
    if (getConf().getBoolean(DRAW_SVG, true)) {
        saveCriticalPathAsSVG(dagInfo);
    }
}
Also used : TaskInfo(org.apache.tez.history.parser.datamodel.TaskInfo) TaskAttemptInfo(org.apache.tez.history.parser.datamodel.TaskAttemptInfo) VertexInfo(org.apache.tez.history.parser.datamodel.VertexInfo)

Example 8 with TaskAttemptInfo

use of org.apache.tez.history.parser.datamodel.TaskAttemptInfo in project tez by apache.

the class CriticalPathAnalyzer method analyzeStragglers.

private void analyzeStragglers(DagInfo dag) {
    long dagStartTime = dag.getStartTime();
    long dagTime = dag.getFinishTime() - dagStartTime;
    long totalAttemptCriticalTime = 0;
    for (int i = 0; i < criticalPath.size(); ++i) {
        CriticalPathStep step = criticalPath.get(i);
        totalAttemptCriticalTime += (step.stopCriticalPathTime - step.startCriticalPathTime);
        TaskAttemptInfo attempt = step.attempt;
        if (step.getType() == EntityType.ATTEMPT) {
            // analyze execution overhead
            if (attempt.getLastDataEvents().size() > 1) {
                // there were read errors. that could have delayed the attempt. ignore this
                continue;
            }
            long avgPostDataExecutionTime = attempt.getTaskInfo().getVertexInfo().getAvgPostDataExecutionTimeInterval();
            if (avgPostDataExecutionTime <= 0) {
                continue;
            }
            long attemptExecTime = attempt.getPostDataExecutionTimeInterval();
            if (avgPostDataExecutionTime * 1.25 < attemptExecTime) {
                step.notes.add("Potential straggler. Post Data Execution time " + SVGUtils.getTimeStr(attemptExecTime) + " compared to vertex average of " + SVGUtils.getTimeStr(avgPostDataExecutionTime));
            }
        }
    }
    LOG.debug("DAG time taken: " + dagTime + " TotalAttemptTime: " + totalAttemptCriticalTime + " DAG finish time: " + dag.getFinishTime() + " DAG start time: " + dagStartTime);
}
Also used : TaskAttemptInfo(org.apache.tez.history.parser.datamodel.TaskAttemptInfo)

Example 9 with TaskAttemptInfo

use of org.apache.tez.history.parser.datamodel.TaskAttemptInfo in project tez by apache.

the class CriticalPathAnalyzer method createCriticalPath.

private void createCriticalPath(DagInfo dagInfo, TaskAttemptInfo lastAttempt, long lastAttemptFinishTime, Map<String, TaskAttemptInfo> attempts) {
    List<CriticalPathStep> tempCP = Lists.newLinkedList();
    if (lastAttempt != null) {
        TaskAttemptInfo currentAttempt = lastAttempt;
        CriticalPathStep currentStep = new CriticalPathStep(currentAttempt, EntityType.DAG_COMMIT);
        long currentAttemptStopCriticalPathTime = lastAttemptFinishTime;
        // add the commit step
        if (dagInfo.getFinishTime() > 0) {
            currentStep.stopCriticalPathTime = dagInfo.getFinishTime();
        } else {
            // AM crashed and no dag finished written
            currentStep.stopCriticalPathTime = currentAttemptStopCriticalPathTime;
        }
        currentStep.startCriticalPathTime = currentAttemptStopCriticalPathTime;
        currentStep.reason = CriticalPathDependency.COMMIT_DEPENDENCY;
        tempCP.add(currentStep);
        while (true) {
            Preconditions.checkState(currentAttempt != null);
            Preconditions.checkState(currentAttemptStopCriticalPathTime > 0);
            LOG.debug("Step: " + tempCP.size() + " Attempt: " + currentAttempt.getTaskAttemptId());
            currentStep = new CriticalPathStep(currentAttempt, EntityType.ATTEMPT);
            currentStep.stopCriticalPathTime = currentAttemptStopCriticalPathTime;
            // consider the last data event seen immediately preceding the current critical path
            // stop time for this attempt
            long currentStepLastDataEventTime = 0;
            String currentStepLastDataTA = null;
            DataDependencyEvent item = currentAttempt.getLastDataEventInfo(currentStep.stopCriticalPathTime);
            if (item != null) {
                currentStepLastDataEventTime = item.getTimestamp();
                currentStepLastDataTA = item.getTaskAttemptId();
            }
            // sanity check
            for (CriticalPathStep previousStep : tempCP) {
                if (previousStep.type == EntityType.ATTEMPT) {
                    if (previousStep.attempt.getTaskAttemptId().equals(currentAttempt.getTaskAttemptId())) {
                        // found loop.
                        // this should only happen for read errors in currentAttempt
                        List<DataDependencyEvent> dataEvents = currentAttempt.getLastDataEvents();
                        // received
                        Preconditions.checkState(dataEvents.size() > 1);
                        // original and
                        // retry data events
                        Preconditions.checkState(currentStepLastDataEventTime < dataEvents.get(dataEvents.size() - 1).getTimestamp());
                    // earlier than
                    // last
                    }
                }
            }
            tempCP.add(currentStep);
            // find the next attempt on the critical path
            boolean dataDependency = false;
            // find out predecessor dependency
            if (currentStepLastDataEventTime > currentAttempt.getCreationTime()) {
                dataDependency = true;
            }
            long startCriticalPathTime = 0;
            String nextAttemptId = null;
            CriticalPathDependency reason = null;
            if (dataDependency) {
                // last data event was produced after the attempt was scheduled. use
                // data dependency
                // typically the case when scheduling ahead of time
                LOG.debug("Has data dependency");
                if (!Strings.isNullOrEmpty(currentStepLastDataTA)) {
                    // there is a valid data causal TA. Use it.
                    nextAttemptId = currentStepLastDataTA;
                    reason = CriticalPathDependency.DATA_DEPENDENCY;
                    startCriticalPathTime = currentStepLastDataEventTime;
                    LOG.debug("Using data dependency " + nextAttemptId);
                } else {
                    // there is no valid data causal TA. This means data event came from the same vertex
                    VertexInfo vertex = currentAttempt.getTaskInfo().getVertexInfo();
                    Preconditions.checkState(!vertex.getAdditionalInputInfoList().isEmpty(), "Vertex: " + vertex.getVertexId() + " has no external inputs but the last data event " + "TA is null for " + currentAttempt.getTaskAttemptId());
                    nextAttemptId = null;
                    reason = CriticalPathDependency.INIT_DEPENDENCY;
                    LOG.debug("Using init dependency");
                }
            } else {
                // attempt was scheduled after last data event. use scheduling dependency
                // typically happens for retries
                LOG.debug("Has scheduling dependency");
                if (!Strings.isNullOrEmpty(currentAttempt.getCreationCausalTA())) {
                    // there is a scheduling causal TA. Use it.
                    nextAttemptId = currentAttempt.getCreationCausalTA();
                    reason = CriticalPathDependency.RETRY_DEPENDENCY;
                    TaskAttemptInfo nextAttempt = attempts.get(nextAttemptId);
                    if (nextAttemptId != null) {
                        VertexInfo currentVertex = currentAttempt.getTaskInfo().getVertexInfo();
                        VertexInfo nextVertex = nextAttempt.getTaskInfo().getVertexInfo();
                        if (!nextVertex.getVertexName().equals(currentVertex.getVertexName())) {
                            // cause from different vertex. Might be rerun to re-generate outputs
                            for (VertexInfo outVertex : currentVertex.getOutputVertices()) {
                                if (nextVertex.getVertexName().equals(outVertex.getVertexName())) {
                                    // next vertex is an output vertex
                                    reason = CriticalPathDependency.OUTPUT_RECREATE_DEPENDENCY;
                                    break;
                                }
                            }
                        }
                    }
                    if (reason == CriticalPathDependency.OUTPUT_RECREATE_DEPENDENCY) {
                        // rescheduled due to read error. start critical at read error report time.
                        // for now proxy own creation time for read error report time
                        startCriticalPathTime = currentAttempt.getCreationTime();
                    } else {
                        // rescheduled due to own previous attempt failure
                        // we are critical when the previous attempt fails
                        Preconditions.checkState(nextAttempt != null);
                        Preconditions.checkState(nextAttempt.getTaskInfo().getTaskId().equals(currentAttempt.getTaskInfo().getTaskId()));
                        startCriticalPathTime = nextAttempt.getFinishTime();
                    }
                    LOG.debug("Using scheduling dependency " + nextAttemptId);
                } else {
                    // there is no scheduling causal TA.
                    if (!Strings.isNullOrEmpty(currentStepLastDataTA)) {
                        // there is a data event going to the vertex. Count the time between data event and
                        // creation time as Initializer/Manager overhead and follow data dependency
                        nextAttemptId = currentStepLastDataTA;
                        reason = CriticalPathDependency.DATA_DEPENDENCY;
                        startCriticalPathTime = currentStepLastDataEventTime;
                        long overhead = currentAttempt.getCreationTime() - currentStepLastDataEventTime;
                        currentStep.notes.add("Initializer/VertexManager scheduling overhead " + SVGUtils.getTimeStr(overhead));
                        LOG.debug("Using data dependency " + nextAttemptId);
                    } else {
                        // there is no scheduling causal TA and no data event casual TA.
                        // the vertex has external input that sent the last data events
                        // or the vertex has external input but does not use events
                        // or the vertex has no external inputs or edges
                        nextAttemptId = null;
                        reason = CriticalPathDependency.INIT_DEPENDENCY;
                        LOG.debug("Using init dependency");
                    }
                }
            }
            currentStep.startCriticalPathTime = startCriticalPathTime;
            currentStep.reason = reason;
            Preconditions.checkState(currentStep.stopCriticalPathTime >= currentStep.startCriticalPathTime);
            if (Strings.isNullOrEmpty(nextAttemptId)) {
                Preconditions.checkState(reason.equals(CriticalPathDependency.INIT_DEPENDENCY));
                Preconditions.checkState(startCriticalPathTime == 0);
                // no predecessor attempt found. this is the last step in the critical path
                // assume attempts start critical path time is when its scheduled. before that is
                // vertex initialization time
                currentStep.startCriticalPathTime = currentStep.attempt.getCreationTime();
                // add vertex init step
                long initStepStopCriticalTime = currentStep.startCriticalPathTime;
                currentStep = new CriticalPathStep(currentAttempt, EntityType.VERTEX_INIT);
                currentStep.stopCriticalPathTime = initStepStopCriticalTime;
                currentStep.startCriticalPathTime = dagInfo.getStartTime();
                currentStep.reason = CriticalPathDependency.INIT_DEPENDENCY;
                tempCP.add(currentStep);
                if (!tempCP.isEmpty()) {
                    for (int i = tempCP.size() - 1; i >= 0; --i) {
                        criticalPath.add(tempCP.get(i));
                    }
                }
                return;
            }
            currentAttempt = attempts.get(nextAttemptId);
            currentAttemptStopCriticalPathTime = startCriticalPathTime;
        }
    }
}
Also used : DataDependencyEvent(org.apache.tez.history.parser.datamodel.TaskAttemptInfo.DataDependencyEvent) TaskAttemptInfo(org.apache.tez.history.parser.datamodel.TaskAttemptInfo) VertexInfo(org.apache.tez.history.parser.datamodel.VertexInfo)

Example 10 with TaskAttemptInfo

use of org.apache.tez.history.parser.datamodel.TaskAttemptInfo in project tez by apache.

the class SkewAnalyzer method analyzeReducers.

private void analyzeReducers(DagInfo dagInfo) {
    for (VertexInfo vertexInfo : dagInfo.getVertices()) {
        for (TaskAttemptInfo attemptInfo : vertexInfo.getTaskAttempts()) {
            analyzeGroupSkewPerSource(attemptInfo);
            analyzeRecordSkewPerSource(attemptInfo);
            analyzeForParallelism(attemptInfo);
        }
    }
}
Also used : TaskAttemptInfo(org.apache.tez.history.parser.datamodel.TaskAttemptInfo) VertexInfo(org.apache.tez.history.parser.datamodel.VertexInfo)

Aggregations

TaskAttemptInfo (org.apache.tez.history.parser.datamodel.TaskAttemptInfo)23 VertexInfo (org.apache.tez.history.parser.datamodel.VertexInfo)15 TaskInfo (org.apache.tez.history.parser.datamodel.TaskInfo)6 Map (java.util.Map)5 TezCounter (org.apache.tez.common.counters.TezCounter)5 Container (org.apache.tez.history.parser.datamodel.Container)2 DataDependencyEvent (org.apache.tez.history.parser.datamodel.TaskAttemptInfo.DataDependencyEvent)2 HashMap (java.util.HashMap)1 Scanner (java.util.Scanner)1 DAGCounter (org.apache.tez.common.counters.DAGCounter)1 FileSystemCounter (org.apache.tez.common.counters.FileSystemCounter)1 TaskCounter (org.apache.tez.common.counters.TaskCounter)1 TezCounters (org.apache.tez.common.counters.TezCounters)1 TezException (org.apache.tez.dag.api.TezException)1 TezDAGID (org.apache.tez.dag.records.TezDAGID)1 TezTaskAttemptID (org.apache.tez.dag.records.TezTaskAttemptID)1 TezTaskID (org.apache.tez.dag.records.TezTaskID)1 TezVertexID (org.apache.tez.dag.records.TezVertexID)1 WordCount (org.apache.tez.examples.WordCount)1 DagInfo (org.apache.tez.history.parser.datamodel.DagInfo)1