use of org.apache.tez.history.parser.datamodel.TaskAttemptInfo in project tez by apache.
the class SlowestVertexAnalyzer method analyze.
@Override
public void analyze(DagInfo dagInfo) throws TezException {
for (VertexInfo vertexInfo : dagInfo.getVertices()) {
String vertexName = vertexInfo.getVertexName();
if (vertexInfo.getFirstTaskToStart() == null || vertexInfo.getLastTaskToFinish() == null) {
continue;
}
long totalTime = getTaskRuntime(vertexInfo);
long slowestLastEventTime = Long.MIN_VALUE;
String maxSourceName = "";
taskAttemptRuntimeHistorgram = metrics.histogram(vertexName);
for (TaskAttemptInfo attemptInfo : vertexInfo.getTaskAttempts()) {
taskAttemptRuntimeHistorgram.update(attemptInfo.getTimeTaken());
// Get the last event received from the incoming vertices
Map<String, TezCounter> lastEventReceivedMap = attemptInfo.getCounter(TaskCounter.LAST_EVENT_RECEIVED.toString());
for (Map.Entry<String, TezCounter> entry : lastEventReceivedMap.entrySet()) {
if (entry.getKey().equals(TaskCounter.class.getName())) {
// getting TaskCounter details as well.
continue;
}
// Find the slowest last event received
if (entry.getValue().getValue() > slowestLastEventTime) {
slowestLastEventTime = entry.getValue().getValue();
maxSourceName = entry.getKey();
}
}
}
long shuffleMax = Long.MIN_VALUE;
String shuffleMaxSource = "";
for (TaskAttemptInfo attemptInfo : vertexInfo.getTaskAttempts()) {
// Get the last event received from the incoming vertices
Map<String, TezCounter> lastEventReceivedMap = attemptInfo.getCounter(TaskCounter.SHUFFLE_PHASE_TIME.toString());
for (Map.Entry<String, TezCounter> entry : lastEventReceivedMap.entrySet()) {
if (entry.getKey().equals(TaskCounter.class.getName())) {
// ignore. TODO: hack for taskcounter issue
continue;
}
// Find the slowest last event received
if (entry.getValue().getValue() > shuffleMax) {
shuffleMax = entry.getValue().getValue();
shuffleMaxSource = entry.getKey();
}
}
}
String comments = "";
List<String> record = Lists.newLinkedList();
record.add(vertexName);
record.add(vertexInfo.getTaskAttempts().size() + "");
record.add(totalTime + "");
record.add(Math.max(0, shuffleMax) + "");
record.add(shuffleMaxSource);
record.add(Math.max(0, slowestLastEventTime) + "");
record.add(maxSourceName);
// Finding out real_work done at vertex level might be meaningless (as it is quite posisble
// that it went to starvation).
StringBuilder sb = new StringBuilder();
double percentile75 = taskAttemptRuntimeHistorgram.getSnapshot().get75thPercentile();
double percentile95 = taskAttemptRuntimeHistorgram.getSnapshot().get95thPercentile();
double percentile98 = taskAttemptRuntimeHistorgram.getSnapshot().get98thPercentile();
double percentile99 = taskAttemptRuntimeHistorgram.getSnapshot().get99thPercentile();
double medianAttemptRuntime = taskAttemptRuntimeHistorgram.getSnapshot().getMedian();
record.add("75th=" + percentile75);
record.add("95th=" + percentile95);
record.add("98th=" + percentile98);
record.add("median=" + medianAttemptRuntime);
if (percentile75 / percentile99 < 0.5) {
// looks like some straggler task is there.
sb.append("Looks like some straggler task is there");
}
record.add(sb.toString());
if (totalTime > 0 && vertexInfo.getTaskAttempts().size() > 0) {
if ((shuffleMax * 1.0f / totalTime) > 0.5) {
if ((slowestLastEventTime * 1.0f / totalTime) > 0.5) {
comments = "This vertex is slow due to its dependency on parent. Got a lot delayed last" + " event received";
} else {
comments = "Spending too much time on shuffle. Check shuffle bytes from previous vertex";
}
} else {
if (totalTime > vertexRuntimeThreshold) {
// greater than X seconds.
comments = "Concentrate on this vertex (totalTime > " + vertexRuntimeThreshold + " seconds)";
}
}
}
record.add(comments);
csvResult.addRecord(record.toArray(new String[record.size()]));
}
}
use of org.apache.tez.history.parser.datamodel.TaskAttemptInfo in project tez by apache.
the class CriticalPathAnalyzer method analyze.
@Override
public void analyze(DagInfo dagInfo) throws TezException {
// get all attempts in the dag and find the last failed/succeeded attempt.
// ignore killed attempt to handle kills that happen upon dag completion
TaskAttemptInfo lastAttempt = null;
long lastAttemptFinishTime = 0;
for (VertexInfo vertex : dagInfo.getVertices()) {
for (TaskInfo task : vertex.getTasks()) {
for (TaskAttemptInfo attempt : task.getTaskAttempts()) {
attempts.put(attempt.getTaskAttemptId(), attempt);
if (attempt.getStatus().equals(succeededState) || attempt.getStatus().equals(failedState)) {
if (lastAttemptFinishTime < attempt.getFinishTime()) {
lastAttempt = attempt;
lastAttemptFinishTime = attempt.getFinishTime();
}
}
}
}
}
if (lastAttempt == null) {
LOG.info("Cannot find last attempt to finish in DAG " + dagInfo.getDagId());
return;
}
createCriticalPath(dagInfo, lastAttempt, lastAttemptFinishTime, attempts);
analyzeCriticalPath(dagInfo);
if (getConf().getBoolean(DRAW_SVG, true)) {
saveCriticalPathAsSVG(dagInfo);
}
}
use of org.apache.tez.history.parser.datamodel.TaskAttemptInfo in project tez by apache.
the class CriticalPathAnalyzer method analyzeStragglers.
private void analyzeStragglers(DagInfo dag) {
long dagStartTime = dag.getStartTime();
long dagTime = dag.getFinishTime() - dagStartTime;
long totalAttemptCriticalTime = 0;
for (int i = 0; i < criticalPath.size(); ++i) {
CriticalPathStep step = criticalPath.get(i);
totalAttemptCriticalTime += (step.stopCriticalPathTime - step.startCriticalPathTime);
TaskAttemptInfo attempt = step.attempt;
if (step.getType() == EntityType.ATTEMPT) {
// analyze execution overhead
if (attempt.getLastDataEvents().size() > 1) {
// there were read errors. that could have delayed the attempt. ignore this
continue;
}
long avgPostDataExecutionTime = attempt.getTaskInfo().getVertexInfo().getAvgPostDataExecutionTimeInterval();
if (avgPostDataExecutionTime <= 0) {
continue;
}
long attemptExecTime = attempt.getPostDataExecutionTimeInterval();
if (avgPostDataExecutionTime * 1.25 < attemptExecTime) {
step.notes.add("Potential straggler. Post Data Execution time " + SVGUtils.getTimeStr(attemptExecTime) + " compared to vertex average of " + SVGUtils.getTimeStr(avgPostDataExecutionTime));
}
}
}
LOG.debug("DAG time taken: " + dagTime + " TotalAttemptTime: " + totalAttemptCriticalTime + " DAG finish time: " + dag.getFinishTime() + " DAG start time: " + dagStartTime);
}
use of org.apache.tez.history.parser.datamodel.TaskAttemptInfo in project tez by apache.
the class CriticalPathAnalyzer method createCriticalPath.
private void createCriticalPath(DagInfo dagInfo, TaskAttemptInfo lastAttempt, long lastAttemptFinishTime, Map<String, TaskAttemptInfo> attempts) {
List<CriticalPathStep> tempCP = Lists.newLinkedList();
if (lastAttempt != null) {
TaskAttemptInfo currentAttempt = lastAttempt;
CriticalPathStep currentStep = new CriticalPathStep(currentAttempt, EntityType.DAG_COMMIT);
long currentAttemptStopCriticalPathTime = lastAttemptFinishTime;
// add the commit step
if (dagInfo.getFinishTime() > 0) {
currentStep.stopCriticalPathTime = dagInfo.getFinishTime();
} else {
// AM crashed and no dag finished written
currentStep.stopCriticalPathTime = currentAttemptStopCriticalPathTime;
}
currentStep.startCriticalPathTime = currentAttemptStopCriticalPathTime;
currentStep.reason = CriticalPathDependency.COMMIT_DEPENDENCY;
tempCP.add(currentStep);
while (true) {
Preconditions.checkState(currentAttempt != null);
Preconditions.checkState(currentAttemptStopCriticalPathTime > 0);
LOG.debug("Step: " + tempCP.size() + " Attempt: " + currentAttempt.getTaskAttemptId());
currentStep = new CriticalPathStep(currentAttempt, EntityType.ATTEMPT);
currentStep.stopCriticalPathTime = currentAttemptStopCriticalPathTime;
// consider the last data event seen immediately preceding the current critical path
// stop time for this attempt
long currentStepLastDataEventTime = 0;
String currentStepLastDataTA = null;
DataDependencyEvent item = currentAttempt.getLastDataEventInfo(currentStep.stopCriticalPathTime);
if (item != null) {
currentStepLastDataEventTime = item.getTimestamp();
currentStepLastDataTA = item.getTaskAttemptId();
}
// sanity check
for (CriticalPathStep previousStep : tempCP) {
if (previousStep.type == EntityType.ATTEMPT) {
if (previousStep.attempt.getTaskAttemptId().equals(currentAttempt.getTaskAttemptId())) {
// found loop.
// this should only happen for read errors in currentAttempt
List<DataDependencyEvent> dataEvents = currentAttempt.getLastDataEvents();
// received
Preconditions.checkState(dataEvents.size() > 1);
// original and
// retry data events
Preconditions.checkState(currentStepLastDataEventTime < dataEvents.get(dataEvents.size() - 1).getTimestamp());
// earlier than
// last
}
}
}
tempCP.add(currentStep);
// find the next attempt on the critical path
boolean dataDependency = false;
// find out predecessor dependency
if (currentStepLastDataEventTime > currentAttempt.getCreationTime()) {
dataDependency = true;
}
long startCriticalPathTime = 0;
String nextAttemptId = null;
CriticalPathDependency reason = null;
if (dataDependency) {
// last data event was produced after the attempt was scheduled. use
// data dependency
// typically the case when scheduling ahead of time
LOG.debug("Has data dependency");
if (!Strings.isNullOrEmpty(currentStepLastDataTA)) {
// there is a valid data causal TA. Use it.
nextAttemptId = currentStepLastDataTA;
reason = CriticalPathDependency.DATA_DEPENDENCY;
startCriticalPathTime = currentStepLastDataEventTime;
LOG.debug("Using data dependency " + nextAttemptId);
} else {
// there is no valid data causal TA. This means data event came from the same vertex
VertexInfo vertex = currentAttempt.getTaskInfo().getVertexInfo();
Preconditions.checkState(!vertex.getAdditionalInputInfoList().isEmpty(), "Vertex: " + vertex.getVertexId() + " has no external inputs but the last data event " + "TA is null for " + currentAttempt.getTaskAttemptId());
nextAttemptId = null;
reason = CriticalPathDependency.INIT_DEPENDENCY;
LOG.debug("Using init dependency");
}
} else {
// attempt was scheduled after last data event. use scheduling dependency
// typically happens for retries
LOG.debug("Has scheduling dependency");
if (!Strings.isNullOrEmpty(currentAttempt.getCreationCausalTA())) {
// there is a scheduling causal TA. Use it.
nextAttemptId = currentAttempt.getCreationCausalTA();
reason = CriticalPathDependency.RETRY_DEPENDENCY;
TaskAttemptInfo nextAttempt = attempts.get(nextAttemptId);
if (nextAttemptId != null) {
VertexInfo currentVertex = currentAttempt.getTaskInfo().getVertexInfo();
VertexInfo nextVertex = nextAttempt.getTaskInfo().getVertexInfo();
if (!nextVertex.getVertexName().equals(currentVertex.getVertexName())) {
// cause from different vertex. Might be rerun to re-generate outputs
for (VertexInfo outVertex : currentVertex.getOutputVertices()) {
if (nextVertex.getVertexName().equals(outVertex.getVertexName())) {
// next vertex is an output vertex
reason = CriticalPathDependency.OUTPUT_RECREATE_DEPENDENCY;
break;
}
}
}
}
if (reason == CriticalPathDependency.OUTPUT_RECREATE_DEPENDENCY) {
// rescheduled due to read error. start critical at read error report time.
// for now proxy own creation time for read error report time
startCriticalPathTime = currentAttempt.getCreationTime();
} else {
// rescheduled due to own previous attempt failure
// we are critical when the previous attempt fails
Preconditions.checkState(nextAttempt != null);
Preconditions.checkState(nextAttempt.getTaskInfo().getTaskId().equals(currentAttempt.getTaskInfo().getTaskId()));
startCriticalPathTime = nextAttempt.getFinishTime();
}
LOG.debug("Using scheduling dependency " + nextAttemptId);
} else {
// there is no scheduling causal TA.
if (!Strings.isNullOrEmpty(currentStepLastDataTA)) {
// there is a data event going to the vertex. Count the time between data event and
// creation time as Initializer/Manager overhead and follow data dependency
nextAttemptId = currentStepLastDataTA;
reason = CriticalPathDependency.DATA_DEPENDENCY;
startCriticalPathTime = currentStepLastDataEventTime;
long overhead = currentAttempt.getCreationTime() - currentStepLastDataEventTime;
currentStep.notes.add("Initializer/VertexManager scheduling overhead " + SVGUtils.getTimeStr(overhead));
LOG.debug("Using data dependency " + nextAttemptId);
} else {
// there is no scheduling causal TA and no data event casual TA.
// the vertex has external input that sent the last data events
// or the vertex has external input but does not use events
// or the vertex has no external inputs or edges
nextAttemptId = null;
reason = CriticalPathDependency.INIT_DEPENDENCY;
LOG.debug("Using init dependency");
}
}
}
currentStep.startCriticalPathTime = startCriticalPathTime;
currentStep.reason = reason;
Preconditions.checkState(currentStep.stopCriticalPathTime >= currentStep.startCriticalPathTime);
if (Strings.isNullOrEmpty(nextAttemptId)) {
Preconditions.checkState(reason.equals(CriticalPathDependency.INIT_DEPENDENCY));
Preconditions.checkState(startCriticalPathTime == 0);
// no predecessor attempt found. this is the last step in the critical path
// assume attempts start critical path time is when its scheduled. before that is
// vertex initialization time
currentStep.startCriticalPathTime = currentStep.attempt.getCreationTime();
// add vertex init step
long initStepStopCriticalTime = currentStep.startCriticalPathTime;
currentStep = new CriticalPathStep(currentAttempt, EntityType.VERTEX_INIT);
currentStep.stopCriticalPathTime = initStepStopCriticalTime;
currentStep.startCriticalPathTime = dagInfo.getStartTime();
currentStep.reason = CriticalPathDependency.INIT_DEPENDENCY;
tempCP.add(currentStep);
if (!tempCP.isEmpty()) {
for (int i = tempCP.size() - 1; i >= 0; --i) {
criticalPath.add(tempCP.get(i));
}
}
return;
}
currentAttempt = attempts.get(nextAttemptId);
currentAttemptStopCriticalPathTime = startCriticalPathTime;
}
}
}
use of org.apache.tez.history.parser.datamodel.TaskAttemptInfo in project tez by apache.
the class SkewAnalyzer method analyzeReducers.
private void analyzeReducers(DagInfo dagInfo) {
for (VertexInfo vertexInfo : dagInfo.getVertices()) {
for (TaskAttemptInfo attemptInfo : vertexInfo.getTaskAttempts()) {
analyzeGroupSkewPerSource(attemptInfo);
analyzeRecordSkewPerSource(attemptInfo);
analyzeForParallelism(attemptInfo);
}
}
}
Aggregations