Search in sources :

Example 1 with PipelineStats

use of io.trino.operator.PipelineStats in project trino by trinodb.

The method getStageInfo of the class StageStateMachine.

/**
 * Builds a snapshot {@link StageInfo} for this stage by summing the statistics of its tasks.
 *
 * <p>If a final stage info has already been recorded it is returned as-is and the supplier
 * is never invoked. Otherwise the stage state is read first, the task infos are copied from
 * the supplier, and per-task counters (drivers, memory, timings, input/output sizes, GC) are
 * accumulated into a {@link StageStats}. Values of tasks in the {@code FAILED} state are
 * additionally accumulated into the corresponding {@code failed*} counters.
 *
 * @param taskInfosSupplier supplies the current task infos of this stage; queried at most once
 * @return the cached final stage info if present, otherwise a freshly aggregated one
 */
public StageInfo getStageInfo(Supplier<Iterable<TaskInfo>> taskInfosSupplier) {
    Optional<StageInfo> finalStageInfo = this.finalStageInfo.get();
    if (finalStageInfo.isPresent()) {
        return finalStageInfo.get();
    }
    // Stage state must be captured first in order to provide a
    // consistent view of the stage. For example, while building this
    // information, the stage could finish, and the task states would
    // never be visible.
    StageState state = stageState.get();
    List<TaskInfo> taskInfos = ImmutableList.copyOf(taskInfosSupplier.get());
    int totalTasks = taskInfos.size();
    int runningTasks = 0;
    int completedTasks = 0;
    int failedTasks = 0;
    int totalDrivers = 0;
    int queuedDrivers = 0;
    int runningDrivers = 0;
    int blockedDrivers = 0;
    int completedDrivers = 0;
    long cumulativeUserMemory = 0;
    long failedCumulativeUserMemory = 0;
    long userMemoryReservation = 0;
    long revocableMemoryReservation = 0;
    long totalMemoryReservation = 0;
    long peakUserMemoryReservation = peakUserMemory.get();
    long peakRevocableMemoryReservation = peakRevocableMemory.get();
    long totalScheduledTime = 0;
    long failedScheduledTime = 0;
    long totalCpuTime = 0;
    long failedCpuTime = 0;
    long totalBlockedTime = 0;
    long physicalInputDataSize = 0;
    long failedPhysicalInputDataSize = 0;
    long physicalInputPositions = 0;
    long failedPhysicalInputPositions = 0;
    long physicalInputReadTime = 0;
    long failedPhysicalInputReadTime = 0;
    long internalNetworkInputDataSize = 0;
    long failedInternalNetworkInputDataSize = 0;
    long internalNetworkInputPositions = 0;
    long failedInternalNetworkInputPositions = 0;
    long rawInputDataSize = 0;
    long failedRawInputDataSize = 0;
    long rawInputPositions = 0;
    long failedRawInputPositions = 0;
    long processedInputDataSize = 0;
    long failedProcessedInputDataSize = 0;
    long processedInputPositions = 0;
    long failedProcessedInputPositions = 0;
    long bufferedDataSize = 0;
    long outputDataSize = 0;
    long failedOutputDataSize = 0;
    long outputPositions = 0;
    long failedOutputPositions = 0;
    long physicalWrittenDataSize = 0;
    long failedPhysicalWrittenDataSize = 0;
    int fullGcCount = 0;
    int fullGcTaskCount = 0;
    // Seeded with MAX_VALUE so that the first task's GC time becomes the running minimum.
    // Seeding with 0 would pin the minimum at 0 regardless of the observed values.
    int minFullGcSec = Integer.MAX_VALUE;
    int maxFullGcSec = 0;
    int totalFullGcSec = 0;
    boolean fullyBlocked = true;
    Set<BlockedReason> blockedReasons = new HashSet<>();
    // Operator stats merged across tasks, keyed by "<pipelineId>.<operatorId>"
    Map<String, OperatorStats> operatorToStats = new HashMap<>();
    for (TaskInfo taskInfo : taskInfos) {
        TaskState taskState = taskInfo.getTaskStatus().getState();
        if (taskState.isDone()) {
            completedTasks++;
        } else {
            runningTasks++;
        }
        if (taskState == TaskState.FAILED) {
            failedTasks++;
        }
        TaskStats taskStats = taskInfo.getStats();
        totalDrivers += taskStats.getTotalDrivers();
        queuedDrivers += taskStats.getQueuedDrivers();
        runningDrivers += taskStats.getRunningDrivers();
        blockedDrivers += taskStats.getBlockedDrivers();
        completedDrivers += taskStats.getCompletedDrivers();
        cumulativeUserMemory += taskStats.getCumulativeUserMemory();
        if (taskState == TaskState.FAILED) {
            failedCumulativeUserMemory += taskStats.getCumulativeUserMemory();
        }
        long taskUserMemory = taskStats.getUserMemoryReservation().toBytes();
        long taskRevocableMemory = taskStats.getRevocableMemoryReservation().toBytes();
        userMemoryReservation += taskUserMemory;
        revocableMemoryReservation += taskRevocableMemory;
        totalMemoryReservation += taskUserMemory + taskRevocableMemory;
        totalScheduledTime += taskStats.getTotalScheduledTime().roundTo(NANOSECONDS);
        totalCpuTime += taskStats.getTotalCpuTime().roundTo(NANOSECONDS);
        totalBlockedTime += taskStats.getTotalBlockedTime().roundTo(NANOSECONDS);
        if (taskState == TaskState.FAILED) {
            failedScheduledTime += taskStats.getTotalScheduledTime().roundTo(NANOSECONDS);
            failedCpuTime += taskStats.getTotalCpuTime().roundTo(NANOSECONDS);
        }
        // Only tasks that are not done contribute to the blocked state of the stage
        if (!taskState.isDone()) {
            fullyBlocked &= taskStats.isFullyBlocked();
            blockedReasons.addAll(taskStats.getBlockedReasons());
        }
        physicalInputDataSize += taskStats.getPhysicalInputDataSize().toBytes();
        physicalInputPositions += taskStats.getPhysicalInputPositions();
        physicalInputReadTime += taskStats.getPhysicalInputReadTime().roundTo(NANOSECONDS);
        internalNetworkInputDataSize += taskStats.getInternalNetworkInputDataSize().toBytes();
        internalNetworkInputPositions += taskStats.getInternalNetworkInputPositions();
        rawInputDataSize += taskStats.getRawInputDataSize().toBytes();
        rawInputPositions += taskStats.getRawInputPositions();
        processedInputDataSize += taskStats.getProcessedInputDataSize().toBytes();
        processedInputPositions += taskStats.getProcessedInputPositions();
        bufferedDataSize += taskInfo.getOutputBuffers().getTotalBufferedBytes();
        outputDataSize += taskStats.getOutputDataSize().toBytes();
        outputPositions += taskStats.getOutputPositions();
        physicalWrittenDataSize += taskStats.getPhysicalWrittenDataSize().toBytes();
        if (taskState == TaskState.FAILED) {
            failedPhysicalInputDataSize += taskStats.getPhysicalInputDataSize().toBytes();
            failedPhysicalInputPositions += taskStats.getPhysicalInputPositions();
            failedPhysicalInputReadTime += taskStats.getPhysicalInputReadTime().roundTo(NANOSECONDS);
            failedInternalNetworkInputDataSize += taskStats.getInternalNetworkInputDataSize().toBytes();
            failedInternalNetworkInputPositions += taskStats.getInternalNetworkInputPositions();
            failedRawInputDataSize += taskStats.getRawInputDataSize().toBytes();
            failedRawInputPositions += taskStats.getRawInputPositions();
            failedProcessedInputDataSize += taskStats.getProcessedInputDataSize().toBytes();
            failedProcessedInputPositions += taskStats.getProcessedInputPositions();
            failedOutputDataSize += taskStats.getOutputDataSize().toBytes();
            failedOutputPositions += taskStats.getOutputPositions();
            failedPhysicalWrittenDataSize += taskStats.getPhysicalWrittenDataSize().toBytes();
        }
        fullGcCount += taskStats.getFullGcCount();
        fullGcTaskCount += taskStats.getFullGcCount() > 0 ? 1 : 0;
        int gcSec = toIntExact(taskStats.getFullGcTime().roundTo(SECONDS));
        totalFullGcSec += gcSec;
        minFullGcSec = min(minFullGcSec, gcSec);
        maxFullGcSec = max(maxFullGcSec, gcSec);
        for (PipelineStats pipeline : taskStats.getPipelines()) {
            for (OperatorStats operatorStats : pipeline.getOperatorSummaries()) {
                String id = pipeline.getPipelineId() + "." + operatorStats.getOperatorId();
                operatorToStats.compute(id, (k, v) -> v == null ? operatorStats : v.add(operatorStats));
            }
        }
    }
    if (taskInfos.isEmpty()) {
        // No task was observed, so there is no minimum; report 0 instead of the MAX_VALUE seed
        minFullGcSec = 0;
    }
    StageStats stageStats = new StageStats(
            schedulingComplete.get(),
            getSplitDistribution.snapshot(),
            totalTasks,
            runningTasks,
            completedTasks,
            failedTasks,
            totalDrivers,
            queuedDrivers,
            runningDrivers,
            blockedDrivers,
            completedDrivers,
            cumulativeUserMemory,
            failedCumulativeUserMemory,
            succinctBytes(userMemoryReservation),
            succinctBytes(revocableMemoryReservation),
            succinctBytes(totalMemoryReservation),
            succinctBytes(peakUserMemoryReservation),
            succinctBytes(peakRevocableMemoryReservation),
            succinctDuration(totalScheduledTime, NANOSECONDS),
            succinctDuration(failedScheduledTime, NANOSECONDS),
            succinctDuration(totalCpuTime, NANOSECONDS),
            succinctDuration(failedCpuTime, NANOSECONDS),
            succinctDuration(totalBlockedTime, NANOSECONDS),
            // a stage without running tasks is never reported as fully blocked
            fullyBlocked && runningTasks > 0,
            blockedReasons,
            succinctBytes(physicalInputDataSize),
            succinctBytes(failedPhysicalInputDataSize),
            physicalInputPositions,
            failedPhysicalInputPositions,
            succinctDuration(physicalInputReadTime, NANOSECONDS),
            succinctDuration(failedPhysicalInputReadTime, NANOSECONDS),
            succinctBytes(internalNetworkInputDataSize),
            succinctBytes(failedInternalNetworkInputDataSize),
            internalNetworkInputPositions,
            failedInternalNetworkInputPositions,
            succinctBytes(rawInputDataSize),
            succinctBytes(failedRawInputDataSize),
            rawInputPositions,
            failedRawInputPositions,
            succinctBytes(processedInputDataSize),
            succinctBytes(failedProcessedInputDataSize),
            processedInputPositions,
            failedProcessedInputPositions,
            succinctBytes(bufferedDataSize),
            succinctBytes(outputDataSize),
            succinctBytes(failedOutputDataSize),
            outputPositions,
            failedOutputPositions,
            succinctBytes(physicalWrittenDataSize),
            succinctBytes(failedPhysicalWrittenDataSize),
            new StageGcStatistics(
                    stageId.getId(),
                    totalTasks,
                    fullGcTaskCount,
                    minFullGcSec,
                    maxFullGcSec,
                    totalFullGcSec,
                    // floating-point division, so a zero fullGcCount does not throw
                    // (0.0 / 0 is NaN, which narrows to 0 when cast to int)
                    (int) (1.0 * totalFullGcSec / fullGcCount)),
            ImmutableList.copyOf(operatorToStats.values()));
    // The failure cause is only attached when the captured stage state is FAILED
    ExecutionFailureInfo failureInfo = null;
    if (state == FAILED) {
        failureInfo = failureCause.get();
    }
    return new StageInfo(stageId, state, fragment, fragment.getPartitioning().isCoordinatorOnly(), fragment.getTypes(), stageStats, taskInfos, ImmutableList.of(), tables, failureInfo);
}
Also used : PipelineStats(io.trino.operator.PipelineStats) BlockedReason(io.trino.operator.BlockedReason) HashMap(java.util.HashMap) OperatorStats(io.trino.operator.OperatorStats) TaskStats(io.trino.operator.TaskStats) StageGcStatistics(io.trino.spi.eventlistener.StageGcStatistics) HashSet(java.util.HashSet)

Example 2 with PipelineStats

use of io.trino.operator.PipelineStats in project trino by trinodb.

The method getPlanNodeStats of the class PlanNodeStatsSummarizer.

/**
 * Rolls the operator summaries of a task up into one {@link PlanNodeStats} per plan node.
 *
 * <p>This is a best-effort reconstruction: stats are collected separately from query
 * execution, so some or all of them may be missing or stale (for example, a LIMIT clause
 * can let a query finish before the leaf stages have reported). Plan nodes for which no
 * input statistics were gathered are left out of the result.
 *
 * @param taskStats the task statistics whose pipelines are summarized
 * @return the per-plan-node statistics, in the iteration order of a {@link HashSet}
 */
private static List<PlanNodeStats> getPlanNodeStats(TaskStats taskStats) {
    Set<PlanNodeId> seenNodes = new HashSet<>();
    Map<PlanNodeId, Long> inputPositionsByNode = new HashMap<>();
    Map<PlanNodeId, Long> inputBytesByNode = new HashMap<>();
    Map<PlanNodeId, Long> outputPositionsByNode = new HashMap<>();
    Map<PlanNodeId, Long> outputBytesByNode = new HashMap<>();
    Map<PlanNodeId, Long> spilledBytesByNode = new HashMap<>();
    Map<PlanNodeId, Long> scheduledMillisByNode = new HashMap<>();
    Map<PlanNodeId, Long> cpuMillisByNode = new HashMap<>();
    Map<PlanNodeId, Map<String, BasicOperatorStats>> operatorStatsByNode = new HashMap<>();
    Map<PlanNodeId, Map<String, OperatorHashCollisionsStats>> hashCollisionsByNode = new HashMap<>();
    Map<PlanNodeId, WindowOperatorStats> windowStatsByNode = new HashMap<>();

    for (PipelineStats pipeline : taskStats.getPipelines()) {
        List<OperatorStats> operators = pipeline.getOperatorSummaries();
        // Stats are collected with eventual consistency, so a pipeline may have no operators yet
        if (operators.isEmpty()) {
            continue;
        }
        PlanNodeId firstNode = operators.iterator().next().getPlanNodeId();
        PlanNodeId lastNode = getLast(operators).getPlanNodeId();

        // Pass 1: timings for every operator, input stats once per plan node
        Set<PlanNodeId> inputCounted = new HashSet<>();
        for (OperatorStats operator : operators) {
            PlanNodeId nodeId = operator.getPlanNodeId();
            seenNodes.add(nodeId);

            long wallMillis = operator.getAddInputWall().toMillis()
                    + operator.getGetOutputWall().toMillis()
                    + operator.getFinishWall().toMillis();
            scheduledMillisByNode.merge(nodeId, wallMillis, Long::sum);

            long cpuMillis = operator.getAddInputCpu().toMillis()
                    + operator.getGetOutputCpu().toMillis()
                    + operator.getFinishCpu().toMillis();
            cpuMillisByNode.merge(nodeId, cpuMillis, Long::sum);

            // A pipeline such as a hash build before a join may be fed by another "internal"
            // pipeline, which then provides the actual input for this plan node
            boolean fedByOtherPipeline = nodeId.equals(firstNode) && !pipeline.isInputPipeline();
            // Set.add reports false when the node's input was already counted in this pipeline
            if (fedByOtherPipeline || !inputCounted.add(nodeId)) {
                continue;
            }

            BasicOperatorStats basicStats = new BasicOperatorStats(
                    operator.getTotalDrivers(),
                    operator.getInputPositions(),
                    operator.getSumSquaredInputPositions(),
                    operator.getMetrics(),
                    operator.getConnectorMetrics());
            operatorStatsByNode.merge(
                    nodeId,
                    ImmutableMap.of(operator.getOperatorType(), basicStats),
                    (map1, map2) -> mergeMaps(map1, map2, BasicOperatorStats::merge));
            inputPositionsByNode.merge(nodeId, operator.getInputPositions(), Long::sum);
            inputBytesByNode.merge(nodeId, operator.getInputDataSize().toBytes(), Long::sum);
            spilledBytesByNode.merge(nodeId, operator.getSpilledDataSize().toBytes(), Long::sum);
        }

        // Pass 2: output stats, walking from the last operator backwards, once per plan node
        Set<PlanNodeId> outputCounted = new HashSet<>();
        for (OperatorStats operator : reverse(operators)) {
            PlanNodeId nodeId = operator.getPlanNodeId();
            // An "internal" pipeline such as a hash build hands over to another pipeline,
            // which holds the actual output for this plan node
            boolean drainedByOtherPipeline = nodeId.equals(lastNode) && !pipeline.isOutputPipeline();
            if (drainedByOtherPipeline || !outputCounted.add(nodeId)) {
                continue;
            }
            outputPositionsByNode.merge(nodeId, operator.getOutputPositions(), Long::sum);
            outputBytesByNode.merge(nodeId, operator.getOutputDataSize().toBytes(), Long::sum);
        }

        // Pass 3: operator-specific extras (hash collisions, window info)
        for (OperatorStats operator : operators) {
            PlanNodeId nodeId = operator.getPlanNodeId();
            if (operator.getInfo() instanceof HashCollisionsInfo) {
                HashCollisionsInfo collisions = (HashCollisionsInfo) operator.getInfo();
                OperatorHashCollisionsStats collisionStats = new OperatorHashCollisionsStats(
                        collisions.getWeightedHashCollisions(),
                        collisions.getWeightedSumSquaredHashCollisions(),
                        collisions.getWeightedExpectedHashCollisions(),
                        operator.getInputPositions());
                hashCollisionsByNode.merge(
                        nodeId,
                        ImmutableMap.of(operator.getOperatorType(), collisionStats),
                        (map1, map2) -> mergeMaps(map1, map2, OperatorHashCollisionsStats::merge));
            }
            // Window function statistics are very low level, hence shown only in VERBOSE mode
            if (operator.getInfo() instanceof WindowInfo) {
                WindowInfo window = (WindowInfo) operator.getInfo();
                windowStatsByNode.merge(nodeId, WindowOperatorStats.create(window), WindowOperatorStats::mergeWith);
            }
        }
    }

    List<PlanNodeStats> result = new ArrayList<>();
    for (PlanNodeId nodeId : seenNodes) {
        if (!inputPositionsByNode.containsKey(nodeId)) {
            continue;
        }
        Duration scheduledTime = new Duration(scheduledMillisByNode.get(nodeId), MILLISECONDS);
        Duration cpuTime = new Duration(cpuMillisByNode.get(nodeId), MILLISECONDS);
        long inputPositions = inputPositionsByNode.get(nodeId);
        // Output stats can be absent when every observed pipeline was a non-output one.
        // E.g. for SELECT * FROM a JOIN b ON c = d LIMIT 1 the stats may be sampled after the
        // build started but before the probe did, leaving scheduled time without any output.
        long outputPositions = outputPositionsByNode.getOrDefault(nodeId, 0L);
        long outputBytes = outputBytesByNode.getOrDefault(nodeId, 0L);
        Map<String, BasicOperatorStats> operatorStats = operatorStatsByNode.get(nodeId);

        if (hashCollisionsByNode.containsKey(nodeId)) {
            result.add(new HashCollisionPlanNodeStats(
                    nodeId,
                    scheduledTime,
                    cpuTime,
                    inputPositions,
                    succinctBytes(inputBytesByNode.get(nodeId)),
                    outputPositions,
                    succinctBytes(outputBytes),
                    succinctBytes(spilledBytesByNode.get(nodeId)),
                    operatorStats,
                    hashCollisionsByNode.get(nodeId)));
        } else if (windowStatsByNode.containsKey(nodeId)) {
            result.add(new WindowPlanNodeStats(
                    nodeId,
                    scheduledTime,
                    cpuTime,
                    inputPositions,
                    succinctBytes(inputBytesByNode.get(nodeId)),
                    outputPositions,
                    succinctBytes(outputBytes),
                    succinctBytes(spilledBytesByNode.get(nodeId)),
                    operatorStats,
                    windowStatsByNode.get(nodeId)));
        } else {
            result.add(new PlanNodeStats(
                    nodeId,
                    scheduledTime,
                    cpuTime,
                    inputPositions,
                    succinctBytes(inputBytesByNode.get(nodeId)),
                    outputPositions,
                    succinctBytes(outputBytes),
                    succinctBytes(spilledBytesByNode.get(nodeId)),
                    operatorStats));
        }
    }
    return result;
}
Also used : PipelineStats(io.trino.operator.PipelineStats) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) Duration(io.airlift.units.Duration) OperatorStats(io.trino.operator.OperatorStats) HashCollisionsInfo(io.trino.operator.HashCollisionsInfo) WindowInfo(io.trino.operator.WindowInfo) PlanNodeId(io.trino.sql.planner.plan.PlanNodeId) ImmutableMap(com.google.common.collect.ImmutableMap) HashMap(java.util.HashMap) Map(java.util.Map) HashSet(java.util.HashSet)

Aggregations

OperatorStats (io.trino.operator.OperatorStats)2 PipelineStats (io.trino.operator.PipelineStats)2 HashMap (java.util.HashMap)2 HashSet (java.util.HashSet)2 ImmutableMap (com.google.common.collect.ImmutableMap)1 Duration (io.airlift.units.Duration)1 BlockedReason (io.trino.operator.BlockedReason)1 HashCollisionsInfo (io.trino.operator.HashCollisionsInfo)1 TaskStats (io.trino.operator.TaskStats)1 WindowInfo (io.trino.operator.WindowInfo)1 StageGcStatistics (io.trino.spi.eventlistener.StageGcStatistics)1 PlanNodeId (io.trino.sql.planner.plan.PlanNodeId)1 ArrayList (java.util.ArrayList)1 Map (java.util.Map)1