Search in sources :

Example 1 with ErrorCode

use of io.trino.spi.ErrorCode in project trino by trinodb.

the class QueryStateMachine method getBasicQueryInfo.

/**
 * Builds a {@link BasicQueryInfo} snapshot from the current query state, the
 * query timer, and the supplied root stage statistics.
 *
 * @param rootStage statistics of the root stage; {@code EMPTY_STAGE_STATS} is used when absent
 * @return the basic query info; its error code and error type are only populated
 *         when the captured state is {@code FAILED} and a failure cause is recorded
 */
public BasicQueryInfo getBasicQueryInfo(Optional<BasicStageStats> rootStage) {
    // The query state has to be read before anything else so that the
    // snapshot is consistent: while this information is being built the
    // query could finish, and the task states would never be visible.
    QueryState capturedState = queryState.get();

    ErrorCode failureCode = null;
    if (capturedState == FAILED) {
        ExecutionFailureInfo cause = this.failureCause.get();
        failureCode = (cause == null) ? null : cause.getErrorCode();
    }

    BasicStageStats rootStats = rootStage.orElse(EMPTY_STAGE_STATS);

    BasicQueryStats basicStats = new BasicQueryStats(
            queryStateTimer.getCreateTime(),
            getEndTime().orElse(null),
            queryStateTimer.getQueuedTime(),
            queryStateTimer.getElapsedTime(),
            queryStateTimer.getExecutionTime(),
            rootStats.getTotalDrivers(),
            rootStats.getQueuedDrivers(),
            rootStats.getRunningDrivers(),
            rootStats.getCompletedDrivers(),
            rootStats.getRawInputDataSize(),
            rootStats.getRawInputPositions(),
            rootStats.getPhysicalInputDataSize(),
            rootStats.getCumulativeUserMemory(),
            rootStats.getFailedCumulativeUserMemory(),
            rootStats.getUserMemoryReservation(),
            rootStats.getTotalMemoryReservation(),
            succinctBytes(getPeakUserMemoryInBytes()),
            succinctBytes(getPeakTotalMemoryInBytes()),
            rootStats.getTotalCpuTime(),
            rootStats.getFailedCpuTime(),
            rootStats.getTotalScheduledTime(),
            rootStats.getFailedScheduledTime(),
            rootStats.isFullyBlocked(),
            rootStats.getBlockedReasons(),
            rootStats.getProgressPercentage());

    return new BasicQueryInfo(
            queryId,
            session.toSessionRepresentation(),
            Optional.of(resourceGroup),
            capturedState,
            rootStats.isScheduled(),
            self,
            query,
            Optional.ofNullable(updateType.get()),
            preparedQuery,
            basicStats,
            // the error type is derived from the error code, when there is one
            failureCode == null ? null : failureCode.getType(),
            failureCode,
            queryType);
}
Also used : BasicQueryStats(io.trino.server.BasicQueryStats) BasicQueryInfo(io.trino.server.BasicQueryInfo) ErrorCode(io.trino.spi.ErrorCode)

Example 2 with ErrorCode

use of io.trino.spi.ErrorCode in project trino by trinodb.

the class QueryStateMachine method getQueryInfo.

/**
 * Builds a full {@link QueryInfo} snapshot from the current query state and
 * the supplied root stage.
 *
 * @param rootStage the root stage info, if any; passed through to the result and
 *                  used to derive the scheduled flag, query stats and completeness
 * @return the query info; failure cause and error code are only populated when
 *         the captured state is {@code FAILED}
 */
@VisibleForTesting
QueryInfo getQueryInfo(Optional<StageInfo> rootStage) {
    // The query state has to be read before anything else so that the
    // snapshot is consistent: while this information is being built the
    // query could finish, and the task states would never be visible.
    QueryState capturedState = queryState.get();

    ExecutionFailureInfo failure = (capturedState == FAILED) ? this.failureCause.get() : null;
    ErrorCode failureCode = (failure == null) ? null : failure.getErrorCode();

    // the info is complete only when every stage reports complete info
    boolean allStagesComplete = getAllStages(rootStage).stream()
            .allMatch(StageInfo::isCompleteInfo);
    boolean scheduled = isScheduled(rootStage);

    return new QueryInfo(
            queryId,
            session.toSessionRepresentation(),
            capturedState,
            scheduled,
            self,
            outputManager.getQueryOutputInfo()
                    .map(QueryOutputInfo::getColumnNames)
                    .orElse(ImmutableList.of()),
            query,
            preparedQuery,
            getQueryStats(rootStage),
            Optional.ofNullable(setCatalog.get()),
            Optional.ofNullable(setSchema.get()),
            Optional.ofNullable(setPath.get()),
            setSessionProperties,
            resetSessionProperties,
            setRoles,
            addedPreparedStatements,
            deallocatedPreparedStatements,
            Optional.ofNullable(startedTransactionId.get()),
            clearTransactionId.get(),
            updateType.get(),
            rootStage,
            failure,
            failureCode,
            warningCollector.getWarnings(),
            inputs.get(),
            output.get(),
            referencedTables.get(),
            routines.get(),
            allStagesComplete,
            Optional.of(resourceGroup),
            queryType);
}
Also used : QueryOutputInfo(io.trino.execution.QueryExecution.QueryOutputInfo) ErrorCode(io.trino.spi.ErrorCode) BasicQueryInfo(io.trino.server.BasicQueryInfo) VisibleForTesting(com.google.common.annotations.VisibleForTesting)

Example 3 with ErrorCode

use of io.trino.spi.ErrorCode in project trino by trinodb.

the class FaultTolerantStageScheduler method updateTaskStatus.

/**
 * Reacts to a task status update. Updates for tasks that are not yet done are
 * ignored. For a done task the bookkeeping (running task set, node lease,
 * finished partitions, retry budgets) is updated under this object's lock;
 * failing the scheduler and completing the "task finished" future happen
 * after the lock is released.
 *
 * @param taskStatus the latest status reported for the task
 * @param exchangeSinkInstanceHandle sink instance handle of the task; must be
 *        present when the task finished and a sink exchange is configured
 */
private void updateTaskStatus(TaskStatus taskStatus, Optional<ExchangeSinkInstanceHandle> exchangeSinkInstanceHandle) {
    TaskState state = taskStatus.getState();
    if (!state.isDone()) {
        return;
    }
    try {
        RuntimeException failure = null;
        SettableFuture<Void> future;
        synchronized (this) {
            TaskId taskId = taskStatus.getTaskId();
            runningTasks.remove(taskId);
            // hand the current future to the code below the lock and install
            // a fresh one only if other tasks are still running
            future = taskFinishedFuture;
            if (!runningTasks.isEmpty()) {
                taskFinishedFuture = SettableFuture.create();
            } else {
                taskFinishedFuture = null;
            }
            NodeAllocator.NodeLease nodeLease = requireNonNull(runningNodes.remove(taskId), () -> "node not found for task id: " + taskId);
            nodeLease.release();
            int partitionId = taskId.getPartitionId();
            // the per-state handling is skipped for already finished partitions
            // and once the scheduler is closed
            if (!finishedPartitions.contains(partitionId) && !closed) {
                switch (state) {
                    case FINISHED:
                        finishedPartitions.add(partitionId);
                        if (sinkExchange.isPresent()) {
                            checkArgument(exchangeSinkInstanceHandle.isPresent(), "exchangeSinkInstanceHandle is expected to be present");
                            sinkExchange.get().sinkFinished(exchangeSinkInstanceHandle.get());
                        }
                        // abort every remote task registered for this partition
                        partitionToRemoteTaskMap.get(partitionId).forEach(RemoteTask::abort);
                        break;
                    case CANCELED:
                        log.debug("Task cancelled: %s", taskId);
                        break;
                    case ABORTED:
                        log.debug("Task aborted: %s", taskId);
                        break;
                    case FAILED:
                        // orElseGet: only build the fallback exception (and its stack trace)
                        // when the task status carries no failure at all
                        ExecutionFailureInfo failureInfo = taskStatus.getFailures().stream()
                                .findFirst()
                                .map(this::rewriteTransportFailure)
                                .orElseGet(() -> toFailure(new TrinoException(GENERIC_INTERNAL_ERROR, "A task failed for an unknown reason")));
                        log.warn(failureInfo.toException(), "Task failed: %s", taskId);
                        ErrorCode errorCode = failureInfo.getErrorCode();
                        int taskRemainingAttempts = remainingAttemptsPerTask.getOrDefault(partitionId, maxRetryAttemptsPerTask);
                        // retry only while both the overall and the per-task budget allow it,
                        // and never for failures classified as user errors
                        if (remainingRetryAttemptsOverall > 0 && taskRemainingAttempts > 0 && (errorCode == null || errorCode.getType() != USER_ERROR)) {
                            remainingRetryAttemptsOverall--;
                            remainingAttemptsPerTask.put(partitionId, taskRemainingAttempts - 1);
                            // update memory limits for next attempt
                            MemoryRequirements memoryLimits = partitionMemoryRequirements.get(partitionId);
                            verify(memoryLimits != null);
                            MemoryRequirements newMemoryLimits = partitionMemoryEstimator.getNextRetryMemoryRequirements(session, memoryLimits, errorCode);
                            partitionMemoryRequirements.put(partitionId, newMemoryLimits);
                            // reschedule
                            queuedPartitions.add(partitionId);
                            log.debug("Retrying partition %s for stage %s", partitionId, stage.getStageId());
                        } else {
                            failure = failureInfo.toException();
                        }
                        break;
                    default:
                        throw new IllegalArgumentException("Unexpected task state: " + state);
                }
            }
        }
        if (failure != null) {
            // must be called outside the lock
            fail(failure);
        }
        if (future != null && !future.isDone()) {
            future.set(null);
        }
    } catch (Throwable t) {
        // any unexpected error while processing the update fails the scheduler
        fail(t);
    }
}
Also used : TaskId(io.trino.execution.TaskId) MemoryRequirements(io.trino.execution.scheduler.PartitionMemoryEstimator.MemoryRequirements) RemoteTask(io.trino.execution.RemoteTask) ExecutionFailureInfo(io.trino.execution.ExecutionFailureInfo) TrinoException(io.trino.spi.TrinoException) ErrorCode(io.trino.spi.ErrorCode) MoreFutures.asVoid(io.airlift.concurrent.MoreFutures.asVoid) TaskState(io.trino.execution.TaskState)

Example 4 with ErrorCode

use of io.trino.spi.ErrorCode in project trino by trinodb.

the class SqlQueryScheduler method createDistributedStagesScheduler.

/**
 * Creates the distributed stages scheduler for the given attempt, publishes it
 * in {@code this.distributedStagesScheduler}, and registers a listener that
 * drives query state transitions and retries from scheduler state changes.
 *
 * @param attempt the attempt number; a non-zero value is only accepted when the
 *                retry policy is {@code QUERY}
 * @return the created scheduler, or empty when the query is already done
 */
private synchronized Optional<DistributedStagesScheduler> createDistributedStagesScheduler(int attempt) {
    verify(attempt == 0 || retryPolicy == RetryPolicy.QUERY, "unexpected attempt %s for retry policy %s", attempt, retryPolicy);

    if (queryStateMachine.isDone()) {
        return Optional.empty();
    }

    if (attempt > 0 && retryPolicy == RetryPolicy.QUERY) {
        dynamicFilterService.registerQueryRetry(queryStateMachine.getQueryId(), attempt);
    }

    DistributedStagesScheduler scheduler;
    switch (retryPolicy) {
        case TASK:
            ExchangeManager exchangeManager = exchangeManagerRegistry.getExchangeManager();
            scheduler = FaultTolerantDistributedStagesScheduler.create(
                    queryStateMachine,
                    stageManager,
                    failureDetector,
                    taskSourceFactory,
                    taskDescriptorStorage,
                    exchangeManager,
                    nodePartitioningManager,
                    coordinatorStagesScheduler.getTaskLifecycleListener(),
                    maxTaskRetryAttemptsOverall,
                    maxTaskRetryAttemptsPerTask,
                    schedulerExecutor,
                    schedulerStats,
                    nodeAllocatorService,
                    partitionMemoryEstimator);
            break;
        case QUERY:
        case NONE:
            scheduler = PipelinedDistributedStagesScheduler.create(
                    queryStateMachine,
                    schedulerStats,
                    nodeScheduler,
                    nodePartitioningManager,
                    stageManager,
                    coordinatorStagesScheduler,
                    executionPolicy,
                    failureDetector,
                    schedulerExecutor,
                    splitSourceFactory,
                    splitBatchSize,
                    dynamicFilterService,
                    tableExecuteContextManager,
                    retryPolicy,
                    attempt);
            break;
        default:
            throw new IllegalArgumentException("Unexpected retry policy: " + retryPolicy);
    }

    this.distributedStagesScheduler.set(scheduler);

    scheduler.addStateChangeListener(schedulerState -> {
        // move the query out of STARTING once the scheduler runs or is already done
        if (queryStateMachine.getQueryState() == QueryState.STARTING) {
            if (schedulerState == DistributedStagesSchedulerState.RUNNING || schedulerState.isDone()) {
                queryStateMachine.transitionToRunning();
            }
        }

        // a done, non-failed scheduler finishes every distributed stage
        if (schedulerState.isDone() && !schedulerState.isFailure()) {
            stageManager.getDistributedStagesInTopologicalOrder()
                    .forEach(stage -> stageManager.get(stage.getStageId()).finish());
        }

        if (stageManager.getCoordinatorStagesInTopologicalOrder().isEmpty()) {
            // otherwise defer query transitioning to the coordinator stages
            if (schedulerState == DistributedStagesSchedulerState.FINISHED) {
                queryStateMachine.transitionToFinishing();
            }
            else if (schedulerState == DistributedStagesSchedulerState.CANCELED) {
                // output stage was canceled
                queryStateMachine.transitionToCanceled();
            }
        }

        if (schedulerState != DistributedStagesSchedulerState.FAILED) {
            return;
        }

        StageFailureInfo failureDetails = scheduler.getFailureCause()
                .orElseGet(() -> new StageFailureInfo(
                        toFailure(new VerifyException("distributedStagesScheduler failed but failure cause is not present")),
                        Optional.empty()));

        if (shouldRetry(failureDetails.getFailureInfo().getErrorCode())) {
            // delay doubles with each attempt, capped at the maximum delay
            long backoffFactor = (long) pow(2, currentAttempt.get());
            long uncappedDelayMillis = retryInitialDelay.toMillis() * backoffFactor;
            long retryDelayMillis = min(uncappedDelayMillis, retryMaxDelay.toMillis());
            currentAttempt.incrementAndGet();
            scheduleRetryWithDelay(retryDelayMillis);
            return;
        }

        // no retry: fail the stage named in the failure info, abort the others,
        // then fail the query itself
        stageManager.getDistributedStagesInTopologicalOrder().forEach(stage -> {
            boolean isFailedStage = failureDetails.getFailedStageId()
                    .filter(failedStageId -> failedStageId.equals(stage.getStageId()))
                    .isPresent();
            if (isFailedStage) {
                stage.fail(failureDetails.getFailureInfo().toException());
            }
            else {
                stage.abort();
            }
        });
        queryStateMachine.transitionToFailed(failureDetails.getFailureInfo().toException());
    });

    return Optional.of(scheduler);
}
Also used : ExchangeManager(io.trino.spi.exchange.ExchangeManager) VerifyException(com.google.common.base.VerifyException) ErrorCode(io.trino.spi.ErrorCode)

Example 5 with ErrorCode

use of io.trino.spi.ErrorCode in project trino by trinodb.

the class TestQueryStateInfoResource method setUp.

/**
 * Starts a testing server with the TPCH catalog, submits the long-lasting
 * query as two different users, and then polls the query listing until both
 * queries are reported as RUNNING. Fails the setup when any of the two
 * queries is reported as FAILED.
 */
@BeforeClass
public void setUp() {
    server = TestingTrinoServer.create();
    server.installPlugin(new TpchPlugin());
    server.createCatalog("tpch", "tpch");
    client = new JettyHttpClient();

    // first query, submitted as user1; follow the next URI once
    Request firstQueryRequest = preparePost()
            .setUri(uriBuilderFrom(server.getBaseUrl()).replacePath("/v1/statement").build())
            .setBodyGenerator(createStaticBodyGenerator(LONG_LASTING_QUERY, UTF_8))
            .setHeader(TRINO_HEADERS.requestUser(), "user1")
            .build();
    queryResults = client.execute(firstQueryRequest, createJsonResponseHandler(QUERY_RESULTS_JSON_CODEC));
    client.execute(
            prepareGet().setUri(queryResults.getNextUri()).build(),
            createJsonResponseHandler(QUERY_RESULTS_JSON_CODEC));

    // second query, submitted as user2; follow the next URI once
    Request secondQueryRequest = preparePost()
            .setUri(uriBuilderFrom(server.getBaseUrl()).replacePath("/v1/statement").build())
            .setBodyGenerator(createStaticBodyGenerator(LONG_LASTING_QUERY, UTF_8))
            .setHeader(TRINO_HEADERS.requestUser(), "user2")
            .build();
    QueryResults secondQueryResults = client.execute(secondQueryRequest, createJsonResponseHandler(jsonCodec(QueryResults.class)));
    client.execute(
            prepareGet().setUri(secondQueryResults.getNextUri()).build(),
            createJsonResponseHandler(QUERY_RESULTS_JSON_CODEC));

    // queries are started in the background, so they may not all be immediately visible
    Duration startupTimeout = new Duration(5, MINUTES);
    long pollingStart = System.nanoTime();
    while (Duration.nanosSince(pollingStart).compareTo(startupTimeout) < 0) {
        Request listQueriesRequest = prepareGet()
                .setUri(uriBuilderFrom(server.getBaseUrl()).replacePath("/v1/query").build())
                .setHeader(TRINO_HEADERS.requestUser(), "unknown")
                .build();
        List<BasicQueryInfo> visibleQueries = client.execute(listQueriesRequest, createJsonResponseHandler(listJsonCodec(BasicQueryInfo.class)));

        if (visibleQueries.size() != 2) {
            // not both queries are listed yet; poll again
            continue;
        }
        if (visibleQueries.stream().allMatch(info -> info.getState() == RUNNING)) {
            break;
        }

        List<ErrorCode> failureCodes = visibleQueries.stream()
                .filter(info -> info.getState() == FAILED)
                .map(BasicQueryInfo::getErrorCode)
                .collect(toImmutableList());
        if (!failureCodes.isEmpty()) {
            fail("setup queries failed with: " + failureCodes);
        }
    }
}
Also used : TpchPlugin(io.trino.plugin.tpch.TpchPlugin) JettyHttpClient(io.airlift.http.client.jetty.JettyHttpClient) Request(io.airlift.http.client.Request) Duration(io.airlift.units.Duration) ErrorCode(io.trino.spi.ErrorCode) QueryResults(io.trino.client.QueryResults) BeforeClass(org.testng.annotations.BeforeClass)

Aggregations

ErrorCode (io.trino.spi.ErrorCode): 9, ExecutionFailureInfo (io.trino.execution.ExecutionFailureInfo): 3, BasicQueryInfo (io.trino.server.BasicQueryInfo): 3, RemoteTask (io.trino.execution.RemoteTask): 2, VisibleForTesting (com.google.common.annotations.VisibleForTesting): 1, VerifyException (com.google.common.base.VerifyException): 1, MoreFutures.asVoid (io.airlift.concurrent.MoreFutures.asVoid): 1, Request (io.airlift.http.client.Request): 1, JettyHttpClient (io.airlift.http.client.jetty.JettyHttpClient): 1, Duration (io.airlift.units.Duration): 1, FailureInfo (io.trino.client.FailureInfo): 1, QueryError (io.trino.client.QueryError): 1, QueryResults (io.trino.client.QueryResults): 1, Failure (io.trino.execution.Failure): 1, QueryOutputInfo (io.trino.execution.QueryExecution.QueryOutputInfo): 1, QueryInfo (io.trino.execution.QueryInfo): 1, QueryState (io.trino.execution.QueryState): 1, QueryStats (io.trino.execution.QueryStats): 1, TaskId (io.trino.execution.TaskId): 1, TaskState (io.trino.execution.TaskState): 1