Search in sources:

Example 1 with NO_NODES_AVAILABLE

use of io.trino.spi.StandardErrorCode.NO_NODES_AVAILABLE in project trino by trinodb.

Class FullNodeCapableNodeAllocatorService, method processFullNodePendingAcquires.

/**
 * Runs one processing round over all pending full-node acquires.
 *
 * <p>The round has three phases:
 * <ol>
 *   <li>Under the instance lock, walk the "detached" pending acquires (those with no target
 *       node yet): drop cancelled ones, fail those for which no candidate node exists, and
 *       attach the rest to a target node when {@code findTargetPendingFullNode} returns one.</li>
 *   <li>Still under the lock, walk the pending acquires that already have a reserved node and
 *       either grant the reserved node, grant another currently unused candidate node, move the
 *       acquire back to the detached collection, or record a failure.</li>
 *   <li>After leaving the lock, complete the futures: successful assignments first, then
 *       failures.</li>
 * </ol>
 *
 * <p>Outcomes are collected in identity-keyed maps while the lock is held and the futures are
 * only completed afterwards, as enforced by the {@code checkState} call below.
 */
private void processFullNodePendingAcquires() {
    // Results gathered under the lock; keyed by PendingAcquire identity rather than equals().
    Map<PendingAcquire, InternalNode> assignedNodes = new IdentityHashMap<>();
    Map<PendingAcquire, RuntimeException> failures = new IdentityHashMap<>();
    synchronized (this) {
        // Phase 1: pending acquires which currently have no target node.
        Iterator<PendingAcquire> detachedIterator = detachedFullNodePendingAcquires.iterator();
        while (detachedIterator.hasNext()) {
            PendingAcquire pendingAcquire = detachedIterator.next();
            try {
                if (pendingAcquire.getFuture().isCancelled()) {
                    // discard cancelled detached pendingAcquire
                    detachedIterator.remove();
                    continue;
                }
                Candidates currentCandidates = selectCandidates(pendingAcquire.getNodeRequirements());
                if (currentCandidates.isEmpty()) {
                    // caught by the catch block below and reported through the acquire's future
                    throw new TrinoException(NO_NODES_AVAILABLE, "No nodes available to run query");
                }
                Optional<InternalNode> target = findTargetPendingFullNode(pendingAcquire.getQueryId(), currentCandidates);
                if (target.isEmpty()) {
                    // leave pendingAcquire as pending
                    continue;
                }
                // move pendingAcquire to fullNodePendingAcquires
                fullNodePendingAcquires.put(target.get(), pendingAcquire);
                fullNodesByQueryId.put(pendingAcquire.getQueryId(), target.get());
                detachedIterator.remove();
            } catch (RuntimeException e) {
                // remember the failure and stop tracking this acquire; the future is failed after the lock is released
                failures.put(pendingAcquire, e);
                detachedIterator.remove();
            }
        }
        // Phase 2: pending acquires which have a reserved target node.
        // Iterate over a snapshot of the keys because fullNodePendingAcquires is modified inside the loop.
        Set<InternalNode> nodes = ImmutableSet.copyOf(fullNodePendingAcquires.keySet());
        for (InternalNode reservedNode : nodes) {
            PendingAcquire pendingAcquire = fullNodePendingAcquires.get(reservedNode);
            if (pendingAcquire.getFuture().isCancelled()) {
                // discard cancelled pendingAcquire with target node
                fullNodePendingAcquires.remove(reservedNode);
                verify(fullNodesByQueryId.remove(pendingAcquire.getQueryId(), reservedNode));
                continue;
            }
            try {
                Candidates currentCandidates = selectCandidates(pendingAcquire.getNodeRequirements());
                if (currentCandidates.isEmpty()) {
                    // caught by the catch block below and reported through the acquire's future
                    throw new TrinoException(NO_NODES_AVAILABLE, "No nodes available to run query");
                }
                if (sharedAllocatedMemory.getOrDefault(reservedNode, 0L) > 0 || allocatedFullNodes.contains(reservedNode)) {
                    // reserved node is still used - opportunistic check if maybe there is some other empty, not waited for node available
                    // (a candidate qualifies if no acquire is pending on it, it is not allocated as a full node, and it has no shared memory allocated)
                    Optional<InternalNode> opportunisticNode = currentCandidates.getCandidates().stream().filter(node -> !fullNodePendingAcquires.containsKey(node)).filter(node -> !allocatedFullNodes.contains(node)).filter(node -> sharedAllocatedMemory.getOrDefault(node, 0L) == 0).findFirst();
                    if (opportunisticNode.isPresent()) {
                        // swap the reservation: forget the busy reserved node and allocate the free node instead
                        fullNodePendingAcquires.remove(reservedNode);
                        verify(fullNodesByQueryId.remove(pendingAcquire.getQueryId(), reservedNode));
                        allocatedFullNodes.add(opportunisticNode.get());
                        verify(fullNodesByQueryId.put(pendingAcquire.getQueryId(), opportunisticNode.get()));
                        assignedNodes.put(pendingAcquire, opportunisticNode.get());
                    }
                    // otherwise keep waiting for the reserved node
                    continue;
                }
                if (!currentCandidates.getCandidates().contains(reservedNode)) {
                    // current candidate is gone; move pendingAcquire to detached state
                    detachedFullNodePendingAcquires.add(pendingAcquire);
                    fullNodePendingAcquires.remove(reservedNode);
                    verify(fullNodesByQueryId.remove(pendingAcquire.getQueryId(), reservedNode));
                    // trigger one more round of processing immediately
                    wakeupProcessPendingAcquires();
                    continue;
                }
                // we are good acquiring reserved full node
                // (the fullNodesByQueryId entry is kept; only the pending registration is removed)
                allocatedFullNodes.add(reservedNode);
                fullNodePendingAcquires.remove(reservedNode);
                assignedNodes.put(pendingAcquire, reservedNode);
            } catch (RuntimeException e) {
                // remember the failure and drop the reservation; no verify() here, the query-to-node entry is removed if present
                failures.put(pendingAcquire, e);
                fullNodePendingAcquires.remove(reservedNode);
                fullNodesByQueryId.remove(pendingAcquire.getQueryId(), reservedNode);
            }
        }
    }
    // Phase 3: complete futures outside of synchronized section
    checkState(!Thread.holdsLock(this), "Cannot complete node futures under lock");
    assignedNodes.forEach((pendingAcquire, node) -> {
        SettableFuture<InternalNode> future = pendingAcquire.getFuture();
        future.set(node);
        if (future.isCancelled()) {
            // the future is cancelled, so the node just allocated for it is handed back
            // NOTE(review): presumably this covers a cancel that raced in after the isCancelled() check made under the lock - confirm
            releaseFullNode(node, pendingAcquire.getQueryId());
        }
    });
    failures.forEach((pendingAcquire, failure) -> {
        SettableFuture<InternalNode> future = pendingAcquire.getFuture();
        future.setException(failure);
    });
}
Also used : NodeState(io.trino.metadata.NodeState) QueryId(io.trino.spi.QueryId) FULL_NODE_MEMORY(io.trino.execution.scheduler.FallbackToFullNodePartitionMemoryEstimator.FULL_NODE_MEMORY) Thread.currentThread(java.lang.Thread.currentThread) SettableFuture(com.google.common.util.concurrent.SettableFuture) PreDestroy(javax.annotation.PreDestroy) CatalogName(io.trino.connector.CatalogName) HashMultimap(com.google.common.collect.HashMultimap) Map(java.util.Map) VisibleForTesting(org.assertj.core.util.VisibleForTesting) ImmutableSet(com.google.common.collect.ImmutableSet) IdentityHashMap(java.util.IdentityHashMap) GuardedBy(com.google.errorprone.annotations.concurrent.GuardedBy) Collection(java.util.Collection) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) Set(java.util.Set) TrinoException(io.trino.spi.TrinoException) ThreadSafe(javax.annotation.concurrent.ThreadSafe) ScheduledThreadPoolExecutor(java.util.concurrent.ScheduledThreadPoolExecutor) Preconditions.checkState(com.google.common.base.Preconditions.checkState) MoreExecutors.directExecutor(com.google.common.util.concurrent.MoreExecutors.directExecutor) ClusterMemoryManager(io.trino.memory.ClusterMemoryManager) List(java.util.List) PostConstruct(javax.annotation.PostConstruct) Optional(java.util.Optional) NO_NODES_AVAILABLE(io.trino.spi.StandardErrorCode.NO_NODES_AVAILABLE) Math.max(java.lang.Math.max) Predicate.not(java.util.function.Predicate.not) Session(io.trino.Session) InternalNodeManager(io.trino.metadata.InternalNodeManager) ListenableFuture(com.google.common.util.concurrent.ListenableFuture) Logger(io.airlift.log.Logger) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) HashMap(java.util.HashMap) Multimap(com.google.common.collect.Multimap) MemoryInfo(io.trino.memory.MemoryInfo) Deque(java.util.Deque) Supplier(java.util.function.Supplier) ConcurrentMap(java.util.concurrent.ConcurrentMap) 
Inject(javax.inject.Inject) HashSet(java.util.HashSet) Verify.verify(com.google.common.base.Verify.verify) Threads.daemonThreadsNamed(io.airlift.concurrent.Threads.daemonThreadsNamed) Objects.requireNonNull(java.util.Objects.requireNonNull) ImmutableSet.toImmutableSet(com.google.common.collect.ImmutableSet.toImmutableSet) Comparator.comparing(java.util.Comparator.comparing) LinkedList(java.util.LinkedList) Futures.immediateFuture(com.google.common.util.concurrent.Futures.immediateFuture) Iterator(java.util.Iterator) Semaphore(java.util.concurrent.Semaphore) MoreFutures.getFutureValue(io.airlift.concurrent.MoreFutures.getFutureValue) TimeUnit(java.util.concurrent.TimeUnit) AtomicLong(java.util.concurrent.atomic.AtomicLong) InternalNode(io.trino.metadata.InternalNode) ArrayDeque(java.util.ArrayDeque) Comparator(java.util.Comparator) Futures.transform(com.google.common.util.concurrent.Futures.transform) IdentityHashMap(java.util.IdentityHashMap) TrinoException(io.trino.spi.TrinoException) InternalNode(io.trino.metadata.InternalNode)

Example 2 with NO_NODES_AVAILABLE

use of io.trino.spi.StandardErrorCode.NO_NODES_AVAILABLE in project trino by trinodb.

Class UniformNodeSelector, method computeAssignments.

/**
 * Assigns the given splits to nodes and reports when the caller may retry the splits that could
 * not be placed.
 *
 * <p>When {@code optimizedLocalScheduling} is enabled, remotely accessible splits that carry
 * addresses are first offered to the nodes matching those addresses; among the matching nodes that
 * are below both the per-node weight limit and the unacknowledged split limit, the one with the
 * smallest total split weight is chosen. All other splits go through the general path: candidates
 * are either the exact nodes for the split's addresses (split not remotely accessible) or
 * {@code minCandidates} nodes drawn from the randomized iterator. If {@code chooseNodeForSplit}
 * returns no node, the candidate with the smallest queued weight for the stage that is still under
 * the pending-weight and unacknowledged limits is used instead.
 *
 * @param splits splits to place
 * @param existingTasks tasks used for the assignment statistics and for the returned "blocked" future
 * @return the computed assignment together with a future obtained from
 *         {@code toWhenHasSplitQueueSpaceFuture}
 * @throws TrinoException with {@code NO_NODES_AVAILABLE} if a split has no candidate nodes at all
 */
@Override
public SplitPlacementResult computeAssignments(Set<Split> splits, List<RemoteTask> existingTasks) {
    Multimap<InternalNode, Split> placements = HashMultimap.create();
    NodeMap currentNodeMap = this.nodeMap.get().get();
    NodeAssignmentStats stats = new NodeAssignmentStats(nodeTaskMap, currentNodeMap, existingTasks);
    ResettableRandomizedIterator<InternalNode> shuffledNodes = randomizedNodes(currentNodeMap, includeCoordinator, ImmutableSet.of());
    Set<InternalNode> blockedExactNodes = new HashSet<>();
    boolean anySplitWaitingForAnyNode = false;
    // set only when at least one split was placed by the locality-based pass below
    boolean redistributeAfterwards = false;
    Set<Split> pendingSplits = new HashSet<>();

    if (!optimizedLocalScheduling) {
        pendingSplits = splits;
    } else {
        // locality-based pass: prefer nodes that match the split's addresses
        for (Split split : splits) {
            Optional<InternalNode> localNode = Optional.empty();
            if (split.isRemotelyAccessible() && !split.getAddresses().isEmpty()) {
                List<InternalNode> owners = selectExactNodes(currentNodeMap, split.getAddresses(), includeCoordinator);
                localNode = owners.stream()
                        .filter(owner -> stats.getTotalSplitsWeight(owner) < maxSplitsWeightPerNode)
                        .filter(owner -> stats.getUnacknowledgedSplitCountForStage(owner) < maxUnacknowledgedSplitsPerTask)
                        .min(comparingLong(stats::getTotalSplitsWeight));
            }
            if (localNode.isPresent()) {
                placements.put(localNode.get(), split);
                stats.addAssignedSplit(localNode.get(), split.getSplitWeight());
                redistributeAfterwards = true;
            } else {
                pendingSplits.add(split);
            }
        }
    }

    for (Split split : pendingSplits) {
        shuffledNodes.reset();
        List<InternalNode> candidates = split.isRemotelyAccessible()
                ? selectNodes(minCandidates, shuffledNodes)
                : selectExactNodes(currentNodeMap, split.getAddresses(), includeCoordinator);
        if (candidates.isEmpty()) {
            log.debug("No nodes available to schedule %s. Available nodes %s", split, currentNodeMap.getNodesByHost().keys());
            throw new TrinoException(NO_NODES_AVAILABLE, "No nodes available to run query");
        }

        InternalNode selected = chooseNodeForSplit(stats, candidates);
        if (selected == null) {
            // fall back to the least-queued candidate that still has room; on ties the later candidate wins
            long lowestQueuedWeight = Long.MAX_VALUE;
            for (InternalNode candidate : candidates) {
                long queuedWeight = stats.getQueuedSplitsWeightForStage(candidate);
                if (queuedWeight <= lowestQueuedWeight
                        && queuedWeight < maxPendingSplitsWeightPerTask
                        && stats.getUnacknowledgedSplitCountForStage(candidate) < maxUnacknowledgedSplitsPerTask) {
                    selected = candidate;
                    lowestQueuedWeight = queuedWeight;
                }
            }
        }

        if (selected != null) {
            placements.put(selected, split);
            stats.addAssignedSplit(selected, split.getSplitWeight());
            continue;
        }

        // the split could not be placed in this round
        if (split.isRemotelyAccessible()) {
            anySplitWaitingForAnyNode = true;
        } else if (!anySplitWaitingForAnyNode) {
            // Exact node set won't matter, if a split is waiting for any node
            blockedExactNodes.addAll(candidates);
        }
    }

    ListenableFuture<Void> blocked = anySplitWaitingForAnyNode
            ? toWhenHasSplitQueueSpaceFuture(existingTasks, calculateLowWatermark(maxPendingSplitsWeightPerTask))
            : toWhenHasSplitQueueSpaceFuture(blockedExactNodes, existingTasks, calculateLowWatermark(maxPendingSplitsWeightPerTask));

    if (redistributeAfterwards) {
        equateDistribution(placements, stats, currentNodeMap, includeCoordinator);
    }
    return new SplitPlacementResult(blocked, placements);
}
Also used : InternalNodeManager(io.trino.metadata.InternalNodeManager) ListenableFuture(com.google.common.util.concurrent.ListenableFuture) NodeTaskMap(io.trino.execution.NodeTaskMap) Logger(io.airlift.log.Logger) Multimap(com.google.common.collect.Multimap) AtomicReference(java.util.concurrent.atomic.AtomicReference) Supplier(java.util.function.Supplier) SplitWeight(io.trino.spi.SplitWeight) InetAddress(java.net.InetAddress) HashSet(java.util.HashSet) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) HashMultimap(com.google.common.collect.HashMultimap) NodeScheduler.randomizedNodes(io.trino.execution.scheduler.NodeScheduler.randomizedNodes) ImmutableList(com.google.common.collect.ImmutableList) Objects.requireNonNull(java.util.Objects.requireNonNull) Suppliers(com.google.common.base.Suppliers) NodeScheduler.selectNodes(io.trino.execution.scheduler.NodeScheduler.selectNodes) Nullable(javax.annotation.Nullable) ImmutableSet(com.google.common.collect.ImmutableSet) SplitsBalancingPolicy(io.trino.execution.scheduler.NodeSchedulerConfig.SplitsBalancingPolicy) Iterator(java.util.Iterator) Collection(java.util.Collection) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) NodeScheduler.selectDistributionNodes(io.trino.execution.scheduler.NodeScheduler.selectDistributionNodes) RemoteTask(io.trino.execution.RemoteTask) Set(java.util.Set) TrinoException(io.trino.spi.TrinoException) UnknownHostException(java.net.UnknownHostException) SetMultimap(com.google.common.collect.SetMultimap) InternalNode(io.trino.metadata.InternalNode) List(java.util.List) NodeScheduler.selectExactNodes(io.trino.execution.scheduler.NodeScheduler.selectExactNodes) Comparator.comparingLong(java.util.Comparator.comparingLong) IndexedPriorityQueue(io.trino.execution.resourcegroups.IndexedPriorityQueue) Split(io.trino.metadata.Split) Optional(java.util.Optional) 
NodeScheduler.calculateLowWatermark(io.trino.execution.scheduler.NodeScheduler.calculateLowWatermark) NO_NODES_AVAILABLE(io.trino.spi.StandardErrorCode.NO_NODES_AVAILABLE) VisibleForTesting(com.google.common.annotations.VisibleForTesting) NodeScheduler.toWhenHasSplitQueueSpaceFuture(io.trino.execution.scheduler.NodeScheduler.toWhenHasSplitQueueSpaceFuture) NodeScheduler.getAllNodes(io.trino.execution.scheduler.NodeScheduler.getAllNodes) HostAddress(io.trino.spi.HostAddress) TrinoException(io.trino.spi.TrinoException) InternalNode(io.trino.metadata.InternalNode) Split(io.trino.metadata.Split) HashSet(java.util.HashSet)

Aggregations

HashMultimap (com.google.common.collect.HashMultimap)2 ImmutableList.toImmutableList (com.google.common.collect.ImmutableList.toImmutableList)2 ImmutableSet (com.google.common.collect.ImmutableSet)2 Multimap (com.google.common.collect.Multimap)2 ListenableFuture (com.google.common.util.concurrent.ListenableFuture)2 Logger (io.airlift.log.Logger)2 InternalNode (io.trino.metadata.InternalNode)2 InternalNodeManager (io.trino.metadata.InternalNodeManager)2 NO_NODES_AVAILABLE (io.trino.spi.StandardErrorCode.NO_NODES_AVAILABLE)2 TrinoException (io.trino.spi.TrinoException)2 Collection (java.util.Collection)2 HashSet (java.util.HashSet)2 Iterator (java.util.Iterator)2 List (java.util.List)2 Objects.requireNonNull (java.util.Objects.requireNonNull)2 Optional (java.util.Optional)2 Set (java.util.Set)2 Supplier (java.util.function.Supplier)2 VisibleForTesting (com.google.common.annotations.VisibleForTesting)1 Preconditions.checkArgument (com.google.common.base.Preconditions.checkArgument)1