Search in sources:

Example 56 with InternalNode

use of com.facebook.presto.metadata.InternalNode in project presto by prestodb.

Method bestNodeSplitCount of the class TopologyAwareNodeSelector.

/**
 * Picks a node for a split from the supplied candidates.
 *
 * <p>A candidate is returned immediately when it is below the unacknowledged-split limit and
 * {@code canAssignSplitBasedOnWeight} accepts its total split weight against
 * {@code maxSplitsWeightPerNode}. Every other candidate is counted as "busy"; among the busy
 * candidates that are still below the unacknowledged-split limit, the one with the smallest
 * queued weight that still fits under {@code maxPendingSplitsWeightPerTask} is remembered as a
 * fallback.
 *
 * <p>The scan ends when the iterator is exhausted, or once a fallback exists and at least
 * {@code minCandidatesWhenFull} busy candidates have been examined.
 *
 * @param splitWeight weight of the split being placed
 * @param candidates nodes to examine, in the order they should be tried
 * @param minCandidatesWhenFull minimum number of busy candidates to look at before settling for a fallback
 * @param maxPendingSplitsWeightPerTask queued-weight limit a fallback node must satisfy
 * @param assignmentStats source of the per-node split counts and weights
 * @return the chosen node, or {@code null} when no candidate can accept the split
 */
@Nullable
private InternalNode bestNodeSplitCount(SplitWeight splitWeight, Iterator<InternalNode> candidates, int minCandidatesWhenFull, long maxPendingSplitsWeightPerTask, NodeAssignmentStats assignmentStats) {
    InternalNode fallback = null;
    long fallbackQueuedWeight = Long.MAX_VALUE;
    int busyNodesSeen = 0;
    while (candidates.hasNext()) {
        // Stop early only when we already hold a fallback and have sampled enough busy nodes.
        if (fallback != null && busyNodesSeen >= minCandidatesWhenFull) {
            break;
        }
        InternalNode candidate = candidates.next();
        boolean ackLimitReached = assignmentStats.getUnacknowledgedSplitCountForStage(candidate) >= maxUnacknowledgedSplitsPerTask;
        if (!ackLimitReached && canAssignSplitBasedOnWeight(assignmentStats.getTotalSplitsWeight(candidate), maxSplitsWeightPerNode, splitWeight)) {
            return candidate;
        }
        // Either the unacknowledged limit was hit or the node-level weight check failed.
        busyNodesSeen++;
        if (ackLimitReached) {
            continue;
        }
        long queuedWeight = assignmentStats.getQueuedSplitsWeightForStage(candidate);
        if (queuedWeight < fallbackQueuedWeight && canAssignSplitBasedOnWeight(queuedWeight, maxPendingSplitsWeightPerTask, splitWeight)) {
            fallbackQueuedWeight = queuedWeight;
            fallback = candidate;
        }
    }
    return fallback;
}
Also used : InternalNode(com.facebook.presto.metadata.InternalNode) Nullable(javax.annotation.Nullable)

Example 57 with InternalNode

use of com.facebook.presto.metadata.InternalNode in project presto by prestodb.

Method computeAssignments of the class TopologyAwareNodeSelector.

/**
 * Assigns each split to a node and reports what the caller should wait on when some splits
 * could not be placed.
 *
 * <p>Splits whose node selection strategy is {@code HARD_AFFINITY} are only offered to the nodes
 * returned by {@code selectExactNodes} for their preferred hosts. All other splits are tried
 * against the network locations of their preferred hosts, starting at the deepest level and
 * moving towards the root; a location that yields no node is remembered and not tried again
 * during this call.
 *
 * @param splits splits to place
 * @param existingTasks tasks passed to {@code NodeAssignmentStats} and to the queue-space future
 * @return the assignments made together with the future produced by
 *         {@code toWhenHasSplitQueueSpaceFuture}
 * @throws PrestoException with {@code NO_NODES_AVAILABLE} when a hard-affinity split has no candidate node
 */
@Override
public SplitPlacementResult computeAssignments(Set<Split> splits, List<RemoteTask> existingTasks) {
    NodeMap currentNodes = this.nodeMap.get().get();
    Multimap<InternalNode, Split> placements = HashMultimap.create();
    NodeAssignmentStats stats = new NodeAssignmentStats(nodeTaskMap, currentNodes, existingTasks);
    int[] placedPerLevel = new int[topologicalSplitCounters.size()];
    Set<NetworkLocation> exhaustedLocations = new HashSet<>();
    Set<InternalNode> saturatedExactNodes = new HashSet<>();
    boolean anyNodeSplitPending = false;
    NodeProvider preferredNodeProvider = currentNodes.getActiveNodeProvider(nodeSelectionHashStrategy);
    for (Split split : splits) {
        SplitWeight weight = split.getSplitWeight();
        if (split.getNodeSelectionStrategy() == HARD_AFFINITY) {
            List<InternalNode> exactCandidates = selectExactNodes(currentNodes, split.getPreferredNodes(preferredNodeProvider), includeCoordinator);
            if (exactCandidates.isEmpty()) {
                log.debug("No nodes available to schedule %s. Available nodes %s", split, currentNodes.getActiveNodes());
                throw new PrestoException(NO_NODES_AVAILABLE, "No nodes available to run query");
            }
            InternalNode exactTarget = bestNodeSplitCount(weight, exactCandidates.iterator(), minCandidates, maxPendingSplitsWeightPerTask, stats);
            if (exactTarget != null) {
                placements.put(exactTarget, split);
                stats.addAssignedSplit(exactTarget, weight);
            } else if (!anyNodeSplitPending) {
                // The exact node set is irrelevant once some split is waiting for any node.
                saturatedExactNodes.addAll(exactCandidates);
            }
            continue;
        }

        int deepestLevel = networkLocationSegmentNames.size();
        Set<NetworkLocation> preferredLocations = new HashSet<>();
        for (HostAddress preferredHost : split.getPreferredNodes(preferredNodeProvider)) {
            preferredLocations.add(networkLocationCache.get(preferredHost));
        }
        if (preferredLocations.isEmpty()) {
            // No preference at all: search from the root location only.
            preferredLocations.add(ROOT_LOCATION);
            deepestLevel = 0;
        }

        // Walk from the most specific network level towards the root until a node is found.
        InternalNode target = null;
        int targetLevel = 0;
        int level = deepestLevel;
        while (level >= 0 && target == null) {
            for (NetworkLocation preferred : preferredLocations) {
                // Locations that could not be resolved sit at the "root" and are too shallow for deeper levels.
                if (preferred.getSegments().size() < level) {
                    continue;
                }
                NetworkLocation ancestor = preferred.subLocation(0, level);
                if (exhaustedLocations.contains(ancestor)) {
                    continue;
                }
                Set<InternalNode> workers = currentNodes.getActiveWorkersByNetworkPath().get(ancestor);
                ResettableRandomizedIterator<InternalNode> shuffledWorkers = new ResettableRandomizedIterator<>(workers);
                long pendingLimit = calculateMaxPendingSplitsWeightPerTask(level, deepestLevel);
                target = bestNodeSplitCount(weight, shuffledWorkers, minCandidates, pendingLimit, stats);
                if (target != null) {
                    targetLevel = level;
                    break;
                }
                exhaustedLocations.add(ancestor);
            }
            level--;
        }

        if (target == null) {
            anyNodeSplitPending = true;
        } else {
            placements.put(target, split);
            stats.addAssignedSplit(target, weight);
            placedPerLevel[targetLevel]++;
        }
    }

    // Publish the per-level placement counts gathered during this call.
    for (int level = 0; level < placedPerLevel.length; level++) {
        int placed = placedPerLevel[level];
        if (placed > 0) {
            topologicalSplitCounters.get(level).update(placed);
        }
    }

    long maxPendingForWildcardNetworkAffinity = calculateMaxPendingSplitsWeightPerTask(0, networkLocationSegmentNames.size());
    long lowWatermark = calculateLowWatermark(maxPendingForWildcardNetworkAffinity);
    ListenableFuture<?> blocked;
    if (anyNodeSplitPending) {
        blocked = toWhenHasSplitQueueSpaceFuture(existingTasks, lowWatermark);
    } else {
        blocked = toWhenHasSplitQueueSpaceFuture(saturatedExactNodes, existingTasks, lowWatermark);
    }
    return new SplitPlacementResult(blocked, placements);
}
Also used : NodeAssignmentStats(com.facebook.presto.execution.scheduler.NodeAssignmentStats) PrestoException(com.facebook.presto.spi.PrestoException) NodeProvider(com.facebook.presto.spi.NodeProvider) HostAddress(com.facebook.presto.spi.HostAddress) NetworkLocation(com.facebook.presto.execution.scheduler.NetworkLocation) SplitWeight(com.facebook.presto.spi.SplitWeight) ResettableRandomizedIterator(com.facebook.presto.execution.scheduler.ResettableRandomizedIterator) NodeMap(com.facebook.presto.execution.scheduler.NodeMap) BucketNodeMap(com.facebook.presto.execution.scheduler.BucketNodeMap) InternalNode(com.facebook.presto.metadata.InternalNode) Split(com.facebook.presto.metadata.Split) SplitPlacementResult(com.facebook.presto.execution.scheduler.SplitPlacementResult) HashSet(java.util.HashSet)

Example 58 with InternalNode

use of com.facebook.presto.metadata.InternalNode in project presto by prestodb.

Method toWhenHasSplitQueueSpaceFuture of the class NodeScheduler.

/**
 * Builds a future from the {@code whenSplitQueueHasSpace} futures of the tasks that run on the
 * given blocked nodes.
 *
 * <p>Tasks are matched to nodes by comparing {@code RemoteTask.getNodeId()} with
 * {@code InternalNode.getNodeIdentifier()}; blocked nodes without a matching task are ignored.
 * When there are no blocked nodes, or none of them has a task, an already-completed future is
 * returned. Otherwise the per-task futures are combined with {@code whenAnyCompleteCancelOthers}.
 *
 * @param blockedNodes nodes that could not accept a split
 * @param existingTasks tasks to look up by node id
 * @param weightSpaceThreshold threshold forwarded to {@code whenSplitQueueHasSpace}
 * @return the combined future, or an immediate future when nothing needs to be waited on
 */
public static ListenableFuture<?> toWhenHasSplitQueueSpaceFuture(Set<InternalNode> blockedNodes, List<RemoteTask> existingTasks, long weightSpaceThreshold) {
    if (blockedNodes.isEmpty()) {
        return immediateFuture(null);
    }

    // Index tasks by node id; a later task for the same node replaces an earlier one.
    Map<String, RemoteTask> tasksByNodeId = new HashMap<>();
    for (RemoteTask existing : existingTasks) {
        tasksByNodeId.put(existing.getNodeId(), existing);
    }

    ImmutableList.Builder<ListenableFuture<?>> queueSpaceFutures = ImmutableList.builder();
    for (InternalNode blockedNode : blockedNodes) {
        RemoteTask taskOnNode = tasksByNodeId.get(blockedNode.getNodeIdentifier());
        if (taskOnNode == null) {
            continue;
        }
        queueSpaceFutures.add(taskOnNode.whenSplitQueueHasSpace(weightSpaceThreshold));
    }

    List<ListenableFuture<?>> blockedFutures = queueSpaceFutures.build();
    if (blockedFutures.isEmpty()) {
        return immediateFuture(null);
    }
    return whenAnyCompleteCancelOthers(blockedFutures);
}
Also used : NodeTaskMap(com.facebook.presto.execution.NodeTaskMap) CounterStat(com.facebook.airlift.stats.CounterStat) MoreFutures.whenAnyCompleteCancelOthers(com.facebook.airlift.concurrent.MoreFutures.whenAnyCompleteCancelOthers) Suppliers.memoizeWithExpiration(com.google.common.base.Suppliers.memoizeWithExpiration) NodeSelectionStats(com.facebook.presto.execution.scheduler.nodeSelection.NodeSelectionStats) SystemSessionProperties.getResourceAwareSchedulingStrategy(com.facebook.presto.SystemSessionProperties.getResourceAwareSchedulingStrategy) Duration(io.airlift.units.Duration) ACTIVE(com.facebook.presto.spi.NodeState.ACTIVE) InetAddress(java.net.InetAddress) PreDestroy(javax.annotation.PreDestroy) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) InternalNodeManager(com.facebook.presto.metadata.InternalNodeManager) HashMultimap(com.google.common.collect.HashMultimap) SimpleTtlNodeSelector(com.facebook.presto.execution.scheduler.nodeSelection.SimpleTtlNodeSelector) SplitContext(com.facebook.presto.spi.SplitContext) Map(java.util.Map) ImmutableSetMultimap(com.google.common.collect.ImmutableSetMultimap) SystemSessionProperties.getMaxUnacknowledgedSplitsPerTask(com.facebook.presto.SystemSessionProperties.getMaxUnacknowledgedSplitsPerTask) ImmutableMap(com.google.common.collect.ImmutableMap) SimpleNodeSelector(com.facebook.presto.execution.scheduler.nodeSelection.SimpleNodeSelector) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) NetworkTopologyType(com.facebook.presto.execution.scheduler.NodeSchedulerConfig.NetworkTopologyType) HostAddress(com.facebook.presto.spi.HostAddress) Set(java.util.Set) NodeSelector(com.facebook.presto.execution.scheduler.nodeSelection.NodeSelector) Math.ceil(java.lang.Math.ceil) Math.min(java.lang.Math.min) MILLISECONDS(java.util.concurrent.TimeUnit.MILLISECONDS) Objects(java.util.Objects) List(java.util.List) 
TTL(com.facebook.presto.execution.scheduler.NodeSchedulerConfig.ResourceAwareSchedulingStrategy.TTL) NodeTtlFetcherManager(com.facebook.presto.ttl.nodettlfetchermanagers.NodeTtlFetcherManager) ConnectorId(com.facebook.presto.spi.ConnectorId) ResourceAwareSchedulingStrategy(com.facebook.presto.execution.scheduler.NodeSchedulerConfig.ResourceAwareSchedulingStrategy) ListenableFuture(com.google.common.util.concurrent.ListenableFuture) SimpleTtlNodeSelectorConfig(com.facebook.presto.execution.scheduler.nodeSelection.SimpleTtlNodeSelectorConfig) Supplier(com.google.common.base.Supplier) HashMap(java.util.HashMap) QueryManager(com.facebook.presto.execution.QueryManager) Multimap(com.google.common.collect.Multimap) Inject(javax.inject.Inject) HashSet(java.util.HashSet) ImmutableList(com.google.common.collect.ImmutableList) Objects.requireNonNull(java.util.Objects.requireNonNull) ImmutableSet.toImmutableSet(com.google.common.collect.ImmutableSet.toImmutableSet) ImmutableMultimap(com.google.common.collect.ImmutableMultimap) LinkedHashSet(java.util.LinkedHashSet) Futures.immediateFuture(com.google.common.util.concurrent.Futures.immediateFuture) ALIVE(com.facebook.presto.metadata.InternalNode.NodeStatus.ALIVE) Session(com.facebook.presto.Session) UnknownHostException(java.net.UnknownHostException) InternalNode(com.facebook.presto.metadata.InternalNode) RemoteTask(com.facebook.presto.execution.RemoteTask) TopologyAwareNodeSelector(com.facebook.presto.execution.scheduler.nodeSelection.TopologyAwareNodeSelector) Math.addExact(java.lang.Math.addExact) Split(com.facebook.presto.metadata.Split) SplitWeight(com.facebook.presto.spi.SplitWeight) SECONDS(java.util.concurrent.TimeUnit.SECONDS) HashMap(java.util.HashMap) Objects(java.util.Objects) RemoteTask(com.facebook.presto.execution.RemoteTask) ListenableFuture(com.google.common.util.concurrent.ListenableFuture) InternalNode(com.facebook.presto.metadata.InternalNode)

Example 59 with InternalNode

use of com.facebook.presto.metadata.InternalNode in project presto by prestodb.

Method chooseLeastBusyNode of the class SimpleNodeSelector.

/**
 * Chooses a node for a split, favouring the leading "preferred" entries of the candidate list.
 *
 * <p>Candidates are scanned in list order. Nodes whose status is {@code DEAD} and nodes at or
 * above {@code maxUnacknowledgedSplitsPerTask} are skipped. The first preferred node (index below
 * {@code preferredNodeCount}, when present) that passes {@code canAssignSplitBasedOnWeight} is
 * returned at once. Otherwise the assignable node with the smallest weight, as reported by
 * {@code splitWeightProvider}, is returned. The matching {@code nodeSelectionStats} counters are
 * incremented along the way.
 *
 * @param splitWeight weight of the split being placed
 * @param candidateNodes nodes to consider; preferred nodes come first
 * @param splitWeightProvider supplies the current weight of a node
 * @param preferredNodeCount number of leading candidates treated as preferred, if any
 * @param maxSplitsWeight weight limit passed to {@code canAssignSplitBasedOnWeight}
 * @param assignmentStats source of the unacknowledged split counts
 * @return the chosen node and whether it was a preferred one, or empty when no node qualifies
 */
protected Optional<InternalNodeInfo> chooseLeastBusyNode(SplitWeight splitWeight, List<InternalNode> candidateNodes, ToLongFunction<InternalNode> splitWeightProvider, OptionalInt preferredNodeCount, long maxSplitsWeight, NodeAssignmentStats assignmentStats) {
    // With no preferred count the limit is 0, so no index is ever treated as preferred.
    int preferredLimit = preferredNodeCount.orElse(0);
    InternalNode leastBusy = null;
    long leastBusyWeight = Long.MAX_VALUE;

    for (int index = 0; index < candidateNodes.size(); index++) {
        InternalNode candidate = candidateNodes.get(index);
        boolean preferred = index < preferredLimit;

        if (candidate.getNodeStatus() == DEAD) {
            // Node is down: never schedule on it, but record when a preferred node was lost.
            if (preferred) {
                nodeSelectionStats.incrementPreferredNonAliveNodeSkippedCount();
            }
            continue;
        }
        if (assignmentStats.getUnacknowledgedSplitCountForStage(candidate) >= maxUnacknowledgedSplitsPerTask) {
            continue;
        }

        long candidateWeight = splitWeightProvider.applyAsLong(candidate);
        if (!canAssignSplitBasedOnWeight(candidateWeight, maxSplitsWeight, splitWeight)) {
            continue;
        }

        // A preferred node that is not busy wins immediately.
        if (preferred) {
            if (index == 0) {
                nodeSelectionStats.incrementPrimaryPreferredNodeSelectedCount();
            } else {
                nodeSelectionStats.incrementNonPrimaryPreferredNodeSelectedCount();
            }
            return Optional.of(new InternalNodeInfo(candidate, true));
        }

        // Otherwise keep track of the least busy assignable node seen so far.
        if (candidateWeight < leastBusyWeight) {
            leastBusyWeight = candidateWeight;
            leastBusy = candidate;
        }
    }

    if (leastBusy != null) {
        nodeSelectionStats.incrementNonPreferredNodeSelectedCount();
        return Optional.of(new InternalNodeInfo(leastBusy, false));
    }
    return Optional.empty();
}
Also used : InternalNodeInfo(com.facebook.presto.execution.scheduler.InternalNodeInfo) InternalNode(com.facebook.presto.metadata.InternalNode)

Example 60 with InternalNode

use of com.facebook.presto.metadata.InternalNode in project presto by prestodb.

Method scheduleTasks of the class ScaledWriterScheduler.

/**
 * Schedules tasks on up to {@code count} nodes picked by
 * {@code nodeSelector.selectRandomNodes(count, scheduledNodes)}.
 *
 * <p>Each node for which {@code stage.scheduleTask} yields a task is added to
 * {@code scheduledNodes}. {@code checkCondition} is invoked with {@code NO_NODES_AVAILABLE} and
 * a condition that is false only when no node was scheduled before and none was selected now.
 *
 * @param count number of nodes to request; {@code 0} returns an empty list without selecting nodes
 * @return the tasks created by this call, in node iteration order
 */
private List<RemoteTask> scheduleTasks(int count) {
    if (count == 0) {
        return ImmutableList.of();
    }

    List<InternalNode> selectedNodes = nodeSelector.selectRandomNodes(count, scheduledNodes);
    boolean nothingToRunOn = scheduledNodes.isEmpty() && selectedNodes.isEmpty();
    checkCondition(!nothingToRunOn, NO_NODES_AVAILABLE, "No nodes available to run query");

    ImmutableList.Builder<RemoteTask> createdTasks = ImmutableList.builder();
    for (InternalNode selected : selectedNodes) {
        // The second argument is the number of nodes scheduled so far, so it grows as tasks are created.
        Optional<RemoteTask> scheduled = stage.scheduleTask(selected, scheduledNodes.size());
        if (scheduled.isPresent()) {
            createdTasks.add(scheduled.get());
            scheduledNodes.add(selected);
        }
    }
    return createdTasks.build();
}
Also used : ImmutableList(com.google.common.collect.ImmutableList) RemoteTask(com.facebook.presto.execution.RemoteTask) InternalNode(com.facebook.presto.metadata.InternalNode)

Aggregations

InternalNode (com.facebook.presto.metadata.InternalNode)74 Split (com.facebook.presto.metadata.Split)34 Test (org.testng.annotations.Test)34 ConnectorSplit (com.facebook.presto.spi.ConnectorSplit)25 HashSet (java.util.HashSet)24 ImmutableList (com.google.common.collect.ImmutableList)17 InMemoryNodeManager (com.facebook.presto.metadata.InMemoryNodeManager)14 SplitPlacementResult (com.facebook.presto.execution.scheduler.SplitPlacementResult)13 NodeSelectionStats (com.facebook.presto.execution.scheduler.nodeSelection.NodeSelectionStats)12 NodeSelector (com.facebook.presto.execution.scheduler.nodeSelection.NodeSelector)12 ImmutableSet (com.google.common.collect.ImmutableSet)12 SimpleTtlNodeSelectorConfig (com.facebook.presto.execution.scheduler.nodeSelection.SimpleTtlNodeSelectorConfig)11 ConnectorId (com.facebook.presto.spi.ConnectorId)11 TestingTransactionHandle (com.facebook.presto.testing.TestingTransactionHandle)11 Duration (io.airlift.units.Duration)11 URI (java.net.URI)11 Map (java.util.Map)11 RemoteTask (com.facebook.presto.execution.RemoteTask)10 NodeScheduler (com.facebook.presto.execution.scheduler.NodeScheduler)10 NodeSchedulerConfig (com.facebook.presto.execution.scheduler.NodeSchedulerConfig)9