Search in sources:

Example 1 with SplitWeight

use of com.facebook.presto.spi.SplitWeight in project presto by prestodb.

the class NodeScheduler method selectDistributionNodes.

/**
 * Places each split on the node that the bucket-to-node map assigns to it.
 *
 * <p>A split is only assigned when {@code canAssignSplitToDistributionNode} accepts it for the
 * target node under the given weight and unacknowledged-split limits; otherwise the node is
 * recorded as blocked and the split is left out of the result, which pushes back on scheduling.
 *
 * @param nodeMap current node map, used to build the assignment statistics
 * @param nodeTaskMap node/task bookkeeping, used to build the assignment statistics
 * @param maxSplitsWeightPerNode weight limit passed to the capacity check
 * @param maxPendingSplitsWeightPerTask pending-weight limit passed to the capacity check and used
 *        to derive the low watermark of the returned future
 * @param maxUnacknowledgedSplitsPerTask unacknowledged-split limit passed to the capacity check
 * @param splits splits to place
 * @param existingTasks tasks that already exist for this stage
 * @param bucketNodeMap supplies the assigned node and the cacheable flag for each split
 * @param nodeSelectionStats counters updated once per assigned split
 * @return the assignments made plus a future built from the blocked nodes
 */
public static SplitPlacementResult selectDistributionNodes(NodeMap nodeMap, NodeTaskMap nodeTaskMap, long maxSplitsWeightPerNode, long maxPendingSplitsWeightPerTask, int maxUnacknowledgedSplitsPerTask, Set<Split> splits, List<RemoteTask> existingTasks, BucketNodeMap bucketNodeMap, NodeSelectionStats nodeSelectionStats) {
    Multimap<InternalNode, Split> placements = HashMultimap.create();
    NodeAssignmentStats stats = new NodeAssignmentStats(nodeTaskMap, nodeMap, existingTasks);
    Set<InternalNode> nodesWithoutCapacity = new HashSet<>();
    for (Split pending : splits) {
        // The bucket-to-node map dictates the target; there is no alternative node to fall back to.
        // NOTE(review): get() is called without a presence check, so a split with no assigned node fails here.
        InternalNode target = bucketNodeMap.getAssignedNode(pending).get();
        boolean cacheable = bucketNodeMap.isSplitCacheable(pending);
        SplitWeight weight = pending.getSplitWeight();
        if (!canAssignSplitToDistributionNode(stats, target, maxSplitsWeightPerNode, maxPendingSplitsWeightPerTask, maxUnacknowledgedSplitsPerTask, weight)) {
            // Target is full: skip the split for now so the caller retries once space frees up.
            nodesWithoutCapacity.add(target);
            continue;
        }
        Split toAssign = pending;
        if (cacheable) {
            // Re-create the split with a context flagged as cacheable.
            toAssign = new Split(pending.getConnectorId(), pending.getTransactionHandle(), pending.getConnectorSplit(), pending.getLifespan(), new SplitContext(true));
            nodeSelectionStats.incrementBucketedPreferredNodeSelectedCount();
        } else {
            nodeSelectionStats.incrementBucketedNonPreferredNodeSelectedCount();
        }
        placements.put(target, toAssign);
        stats.addAssignedSplit(target, weight);
    }
    long lowWatermark = calculateLowWatermark(maxPendingSplitsWeightPerTask);
    ListenableFuture<?> whenSpaceAvailable = toWhenHasSplitQueueSpaceFuture(nodesWithoutCapacity, existingTasks, lowWatermark);
    return new SplitPlacementResult(whenSpaceAvailable, ImmutableMultimap.copyOf(placements));
}
Also used : SplitWeight(com.facebook.presto.spi.SplitWeight) SplitContext(com.facebook.presto.spi.SplitContext) InternalNode(com.facebook.presto.metadata.InternalNode) Split(com.facebook.presto.metadata.Split) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet)

Example 2 with SplitWeight

use of com.facebook.presto.spi.SplitWeight in project presto by prestodb.

the class SimpleNodeSelector method computeAssignments.

/**
 * Assigns each split to a node according to the split's node selection strategy.
 *
 * <ul>
 *   <li>{@code HARD_AFFINITY}: candidates are exactly the split's preferred nodes.</li>
 *   <li>{@code SOFT_AFFINITY}: candidates are the preferred nodes followed by randomly picked nodes.</li>
 *   <li>{@code NO_PREFERENCE}: candidates are randomly picked nodes only.</li>
 * </ul>
 *
 * <p>For each split, {@code chooseLeastBusyNode} is first tried against the total split weight
 * with {@code maxSplitsWeightPerNode}; if that yields nothing it is retried against the queued
 * split weight for the stage with {@code maxPendingSplitsWeightPerTask}. Splits that still find
 * no node are left unassigned and determine which future the result blocks on.
 *
 * @param splits splits to place
 * @param existingTasks tasks that already exist for this stage
 * @return the assignments made plus a future for when split queue space is available
 * @throws PrestoException with {@code NODE_SELECTION_NOT_SUPPORTED} for an unknown strategy, or
 *         {@code NO_NODES_AVAILABLE} when a split has no candidate nodes at all
 */
@Override
public SplitPlacementResult computeAssignments(Set<Split> splits, List<RemoteTask> existingTasks) {
    Multimap<InternalNode, Split> assignment = HashMultimap.create();
    NodeMap nodeMap = this.nodeMap.get().get();
    NodeAssignmentStats assignmentStats = new NodeAssignmentStats(nodeTaskMap, nodeMap, existingTasks);
    List<InternalNode> eligibleNodes = getEligibleNodes(maxTasksPerStage, nodeMap, existingTasks);
    NodeSelection randomNodeSelection = new RandomNodeSelection(eligibleNodes, minCandidates);
    Set<InternalNode> blockedExactNodes = new HashSet<>();
    boolean splitWaitingForAnyNode = false;
    NodeProvider activeNodeProvider = nodeMap.getActiveNodeProvider(nodeSelectionHashStrategy);
    for (Split split : splits) {
        // Scoped to this split on purpose: a count left over from an earlier affinity split must not
        // be applied to the candidate list of a later split that has no preferred nodes.
        OptionalInt preferredNodeCount = OptionalInt.empty();
        List<InternalNode> candidateNodes;
        switch (split.getNodeSelectionStrategy()) {
            case HARD_AFFINITY:
                candidateNodes = selectExactNodes(nodeMap, split.getPreferredNodes(activeNodeProvider), includeCoordinator);
                preferredNodeCount = OptionalInt.of(candidateNodes.size());
                break;
            case SOFT_AFFINITY: {
                // Using all nodes for soft affinity scheduling with modular hashing because otherwise temporarily down nodes would trigger too much rehashing.
                // The all-nodes provider is kept local to this case so it never replaces the active-node provider used for hard affinity.
                NodeProvider softAffinityNodeProvider = activeNodeProvider;
                if (nodeSelectionHashStrategy == MODULAR_HASHING) {
                    softAffinityNodeProvider = new ModularHashingNodeProvider(nodeMap.getAllNodes());
                }
                candidateNodes = selectExactNodes(nodeMap, split.getPreferredNodes(softAffinityNodeProvider), includeCoordinator);
                // Only the leading entries are preferred nodes; the random picks appended below are fallbacks.
                preferredNodeCount = OptionalInt.of(candidateNodes.size());
                candidateNodes = ImmutableList.<InternalNode>builder().addAll(candidateNodes).addAll(randomNodeSelection.pickNodes(split)).build();
                break;
            }
            case NO_PREFERENCE:
                candidateNodes = randomNodeSelection.pickNodes(split);
                break;
            default:
                throw new PrestoException(NODE_SELECTION_NOT_SUPPORTED, format("Unsupported node selection strategy %s", split.getNodeSelectionStrategy()));
        }
        if (candidateNodes.isEmpty()) {
            log.debug("No nodes available to schedule %s. Available nodes %s", split, nodeMap.getActiveNodes());
            throw new PrestoException(NO_NODES_AVAILABLE, "No nodes available to run query");
        }
        SplitWeight splitWeight = split.getSplitWeight();
        // First attempt: limit by total split weight on the node.
        Optional<InternalNodeInfo> chosenNodeInfo = chooseLeastBusyNode(splitWeight, candidateNodes, assignmentStats::getTotalSplitsWeight, preferredNodeCount, maxSplitsWeightPerNode, assignmentStats);
        if (!chosenNodeInfo.isPresent()) {
            // Second attempt: limit by the weight queued for this stage's task on the node.
            chosenNodeInfo = chooseLeastBusyNode(splitWeight, candidateNodes, assignmentStats::getQueuedSplitsWeightForStage, preferredNodeCount, maxPendingSplitsWeightPerTask, assignmentStats);
        }
        if (chosenNodeInfo.isPresent()) {
            // Re-create the split so its context carries the cacheable flag of the chosen node.
            split = new Split(split.getConnectorId(), split.getTransactionHandle(), split.getConnectorSplit(), split.getLifespan(), new SplitContext(chosenNodeInfo.get().isCacheable()));
            InternalNode chosenNode = chosenNodeInfo.get().getInternalNode();
            assignment.put(chosenNode, split);
            assignmentStats.addAssignedSplit(chosenNode, splitWeight);
        } else if (split.getNodeSelectionStrategy() != HARD_AFFINITY) {
            splitWaitingForAnyNode = true;
        } else if (!splitWaitingForAnyNode) {
            // Exact node set won't matter, if a split is waiting for any node
            blockedExactNodes.addAll(candidateNodes);
        }
    }
    ListenableFuture<?> blocked;
    if (splitWaitingForAnyNode) {
        blocked = toWhenHasSplitQueueSpaceFuture(existingTasks, calculateLowWatermark(maxPendingSplitsWeightPerTask));
    } else {
        blocked = toWhenHasSplitQueueSpaceFuture(blockedExactNodes, existingTasks, calculateLowWatermark(maxPendingSplitsWeightPerTask));
    }
    return new SplitPlacementResult(blocked, assignment);
}
Also used : NodeAssignmentStats(com.facebook.presto.execution.scheduler.NodeAssignmentStats) InternalNodeInfo(com.facebook.presto.execution.scheduler.InternalNodeInfo) PrestoException(com.facebook.presto.spi.PrestoException) NodeProvider(com.facebook.presto.spi.NodeProvider) ModularHashingNodeProvider(com.facebook.presto.execution.scheduler.ModularHashingNodeProvider) OptionalInt(java.util.OptionalInt) SplitWeight(com.facebook.presto.spi.SplitWeight) SplitContext(com.facebook.presto.spi.SplitContext) BucketNodeMap(com.facebook.presto.execution.scheduler.BucketNodeMap) NodeMap(com.facebook.presto.execution.scheduler.NodeMap) InternalNode(com.facebook.presto.metadata.InternalNode) Split(com.facebook.presto.metadata.Split) SplitPlacementResult(com.facebook.presto.execution.scheduler.SplitPlacementResult) Sets.newHashSet(com.google.common.collect.Sets.newHashSet) HashSet(java.util.HashSet)

Example 3 with SplitWeight

use of com.facebook.presto.spi.SplitWeight in project presto by prestodb.

the class TestNodeScheduler method testMoreSplitsAssignedWhenSplitsWeightsAreSmall.

/**
 * Checks that limits are enforced by split weight rather than split count: with splits at half
 * the standard weight, twice as many splits fit under the per-node limit and under the per-task
 * pending limit, and nothing more is assigned once both limits are reached.
 */
@Test
public void testMoreSplitsAssignedWhenSplitsWeightsAreSmall() {
    int splitsPerNodeLimit = nodeSchedulerConfig.getMaxSplitsPerNode();
    int pendingPerTaskLimit = nodeSchedulerConfig.getMaxPendingSplitsPerTask();
    int saturatingSplitCount = splitsPerNodeLimit + pendingPerTaskLimit;
    long nodeWeightLimit = SplitWeight.rawValueForStandardSplitCount(splitsPerNodeLimit);
    long pendingWeightLimit = SplitWeight.rawValueForStandardSplitCount(pendingPerTaskLimit);
    long saturatingWeight = nodeWeightLimit + pendingWeightLimit;

    // Selector and task for a single worker node
    nodeSelector = nodeScheduler.createNodeSelector(session, CONNECTOR_ID, 1);
    InternalNode worker = nodeSelector.selectRandomNodes(1).get(0);
    MockRemoteTaskFactory taskFactory = new MockRemoteTaskFactory(remoteTaskExecutor, remoteTaskScheduledExecutor);
    TaskId workerTaskId = new TaskId("test", 1, 0, 1);
    MockRemoteTaskFactory.MockRemoteTask workerTask = taskFactory.createTableScanTask(workerTaskId, worker, ImmutableList.of(), nodeTaskMap.createTaskStatsTracker(worker, workerTaskId));
    TestingTransactionHandle txHandle = TestingTransactionHandle.create();

    // Half-weight splits, twice as many as the standard split count limit
    int halfWeightSplitCount = saturatingSplitCount * 2;
    SplitWeight halfOfStandard = SplitWeight.fromProportion(0.5);
    ImmutableSet.Builder<Split> allSplits = ImmutableSet.builderWithExpectedSize(halfWeightSplitCount);
    for (int created = 0; created < halfWeightSplitCount; created++) {
        allSplits.add(new Split(CONNECTOR_ID, txHandle, new TestSplitRemote(halfOfStandard)));
    }
    Set<Split> splits = allSplits.build();
    // The generated splits add up to exactly the combined weight limit
    assertEquals(SplitWeight.rawValueSum(splits, Split::getSplitWeight), saturatingWeight);

    // First round: the per-node weight limit is what stops assignment
    SplitPlacementResult firstRound = nodeSelector.computeAssignments(splits, ImmutableList.of(workerTask));
    assertEquals(firstRound.getAssignments().get(worker).size(), splitsPerNodeLimit * 2);
    assertEquals(SplitWeight.rawValueSum(firstRound.getAssignments().get(worker), Split::getSplitWeight), nodeWeightLimit);

    // Hand the first round to the task and mark every one of those splits as running
    PlanNodeId sourceId = new PlanNodeId("sourceId");
    task:
    {
        workerTask.addSplits(ImmutableMultimap.<PlanNodeId, Split>builder().putAll(sourceId, firstRound.getAssignments().get(worker)).build());
        workerTask.startSplits(firstRound.getAssignments().get(worker).size());
    }

    // Second round: the per-task pending weight limit is what stops assignment
    Set<Split> unassigned = Sets.difference(splits, ImmutableSet.copyOf(firstRound.getAssignments().get(worker)));
    SplitPlacementResult secondRound = nodeSelector.computeAssignments(unassigned, ImmutableList.of(workerTask));
    assertEquals(secondRound.getAssignments().get(worker).size(), pendingPerTaskLimit * 2);
    assertEquals(SplitWeight.rawValueSum(secondRound.getAssignments().get(worker), Split::getSplitWeight), pendingWeightLimit);
    workerTask.addSplits(ImmutableMultimap.<PlanNodeId, Split>builder().putAll(new PlanNodeId("sourceId"), secondRound.getAssignments().get(worker)).build());

    // 2x fully loaded standard count, full weight limit reached
    PartitionedSplitsInfo expectedOnNode = PartitionedSplitsInfo.forSplitCountAndWeightSum(halfWeightSplitCount, saturatingWeight);
    assertEquals(nodeTaskMap.getPartitionedSplitsOnNode(worker), expectedOnNode);

    // A further split is not assigned once the node is full
    Split oneMore = new Split(CONNECTOR_ID, txHandle, new TestSplitRemote());
    SplitPlacementResult whenSaturated = nodeSelector.computeAssignments(ImmutableSet.of(oneMore), ImmutableList.of(workerTask));
    assertTrue(whenSaturated.getAssignments().isEmpty());
}
Also used : PlanNodeId(com.facebook.presto.spi.plan.PlanNodeId) ImmutableSet(com.google.common.collect.ImmutableSet) ImmutableSet.toImmutableSet(com.google.common.collect.ImmutableSet.toImmutableSet) SplitWeight(com.facebook.presto.spi.SplitWeight) InternalNode(com.facebook.presto.metadata.InternalNode) TestingTransactionHandle(com.facebook.presto.testing.TestingTransactionHandle) ConnectorSplit(com.facebook.presto.spi.ConnectorSplit) Split(com.facebook.presto.metadata.Split) SplitPlacementResult(com.facebook.presto.execution.scheduler.SplitPlacementResult) Test(org.testng.annotations.Test)

Example 4 with SplitWeight

use of com.facebook.presto.spi.SplitWeight in project presto by prestodb.

the class SimpleTtlNodeSelector method computeAssignments.

/**
 * Assigns splits to nodes for TTL-aware scheduling.
 *
 * <p>Only {@code NO_PREFERENCE} splits are handled here. If any split in the batch uses another
 * strategy, the whole batch is delegated to {@code simpleNodeSelector}. Otherwise each split gets
 * randomly picked candidates and is placed via {@code simpleNodeSelector.chooseLeastBusyNode},
 * first against the total split weight per node and then against the queued weight for the stage.
 *
 * @param splits splits to place
 * @param existingTasks tasks that already exist for this stage
 * @return the assignments made, with a future that is already complete unless some split found no node
 * @throws PrestoException with {@code NO_NODES_AVAILABLE} when a split has no candidate nodes
 */
@Override
public SplitPlacementResult computeAssignments(Set<Split> splits, List<RemoteTask> existingTasks) {
    // Current NodeSelectionStrategy support is limited to NO_PREFERENCE
    boolean hasUnsupportedStrategy = splits.stream().anyMatch(candidate -> candidate.getNodeSelectionStrategy() != NodeSelectionStrategy.NO_PREFERENCE);
    if (hasUnsupportedStrategy) {
        return simpleNodeSelector.computeAssignments(splits, existingTasks);
    }

    ImmutableMultimap.Builder<InternalNode, Split> placements = ImmutableMultimap.builder();
    NodeMap currentNodeMap = this.nodeMap.get().get();
    NodeAssignmentStats stats = new NodeAssignmentStats(nodeTaskMap, currentNodeMap, existingTasks);
    List<InternalNode> eligible = getEligibleNodes(maxTasksPerStage, currentNodeMap, existingTasks);
    NodeSelection randomPicker = new RandomNodeSelection(eligible, minCandidates);
    // There are never preferred nodes for NO_PREFERENCE splits, so this stays empty throughout.
    OptionalInt noPreferredNodes = OptionalInt.empty();
    boolean someSplitUnplaced = false;

    for (Split pending : splits) {
        // Defensive re-check; the batch was already screened above.
        if (pending.getNodeSelectionStrategy() != NodeSelectionStrategy.NO_PREFERENCE) {
            throw new PrestoException(NODE_SELECTION_NOT_SUPPORTED, format("Unsupported node selection strategy for TTL scheduling: %s", pending.getNodeSelectionStrategy()));
        }
        List<InternalNode> candidates = randomPicker.pickNodes(pending);
        if (candidates.isEmpty()) {
            log.warn("No nodes available to schedule %s. Available nodes %s", pending, currentNodeMap.getActiveNodes());
            throw new PrestoException(NO_NODES_AVAILABLE, "No nodes available to run query");
        }
        SplitWeight weight = pending.getSplitWeight();
        // First attempt is limited by total weight on the node, the fallback by weight queued for this stage.
        Optional<InternalNodeInfo> selection = simpleNodeSelector.chooseLeastBusyNode(weight, candidates, stats::getTotalSplitsWeight, noPreferredNodes, maxSplitsWeightPerNode, stats);
        if (!selection.isPresent()) {
            selection = simpleNodeSelector.chooseLeastBusyNode(weight, candidates, stats::getQueuedSplitsWeightForStage, noPreferredNodes, maxPendingSplitsWeightPerTask, stats);
        }
        if (!selection.isPresent()) {
            someSplitUnplaced = true;
            continue;
        }
        InternalNodeInfo selected = selection.get();
        // Re-create the split so its context carries the cacheable flag of the selected node.
        Split placed = new Split(pending.getConnectorId(), pending.getTransactionHandle(), pending.getConnectorSplit(), pending.getLifespan(), new SplitContext(selected.isCacheable()));
        InternalNode destination = selection.get().getInternalNode();
        placements.put(destination, placed);
        stats.addAssignedSplit(destination, weight);
    }

    ListenableFuture<?> blocked;
    if (someSplitUnplaced) {
        blocked = toWhenHasSplitQueueSpaceFuture(existingTasks, calculateLowWatermark(maxPendingSplitsWeightPerTask));
    } else {
        blocked = immediateFuture(null);
    }
    return new SplitPlacementResult(blocked, placements.build());
}
Also used : NodeAssignmentStats(com.facebook.presto.execution.scheduler.NodeAssignmentStats) InternalNodeInfo(com.facebook.presto.execution.scheduler.InternalNodeInfo) PrestoException(com.facebook.presto.spi.PrestoException) OptionalInt(java.util.OptionalInt) SplitWeight(com.facebook.presto.spi.SplitWeight) SplitContext(com.facebook.presto.spi.SplitContext) BucketNodeMap(com.facebook.presto.execution.scheduler.BucketNodeMap) NodeMap(com.facebook.presto.execution.scheduler.NodeMap) ImmutableMultimap(com.google.common.collect.ImmutableMultimap) InternalNode(com.facebook.presto.metadata.InternalNode) Split(com.facebook.presto.metadata.Split) SplitPlacementResult(com.facebook.presto.execution.scheduler.SplitPlacementResult)

Example 5 with SplitWeight

use of com.facebook.presto.spi.SplitWeight in project presto by prestodb.

the class TopologyAwareNodeSelector method computeAssignments.

/**
 * Assigns splits to nodes, preferring nodes that are close to a split's preferred hosts in the
 * network topology.
 *
 * <p>{@code HARD_AFFINITY} splits are placed only on their exact preferred nodes. Every other
 * split is tried at the network locations of its preferred hosts, starting at full depth and
 * moving to progressively shallower locations until {@code bestNodeSplitCount} returns a node.
 * Locations that yielded no node are remembered for the rest of the call and skipped afterwards.
 * The number of splits placed at each depth is reported to {@code topologicalSplitCounters}.
 *
 * @param splits splits to place
 * @param existingTasks tasks that already exist for this stage
 * @return the assignments made plus a future for when split queue space is available
 * @throws PrestoException with {@code NO_NODES_AVAILABLE} when a hard-affinity split has no
 *         candidate nodes
 */
@Override
public SplitPlacementResult computeAssignments(Set<Split> splits, List<RemoteTask> existingTasks) {
    NodeMap currentNodeMap = this.nodeMap.get().get();
    Multimap<InternalNode, Split> placements = HashMultimap.create();
    NodeAssignmentStats stats = new NodeAssignmentStats(nodeTaskMap, currentNodeMap, existingTasks);
    int[] placedPerDepth = new int[topologicalSplitCounters.size()];
    Set<NetworkLocation> exhaustedLocations = new HashSet<>();
    Set<InternalNode> blockedExactNodes = new HashSet<>();
    boolean someSplitNeedsAnyNode = false;
    NodeProvider provider = currentNodeMap.getActiveNodeProvider(nodeSelectionHashStrategy);

    for (Split pending : splits) {
        SplitWeight weight = pending.getSplitWeight();

        if (pending.getNodeSelectionStrategy() == HARD_AFFINITY) {
            List<InternalNode> exactNodes = selectExactNodes(currentNodeMap, pending.getPreferredNodes(provider), includeCoordinator);
            if (exactNodes.isEmpty()) {
                log.debug("No nodes available to schedule %s. Available nodes %s", pending, currentNodeMap.getActiveNodes());
                throw new PrestoException(NO_NODES_AVAILABLE, "No nodes available to run query");
            }
            InternalNode exactChoice = bestNodeSplitCount(weight, exactNodes.iterator(), minCandidates, maxPendingSplitsWeightPerTask, stats);
            if (exactChoice != null) {
                placements.put(exactChoice, pending);
                stats.addAssignedSplit(exactChoice, weight);
            } else if (!someSplitNeedsAnyNode) {
                // Exact node set won't matter, if a split is waiting for any node
                blockedExactNodes.addAll(exactNodes);
            }
            continue;
        }

        // Collect the network locations of the split's preferred hosts.
        Set<NetworkLocation> preferredLocations = new HashSet<>();
        for (HostAddress preferredHost : pending.getPreferredNodes(provider)) {
            preferredLocations.add(networkLocationCache.get(preferredHost));
        }
        int startDepth = networkLocationSegmentNames.size();
        if (preferredLocations.isEmpty()) {
            // No preferred hosts: search from the root location only.
            preferredLocations.add(ROOT_LOCATION);
            startDepth = 0;
        }

        InternalNode picked = null;
        int pickedDepth = 0;
        // Try each address at progressively shallower network locations
        for (int level = startDepth; level >= 0 && picked == null; level--) {
            for (NetworkLocation preferred : preferredLocations) {
                // A location with fewer segments than this level cannot be truncated to it.
                // For example, locations which couldn't be located will be at the "root" location
                if (preferred.getSegments().size() < level) {
                    continue;
                }
                NetworkLocation truncated = preferred.subLocation(0, level);
                if (exhaustedLocations.contains(truncated)) {
                    continue;
                }
                Set<InternalNode> workersAtLocation = currentNodeMap.getActiveWorkersByNetworkPath().get(truncated);
                picked = bestNodeSplitCount(weight, new ResettableRandomizedIterator<>(workersAtLocation), minCandidates, calculateMaxPendingSplitsWeightPerTask(level, startDepth), stats);
                if (picked == null) {
                    // Nothing usable here; do not look at this location again during this call.
                    exhaustedLocations.add(truncated);
                    continue;
                }
                pickedDepth = level;
                break;
            }
        }

        if (picked == null) {
            someSplitNeedsAnyNode = true;
        } else {
            placements.put(picked, pending);
            stats.addAssignedSplit(picked, weight);
            placedPerDepth[pickedDepth]++;
        }
    }

    // Report how many splits were placed at each topology depth.
    for (int level = 0; level < placedPerDepth.length; level++) {
        int placedAtLevel = placedPerDepth[level];
        if (placedAtLevel > 0) {
            topologicalSplitCounters.get(level).update(placedAtLevel);
        }
    }

    long maxPendingForWildcardNetworkAffinity = calculateMaxPendingSplitsWeightPerTask(0, networkLocationSegmentNames.size());
    long lowWatermark = calculateLowWatermark(maxPendingForWildcardNetworkAffinity);
    ListenableFuture<?> blocked;
    if (someSplitNeedsAnyNode) {
        blocked = toWhenHasSplitQueueSpaceFuture(existingTasks, lowWatermark);
    } else {
        blocked = toWhenHasSplitQueueSpaceFuture(blockedExactNodes, existingTasks, lowWatermark);
    }
    return new SplitPlacementResult(blocked, placements);
}
Also used : NodeAssignmentStats(com.facebook.presto.execution.scheduler.NodeAssignmentStats) PrestoException(com.facebook.presto.spi.PrestoException) NodeProvider(com.facebook.presto.spi.NodeProvider) HostAddress(com.facebook.presto.spi.HostAddress) NetworkLocation(com.facebook.presto.execution.scheduler.NetworkLocation) SplitWeight(com.facebook.presto.spi.SplitWeight) ResettableRandomizedIterator(com.facebook.presto.execution.scheduler.ResettableRandomizedIterator) NodeMap(com.facebook.presto.execution.scheduler.NodeMap) BucketNodeMap(com.facebook.presto.execution.scheduler.BucketNodeMap) InternalNode(com.facebook.presto.metadata.InternalNode) Split(com.facebook.presto.metadata.Split) SplitPlacementResult(com.facebook.presto.execution.scheduler.SplitPlacementResult) HashSet(java.util.HashSet)

Aggregations

InternalNode (com.facebook.presto.metadata.InternalNode)5 Split (com.facebook.presto.metadata.Split)5 SplitWeight (com.facebook.presto.spi.SplitWeight)5 SplitPlacementResult (com.facebook.presto.execution.scheduler.SplitPlacementResult)4 BucketNodeMap (com.facebook.presto.execution.scheduler.BucketNodeMap)3 NodeAssignmentStats (com.facebook.presto.execution.scheduler.NodeAssignmentStats)3 NodeMap (com.facebook.presto.execution.scheduler.NodeMap)3 PrestoException (com.facebook.presto.spi.PrestoException)3 SplitContext (com.facebook.presto.spi.SplitContext)3 HashSet (java.util.HashSet)3 InternalNodeInfo (com.facebook.presto.execution.scheduler.InternalNodeInfo)2 NodeProvider (com.facebook.presto.spi.NodeProvider)2 OptionalInt (java.util.OptionalInt)2 ModularHashingNodeProvider (com.facebook.presto.execution.scheduler.ModularHashingNodeProvider)1 NetworkLocation (com.facebook.presto.execution.scheduler.NetworkLocation)1 ResettableRandomizedIterator (com.facebook.presto.execution.scheduler.ResettableRandomizedIterator)1 ConnectorSplit (com.facebook.presto.spi.ConnectorSplit)1 HostAddress (com.facebook.presto.spi.HostAddress)1 PlanNodeId (com.facebook.presto.spi.plan.PlanNodeId)1 TestingTransactionHandle (com.facebook.presto.testing.TestingTransactionHandle)1