Search in sources :

Example 1 with ModularHashingNodeProvider

use of com.facebook.presto.execution.scheduler.ModularHashingNodeProvider in project presto by prestodb.

In class SimpleNodeSelector, method computeAssignments.

/**
 * Picks a node for each of the given splits and reports when the caller may retry
 * the splits that could not be placed.
 *
 * <p>Candidate nodes depend on the split's node selection strategy:
 * <ul>
 *   <li>{@code HARD_AFFINITY}: only the split's preferred nodes;</li>
 *   <li>{@code SOFT_AFFINITY}: the preferred nodes followed by randomly picked nodes;</li>
 *   <li>{@code NO_PREFERENCE}: randomly picked nodes only.</li>
 * </ul>
 * A node is first looked for using the total split weight per node
 * ({@code maxSplitsWeightPerNode}); if none qualifies, a second attempt uses the
 * queued split weight for this stage ({@code maxPendingSplitsWeightPerTask}).
 *
 * <p>All per-split state (preferred node count, node provider) is kept independent
 * between iterations so the outcome does not depend on the iteration order of
 * {@code splits}.
 *
 * @param splits the splits to place
 * @param existingTasks tasks already running for this stage
 * @return the assignments made plus a future used by the caller to wait for queue space
 * @throws PrestoException with {@code NODE_SELECTION_NOT_SUPPORTED} for an unknown strategy,
 *         or {@code NO_NODES_AVAILABLE} when a split has no candidate node at all
 */
@Override
public SplitPlacementResult computeAssignments(Set<Split> splits, List<RemoteTask> existingTasks) {
    Multimap<InternalNode, Split> assignment = HashMultimap.create();
    NodeMap nodeMap = this.nodeMap.get().get();
    NodeAssignmentStats assignmentStats = new NodeAssignmentStats(nodeTaskMap, nodeMap, existingTasks);
    List<InternalNode> eligibleNodes = getEligibleNodes(maxTasksPerStage, nodeMap, existingTasks);
    NodeSelection randomNodeSelection = new RandomNodeSelection(eligibleNodes, minCandidates);
    Set<InternalNode> blockedExactNodes = new HashSet<>();
    boolean splitWaitingForAnyNode = false;
    // Provider over active nodes; never reassigned, so hard-affinity splits always see it.
    NodeProvider nodeProvider = nodeMap.getActiveNodeProvider(nodeSelectionHashStrategy);
    // Provider used for soft-affinity splits only; created on first use and then reused.
    NodeProvider softAffinityNodeProvider = null;
    for (Split split : splits) {
        List<InternalNode> candidateNodes;
        // Scoped to this split so a NO_PREFERENCE split never inherits a stale count.
        OptionalInt preferredNodeCount = OptionalInt.empty();
        switch (split.getNodeSelectionStrategy()) {
            case HARD_AFFINITY:
                candidateNodes = selectExactNodes(nodeMap, split.getPreferredNodes(nodeProvider), includeCoordinator);
                preferredNodeCount = OptionalInt.of(candidateNodes.size());
                break;
            case SOFT_AFFINITY:
                if (softAffinityNodeProvider == null) {
                    // Using all nodes for soft affinity scheduling with modular hashing because otherwise temporarily down nodes would trigger too much rehashing
                    softAffinityNodeProvider = (nodeSelectionHashStrategy == MODULAR_HASHING)
                            ? new ModularHashingNodeProvider(nodeMap.getAllNodes())
                            : nodeProvider;
                }
                candidateNodes = selectExactNodes(nodeMap, split.getPreferredNodes(softAffinityNodeProvider), includeCoordinator);
                preferredNodeCount = OptionalInt.of(candidateNodes.size());
                // Preferred nodes come first; random nodes act as fallback candidates.
                candidateNodes = ImmutableList.<InternalNode>builder().addAll(candidateNodes).addAll(randomNodeSelection.pickNodes(split)).build();
                break;
            case NO_PREFERENCE:
                candidateNodes = randomNodeSelection.pickNodes(split);
                break;
            default:
                throw new PrestoException(NODE_SELECTION_NOT_SUPPORTED, format("Unsupported node selection strategy %s", split.getNodeSelectionStrategy()));
        }
        if (candidateNodes.isEmpty()) {
            log.debug("No nodes available to schedule %s. Available nodes %s", split, nodeMap.getActiveNodes());
            throw new PrestoException(NO_NODES_AVAILABLE, "No nodes available to run query");
        }
        SplitWeight splitWeight = split.getSplitWeight();
        Optional<InternalNodeInfo> chosenNodeInfo = chooseLeastBusyNode(splitWeight, candidateNodes, assignmentStats::getTotalSplitsWeight, preferredNodeCount, maxSplitsWeightPerNode, assignmentStats);
        if (!chosenNodeInfo.isPresent()) {
            // Second attempt: compare against the per-task pending queue limit instead.
            chosenNodeInfo = chooseLeastBusyNode(splitWeight, candidateNodes, assignmentStats::getQueuedSplitsWeightForStage, preferredNodeCount, maxPendingSplitsWeightPerTask, assignmentStats);
        }
        if (chosenNodeInfo.isPresent()) {
            InternalNodeInfo chosen = chosenNodeInfo.get();
            // Re-create the split so its context carries the cacheable flag of the chosen node.
            Split scheduledSplit = new Split(split.getConnectorId(), split.getTransactionHandle(), split.getConnectorSplit(), split.getLifespan(), new SplitContext(chosen.isCacheable()));
            InternalNode chosenNode = chosen.getInternalNode();
            assignment.put(chosenNode, scheduledSplit);
            assignmentStats.addAssignedSplit(chosenNode, splitWeight);
        } else if (split.getNodeSelectionStrategy() != HARD_AFFINITY) {
            splitWaitingForAnyNode = true;
        } else if (!splitWaitingForAnyNode) {
            // Exact node set won't matter, if a split is waiting for any node
            blockedExactNodes.addAll(candidateNodes);
        }
    }
    ListenableFuture<?> blocked;
    if (splitWaitingForAnyNode) {
        blocked = toWhenHasSplitQueueSpaceFuture(existingTasks, calculateLowWatermark(maxPendingSplitsWeightPerTask));
    } else {
        blocked = toWhenHasSplitQueueSpaceFuture(blockedExactNodes, existingTasks, calculateLowWatermark(maxPendingSplitsWeightPerTask));
    }
    return new SplitPlacementResult(blocked, assignment);
}
Also used : NodeAssignmentStats(com.facebook.presto.execution.scheduler.NodeAssignmentStats) InternalNodeInfo(com.facebook.presto.execution.scheduler.InternalNodeInfo) PrestoException(com.facebook.presto.spi.PrestoException) NodeProvider(com.facebook.presto.spi.NodeProvider) ModularHashingNodeProvider(com.facebook.presto.execution.scheduler.ModularHashingNodeProvider) OptionalInt(java.util.OptionalInt) ModularHashingNodeProvider(com.facebook.presto.execution.scheduler.ModularHashingNodeProvider) SplitWeight(com.facebook.presto.spi.SplitWeight) SplitContext(com.facebook.presto.spi.SplitContext) BucketNodeMap(com.facebook.presto.execution.scheduler.BucketNodeMap) NodeMap(com.facebook.presto.execution.scheduler.NodeMap) InternalNode(com.facebook.presto.metadata.InternalNode) Split(com.facebook.presto.metadata.Split) SplitPlacementResult(com.facebook.presto.execution.scheduler.SplitPlacementResult) Sets.newHashSet(com.google.common.collect.Sets.newHashSet) HashSet(java.util.HashSet)

Example 2 with ModularHashingNodeProvider

use of com.facebook.presto.execution.scheduler.ModularHashingNodeProvider in project presto by prestodb.

In class TestNodeScheduler, method testTopologyAwareScheduling.

/**
 * Exercises split placement with a network topology configured ("test") and three
 * workers whose host names place two on rack1 and one on rack2. The test walks through
 * splits whose data is on another rack, then rack-local splits, then host-local splits,
 * checking after each phase how many splits were left unassigned.
 */
@Test(timeOut = 60 * 1000)
public void testTopologyAwareScheduling() throws Exception {
    TestingTransactionHandle txHandle = TestingTransactionHandle.create();
    NodeTaskMap nodeTaskMap = new NodeTaskMap(finalizerService);
    InMemoryNodeManager nodeManager = new InMemoryNodeManager();
    List<InternalNode> workers = ImmutableList.of(
            new InternalNode("node1", URI.create("http://host1.rack1:11"), NodeVersion.UNKNOWN, false),
            new InternalNode("node2", URI.create("http://host2.rack1:12"), NodeVersion.UNKNOWN, false),
            new InternalNode("node3", URI.create("http://host3.rack2:13"), NodeVersion.UNKNOWN, false));
    nodeManager.addNode(CONNECTOR_ID, workers);

    // Node -> task mapping for the stage currently being scheduled
    Map<InternalNode, RemoteTask> tasksByNode = new HashMap<>();
    NodeSchedulerConfig schedulerConfig = new NodeSchedulerConfig()
            .setMaxSplitsPerNode(25)
            .setIncludeCoordinator(false)
            .setNetworkTopology("test")
            .setMaxPendingSplitsPerTask(20);
    TestNetworkTopology topology = new TestNetworkTopology();
    NetworkLocationCache locations = new NetworkLocationCache(topology) {

        @Override
        public NetworkLocation get(HostAddress host) {
            // Workers are looked up only once and would all come back unresolved from the
            // cache, so resolve them straight from the topology instead
            if (!host.getHostText().startsWith("host")) {
                return super.get(host);
            }
            return topology.locate(host);
        }
    };
    NodeScheduler scheduler = new NodeScheduler(locations, topology, nodeManager, new NodeSelectionStats(), schedulerConfig, nodeTaskMap, new Duration(5, SECONDS), new ThrowingNodeTtlFetcherManager(), new NoOpQueryManager(), new SimpleTtlNodeSelectorConfig());
    NodeSelector selector = scheduler.createNodeSelector(session, CONNECTOR_ID);

    // Phase 1: load the nodes with splits whose data is not local to any worker rack
    ImmutableSet.Builder<Split> otherRackBuilder = ImmutableSet.builder();
    int otherRackSplitCount = (25 + 11) * 3;
    for (int i = 0; i < otherRackSplitCount; i++) {
        otherRackBuilder.add(new Split(CONNECTOR_ID, txHandle, new TestSplitRemote(HostAddress.fromParts("data.other_rack", 1))));
    }
    Set<Split> otherRackSplits = otherRackBuilder.build();
    Multimap<InternalNode, Split> placed = selector.computeAssignments(otherRackSplits, ImmutableList.copyOf(tasksByNode.values())).getAssignments();
    MockRemoteTaskFactory taskFactory = new MockRemoteTaskFactory(remoteTaskExecutor, remoteTaskScheduledExecutor);
    int nextTaskIndex = 0;
    for (InternalNode node : placed.keySet()) {
        TaskId taskId = new TaskId("test", 1, 0, nextTaskIndex++);
        MockRemoteTaskFactory.MockRemoteTask createdTask = taskFactory.createTableScanTask(taskId, node, ImmutableList.copyOf(placed.get(node)), nodeTaskMap.createTaskStatsTracker(node, taskId));
        createdTask.startSplits(25);
        nodeTaskMap.addTask(node, createdTask);
        tasksByNode.put(node, createdTask);
    }

    // Schedule the leftovers so that the task queues are partially filled as well
    otherRackSplits = Sets.difference(otherRackSplits, new HashSet<>(placed.values()));
    placed = selector.computeAssignments(otherRackSplits, ImmutableList.copyOf(tasksByNode.values())).getAssignments();
    for (InternalNode node : placed.keySet()) {
        RemoteTask existingTask = tasksByNode.get(node);
        existingTask.addSplits(ImmutableMultimap.<PlanNodeId, Split>builder()
                .putAll(new PlanNodeId("sourceId"), placed.get(node))
                .build());
    }
    otherRackSplits = Sets.difference(otherRackSplits, new HashSet<>(placed.values()));
    // Three non-local splits are expected to remain unplaced
    assertEquals(otherRackSplits.size(), 3);

    // Phase 2: rack-local splits (12 for rack1's data host, 6 for rack2's)
    HostAddress dataHost1 = HostAddress.fromParts("data.rack1", 1);
    HostAddress dataHost2 = HostAddress.fromParts("data.rack2", 1);
    ImmutableSet.Builder<Split> rackLocalBuilder = ImmutableSet.builder();
    for (int i = 0; i < 6 * 2; i++) {
        rackLocalBuilder.add(new Split(CONNECTOR_ID, txHandle, new TestSplitRemote(dataHost1)));
    }
    for (int i = 0; i < 6; i++) {
        rackLocalBuilder.add(new Split(CONNECTOR_ID, txHandle, new TestSplitRemote(dataHost2)));
    }
    Set<Split> rackLocalSplits = rackLocalBuilder.build();
    placed = selector.computeAssignments(rackLocalSplits, ImmutableList.copyOf(tasksByNode.values())).getAssignments();
    for (InternalNode node : placed.keySet()) {
        RemoteTask existingTask = tasksByNode.get(node);
        existingTask.addSplits(ImmutableMultimap.<PlanNodeId, Split>builder()
                .putAll(new PlanNodeId("sourceId"), placed.get(node))
                .build());
    }
    Set<Split> unassigned = Sets.difference(rackLocalSplits, new HashSet<>(placed.values()));

    // The NetworkLocationCache loads asynchronously, so some splits may have been skipped above.
    // Poll until both data hosts resolve to something other than the root location, then retry.
    boolean cacheRefreshed;
    do {
        boolean host1Unresolved = locations.get(dataHost1).equals(ROOT_LOCATION);
        boolean host2Unresolved = locations.get(dataHost2).equals(ROOT_LOCATION);
        cacheRefreshed = !host1Unresolved && !host2Unresolved;
        MILLISECONDS.sleep(10);
    } while (!cacheRefreshed);

    placed = selector.computeAssignments(unassigned, ImmutableList.copyOf(tasksByNode.values())).getAssignments();
    for (InternalNode node : placed.keySet()) {
        RemoteTask existingTask = tasksByNode.get(node);
        existingTask.addSplits(ImmutableMultimap.<PlanNodeId, Split>builder()
                .putAll(new PlanNodeId("sourceId"), placed.get(node))
                .build());
    }
    unassigned = Sets.difference(unassigned, new HashSet<>(placed.values()));
    assertEquals(unassigned.size(), 3);

    // Tally the still-unassigned splits by the rack of their first preferred host
    int rack1Count = 0;
    int rack2Count = 0;
    for (Split split : unassigned) {
        String rack = topology.locate(split.getPreferredNodes(new ModularHashingNodeProvider(selector.getAllNodes())).get(0)).getSegments().get(0);
        if (rack.equals("rack1")) {
            rack1Count++;
        } else if (rack.equals("rack2")) {
            rack2Count++;
        } else {
            fail();
        }
    }
    assertEquals(rack1Count, 2);
    assertEquals(rack2Count, 1);

    // Phase 3: one split per worker host; all three must be placed on distinct nodes
    Set<Split> hostLocalSplits = ImmutableSet.of(
            new Split(CONNECTOR_ID, txHandle, new TestSplitRemote(HostAddress.fromParts("host1.rack1", 1))),
            new Split(CONNECTOR_ID, txHandle, new TestSplitRemote(HostAddress.fromParts("host2.rack1", 1))),
            new Split(CONNECTOR_ID, txHandle, new TestSplitRemote(HostAddress.fromParts("host3.rack2", 1))));
    placed = selector.computeAssignments(hostLocalSplits, ImmutableList.copyOf(tasksByNode.values())).getAssignments();
    assertEquals(placed.size(), 3);
    assertEquals(placed.keySet().size(), 3);
}
Also used : HashMap(java.util.HashMap) ImmutableList(com.google.common.collect.ImmutableList) NodeSchedulerConfig(com.facebook.presto.execution.scheduler.NodeSchedulerConfig) HostAddress(com.facebook.presto.spi.HostAddress) PlanNodeId(com.facebook.presto.spi.plan.PlanNodeId) ImmutableSet(com.google.common.collect.ImmutableSet) ImmutableSet.toImmutableSet(com.google.common.collect.ImmutableSet.toImmutableSet) NodeScheduler(com.facebook.presto.execution.scheduler.NodeScheduler) TestingTransactionHandle(com.facebook.presto.testing.TestingTransactionHandle) SimpleTtlNodeSelectorConfig(com.facebook.presto.execution.scheduler.nodeSelection.SimpleTtlNodeSelectorConfig) HashSet(java.util.HashSet) NetworkLocationCache(com.facebook.presto.execution.scheduler.NetworkLocationCache) Duration(io.airlift.units.Duration) ThrowingNodeTtlFetcherManager(com.facebook.presto.ttl.nodettlfetchermanagers.ThrowingNodeTtlFetcherManager) InMemoryNodeManager(com.facebook.presto.metadata.InMemoryNodeManager) NoOpQueryManager(com.facebook.presto.dispatcher.NoOpQueryManager) NodeSelectionStats(com.facebook.presto.execution.scheduler.nodeSelection.NodeSelectionStats) ModularHashingNodeProvider(com.facebook.presto.execution.scheduler.ModularHashingNodeProvider) InternalNode(com.facebook.presto.metadata.InternalNode) NodeSelector(com.facebook.presto.execution.scheduler.nodeSelection.NodeSelector) ConnectorSplit(com.facebook.presto.spi.ConnectorSplit) Split(com.facebook.presto.metadata.Split) Test(org.testng.annotations.Test)

Example 3 with ModularHashingNodeProvider

use of com.facebook.presto.execution.scheduler.ModularHashingNodeProvider in project presto by prestodb.

In class TestNodeScheduler, method testScheduleLocal.

/**
 * Schedules a single {@code TestSplitLocal} split and checks that exactly one assignment
 * is produced, that it lands on the split's first preferred host, and that the assigned
 * split equals the one submitted.
 */
@Test
public void testScheduleLocal() {
    Split localSplit = new Split(CONNECTOR_ID, TestingTransactionHandle.create(), new TestSplitLocal());
    // getOnlyElement fails unless there is exactly one (node, split) entry
    Map.Entry<InternalNode, Split> onlyAssignment = Iterables.getOnlyElement(
            nodeSelector.computeAssignments(ImmutableSet.of(localSplit), ImmutableList.copyOf(taskMap.values()))
                    .getAssignments()
                    .entries());
    InternalNode assignedNode = onlyAssignment.getKey();
    Split assignedSplit = onlyAssignment.getValue();
    assertEquals(assignedNode.getHostAndPort(), localSplit.getPreferredNodes(new ModularHashingNodeProvider(nodeSelector.getAllNodes())).get(0));
    assertEquals(assignedSplit, localSplit);
}
Also used : ModularHashingNodeProvider(com.facebook.presto.execution.scheduler.ModularHashingNodeProvider) InternalNode(com.facebook.presto.metadata.InternalNode) ConnectorSplit(com.facebook.presto.spi.ConnectorSplit) Split(com.facebook.presto.metadata.Split) Map(java.util.Map) ImmutableMap(com.google.common.collect.ImmutableMap) HashMap(java.util.HashMap) Test(org.testng.annotations.Test)

Aggregations

ModularHashingNodeProvider (com.facebook.presto.execution.scheduler.ModularHashingNodeProvider)3 InternalNode (com.facebook.presto.metadata.InternalNode)3 Split (com.facebook.presto.metadata.Split)3 ConnectorSplit (com.facebook.presto.spi.ConnectorSplit)2 HashMap (java.util.HashMap)2 HashSet (java.util.HashSet)2 Test (org.testng.annotations.Test)2 NoOpQueryManager (com.facebook.presto.dispatcher.NoOpQueryManager)1 BucketNodeMap (com.facebook.presto.execution.scheduler.BucketNodeMap)1 InternalNodeInfo (com.facebook.presto.execution.scheduler.InternalNodeInfo)1 NetworkLocationCache (com.facebook.presto.execution.scheduler.NetworkLocationCache)1 NodeAssignmentStats (com.facebook.presto.execution.scheduler.NodeAssignmentStats)1 NodeMap (com.facebook.presto.execution.scheduler.NodeMap)1 NodeScheduler (com.facebook.presto.execution.scheduler.NodeScheduler)1 NodeSchedulerConfig (com.facebook.presto.execution.scheduler.NodeSchedulerConfig)1 SplitPlacementResult (com.facebook.presto.execution.scheduler.SplitPlacementResult)1 NodeSelectionStats (com.facebook.presto.execution.scheduler.nodeSelection.NodeSelectionStats)1 NodeSelector (com.facebook.presto.execution.scheduler.nodeSelection.NodeSelector)1 SimpleTtlNodeSelectorConfig (com.facebook.presto.execution.scheduler.nodeSelection.SimpleTtlNodeSelectorConfig)1 InMemoryNodeManager (com.facebook.presto.metadata.InMemoryNodeManager)1