Search in sources :

Example 16 with InternalNode

use of io.prestosql.metadata.InternalNode in project hetu-core by openlookeng.

In class SimpleNodeSelector, method computeAssignments:

/**
 * Computes a node assignment for the given splits.
 *
 * <p>When the stage carries a consumer scan node, splits are placed via
 * {@code createConsumerScanNodeAssignment} using the split-key/node mapping stored for the
 * consumer's reuse-table-scan mapping id. Otherwise splits are placed by (optionally)
 * locality-prioritized selection followed by least-loaded selection among candidate nodes.
 *
 * @param splits splits to place
 * @param existingTasks tasks used to build the assignment statistics and the blocked future
 * @param stage the stage being scheduled, if any
 * @return the assignment together with a future built from the split-queue-space helpers
 * @throws UnsupportedOperationException if a {@code NotImplementedException} is raised while
 *         building the consumer assignment (the original exception is attached as the cause)
 * @throws PrestoException with {@code NO_NODES_AVAILABLE} if a split has no candidate node
 */
@Override
public SplitPlacementResult computeAssignments(Set<Split> splits, List<RemoteTask> existingTasks, Optional<SqlStageExecution> stage) {
    Multimap<InternalNode, Split> assignment = HashMultimap.create();
    NodeMap nodeMapSlice = this.nodeMap.get().get();
    NodeAssignmentStats assignmentStats = new NodeAssignmentStats(nodeTaskMap, nodeMapSlice, existingTasks);
    ResettableRandomizedIterator<InternalNode> randomCandidates = randomizedNodes(nodeMapSlice, ImmutableSet.of());
    Set<InternalNode> blockedExactNodes = new HashSet<>();
    boolean splitWaitingForAnyNode = false;
    // splitsToBeRedistributed becomes true only when splits go through locality-based assignment
    boolean splitsToBeRedistributed = false;
    Set<Split> remainingSplits = new HashSet<>();
    // Check if the current stage has a TableScanNode which is reading the table for the 2nd time or beyond
    if (stage.isPresent() && stage.get().getStateMachine().getConsumerScanNode() != null) {
        try {
            // if node exists, get the TableScanNode and cast it as consumer
            TableScanNode consumer = stage.get().getStateMachine().getConsumerScanNode();
            // all tables part of this stage
            Map<PlanNodeId, TableInfo> tables = stage.get().getStageInfo().getTables();
            for (Map.Entry<PlanNodeId, TableInfo> entry : tables.entrySet()) {
                QualifiedObjectName tableName = entry.getValue().getTableName();
                if (tableSplitAssignmentInfo.getReuseTableScanMappingIdSplitAssignmentMap().containsKey(consumer.getReuseTableScanMappingId())) {
                    // compare splitkey using equals and then assign nodes accordingly.
                    HashMap<SplitKey, InternalNode> splitKeyNodeAssignment = tableSplitAssignmentInfo.getSplitKeyNodeAssignment(consumer.getReuseTableScanMappingId());
                    Set<SplitKey> splitKeySet = splitKeyNodeAssignment.keySet();
                    assignment.putAll(createConsumerScanNodeAssignment(tableName, splits, splitKeySet, splitKeyNodeAssignment));
                }
            }
            // Record every assigned split exactly once. Doing this inside the table loop would
            // re-count the entries contributed by earlier tables on each later iteration.
            for (Map.Entry<InternalNode, Split> nodeAssignmentEntry : assignment.entries()) {
                assignmentStats.addAssignedSplit(nodeAssignmentEntry.getKey());
            }
            // format arguments keep the message text identical while deferring the string building to the logger
            log.debug("Consumer:: Assignment size is %s ,Assignment is %s ,Assignment Stats is %s", assignment.size(), assignment, assignmentStats);
        } catch (NotImplementedException e) {
            log.error("Not a Hive Split! Other Connector Splits not supported currently. Error: " + e);
            // keep the original exception as the cause so its stack trace is not lost
            throw new UnsupportedOperationException("Not a Hive Split! Other Connector Splits not supported currently. Error: " + e, e);
        }
    } else {
        // optimizedLocalScheduling enables prioritized assignment of splits to local nodes when splits contain locality information
        if (optimizedLocalScheduling) {
            // should not hit for consumer case
            for (Split split : splits) {
                if (split.isRemotelyAccessible() && !split.getAddresses().isEmpty()) {
                    List<InternalNode> candidateNodes = selectExactNodes(nodeMapSlice, split.getAddresses(), includeCoordinator);
                    Optional<InternalNode> chosenNode = candidateNodes.stream().filter(ownerNode -> assignmentStats.getTotalSplitCount(ownerNode) < maxSplitsPerNode).min(comparingInt(assignmentStats::getTotalSplitCount));
                    if (chosenNode.isPresent()) {
                        assignment.put(chosenNode.get(), split);
                        // check later
                        assignmentStats.addAssignedSplit(chosenNode.get());
                        splitsToBeRedistributed = true;
                        continue;
                    }
                }
                remainingSplits.add(split);
            }
        } else {
            remainingSplits = splits;
        }
        for (Split split : remainingSplits) {
            randomCandidates.reset();
            List<InternalNode> candidateNodes;
            if (!split.isRemotelyAccessible()) {
                candidateNodes = selectExactNodes(nodeMapSlice, split.getAddresses(), includeCoordinator);
            } else {
                candidateNodes = selectNodes(minCandidates, randomCandidates);
            }
            if (candidateNodes.isEmpty()) {
                log.debug("No nodes available to schedule %s. Available nodes %s", split, nodeMapSlice.getNodesByHost().keys());
                throw new PrestoException(NO_NODES_AVAILABLE, "No nodes available to run query");
            }
            // First choice: the candidate with the fewest total splits that is below maxSplitsPerNode
            InternalNode chosenNode = null;
            int min = Integer.MAX_VALUE;
            for (InternalNode node : candidateNodes) {
                int totalSplitCount = assignmentStats.getTotalSplitCount(node);
                if (totalSplitCount < min && totalSplitCount < maxSplitsPerNode) {
                    chosenNode = node;
                    min = totalSplitCount;
                }
            }
            if (chosenNode == null) {
                // Fallback: the candidate with the fewest queued splits for this stage, below maxPendingSplitsPerTask.
                // min is guaranteed to be MAX_VALUE at this line
                for (InternalNode node : candidateNodes) {
                    int totalSplitCount = assignmentStats.getQueuedSplitCountForStage(node);
                    if (totalSplitCount < min && totalSplitCount < maxPendingSplitsPerTask) {
                        chosenNode = node;
                        min = totalSplitCount;
                    }
                }
            }
            if (chosenNode != null) {
                assignment.put(chosenNode, split);
                assignmentStats.addAssignedSplit(chosenNode);
            } else if (split.isRemotelyAccessible()) {
                splitWaitingForAnyNode = true;
            } else if (!splitWaitingForAnyNode) {
                // Exact node set won't matter, if a split is waiting for any node
                blockedExactNodes.addAll(candidateNodes);
            }
        }
    }
    ListenableFuture<?> blocked;
    if (splitWaitingForAnyNode) {
        blocked = toWhenHasSplitQueueSpaceFuture(existingTasks, calculateLowWatermark(maxPendingSplitsPerTask));
    } else {
        blocked = toWhenHasSplitQueueSpaceFuture(blockedExactNodes, existingTasks, calculateLowWatermark(maxPendingSplitsPerTask));
    }
    // skip redistribution for consumer stages
    if (splitsToBeRedistributed && (!stage.isPresent() || stage.get().getStateMachine().getConsumerScanNode() == null)) {
        equateDistribution(assignment, assignmentStats, nodeMapSlice);
    }
    // Check if the current stage has a TableScanNode which is reading the table for the 1st time
    if (stage.isPresent() && stage.get().getStateMachine().getProducerScanNode() != null) {
        // if node exists, get the TableScanNode and annotate it as producer
        saveProducerScanNodeAssignment(stage, assignment, assignmentStats);
    }
    // Check if its CTE node and its feeder
    if (stage.isPresent() && stage.get().getFragment().getFeederCTEId().isPresent()) {
        updateFeederNodeAndSplitCount(stage.get(), assignment);
    }
    return new SplitPlacementResult(blocked, assignment);
}
Also used : NodeScheduler.randomizedNodes(io.prestosql.execution.scheduler.NodeScheduler.randomizedNodes) ListenableFuture(com.google.common.util.concurrent.ListenableFuture) Logger(io.airlift.log.Logger) Supplier(com.google.common.base.Supplier) HashMap(java.util.HashMap) Split(io.prestosql.metadata.Split) NO_NODES_AVAILABLE(io.prestosql.spi.StandardErrorCode.NO_NODES_AVAILABLE) Multimap(com.google.common.collect.Multimap) AtomicReference(java.util.concurrent.atomic.AtomicReference) QualifiedObjectName(io.prestosql.spi.connector.QualifiedObjectName) InetAddress(java.net.InetAddress) HashSet(java.util.HashSet) HashMultimap(com.google.common.collect.HashMultimap) ImmutableList(com.google.common.collect.ImmutableList) Map(java.util.Map) Objects.requireNonNull(java.util.Objects.requireNonNull) Suppliers(com.google.common.base.Suppliers) NodeTaskMap(io.prestosql.execution.NodeTaskMap) InternalNodeManager(io.prestosql.metadata.InternalNodeManager) PlanNodeId(io.prestosql.spi.plan.PlanNodeId) TableInfo(io.prestosql.execution.TableInfo) PrestoException(io.prestosql.spi.PrestoException) Comparator.comparingInt(java.util.Comparator.comparingInt) ImmutableSet(com.google.common.collect.ImmutableSet) NotImplementedException(sun.reflect.generics.reflectiveObjects.NotImplementedException) HostAddress(io.prestosql.spi.HostAddress) Iterator(java.util.Iterator) NodeScheduler.calculateLowWatermark(io.prestosql.execution.scheduler.NodeScheduler.calculateLowWatermark) IndexedPriorityQueue(io.prestosql.execution.resourcegroups.IndexedPriorityQueue) InternalNode(io.prestosql.metadata.InternalNode) Collection(java.util.Collection) TableScanNode(io.prestosql.spi.plan.TableScanNode) Set(java.util.Set) NodeScheduler.toWhenHasSplitQueueSpaceFuture(io.prestosql.execution.scheduler.NodeScheduler.toWhenHasSplitQueueSpaceFuture) UnknownHostException(java.net.UnknownHostException) Collectors(java.util.stream.Collectors) SetMultimap(com.google.common.collect.SetMultimap) 
NodeScheduler.selectNodes(io.prestosql.execution.scheduler.NodeScheduler.selectNodes) List(java.util.List) SplitKey(io.prestosql.execution.SplitKey) GENERIC_INTERNAL_ERROR(io.prestosql.spi.StandardErrorCode.GENERIC_INTERNAL_ERROR) Optional(java.util.Optional) NodeScheduler.selectDistributionNodes(io.prestosql.execution.scheduler.NodeScheduler.selectDistributionNodes) VisibleForTesting(com.google.common.annotations.VisibleForTesting) SqlStageExecution(io.prestosql.execution.SqlStageExecution) NodeScheduler.selectExactNodes(io.prestosql.execution.scheduler.NodeScheduler.selectExactNodes) RemoteTask(io.prestosql.execution.RemoteTask) SplitKey(io.prestosql.execution.SplitKey) NotImplementedException(sun.reflect.generics.reflectiveObjects.NotImplementedException) PrestoException(io.prestosql.spi.PrestoException) PlanNodeId(io.prestosql.spi.plan.PlanNodeId) TableInfo(io.prestosql.execution.TableInfo) HashSet(java.util.HashSet) QualifiedObjectName(io.prestosql.spi.connector.QualifiedObjectName) TableScanNode(io.prestosql.spi.plan.TableScanNode) InternalNode(io.prestosql.metadata.InternalNode) Split(io.prestosql.metadata.Split) HashMap(java.util.HashMap) Map(java.util.Map) NodeTaskMap(io.prestosql.execution.NodeTaskMap)

Example 17 with InternalNode

use of io.prestosql.metadata.InternalNode in project hetu-core by openlookeng.

In class SplitCacheAwareNodeSelector, method computeAssignments:

/**
 * Places splits, reusing the node recorded in the {@code SplitCacheMap} for a split key when
 * that node is still among the active connector nodes; every other split is delegated to
 * {@code defaultNodeSelector}.
 *
 * @param splits splits to place
 * @param existingTasks tasks passed to the assignment statistics and to the default selector
 * @param stage the stage being scheduled, if any
 * @return the combined assignment, with the blocked future taken from the default selector's result
 */
@Override
public SplitPlacementResult computeAssignments(Set<Split> splits, List<RemoteTask> existingTasks, Optional<SqlStageExecution> stage) {
    Multimap<InternalNode, Split> assignment = HashMultimap.create();
    NodeMap nodeMapSlice = this.nodeMap.get().get();
    Map<CatalogName, Map<String, InternalNode>> activeNodesByCatalog = new HashMap<>();
    NodeAssignmentStats assignmentStats = new NodeAssignmentStats(nodeTaskMap, nodeMapSlice, existingTasks);
    Set<Split> uncacheableSplits = new HashSet<>();
    Set<Split> newCacheableSplits = new HashSet<>();
    SplitCacheMap splitCacheMap = SplitCacheMap.getInstance();

    for (Split split : splits) {
        SplitKey splitKey = createSplitKey(split);
        Optional<String> cachedNodeId = Optional.empty();
        if (splitKey != null) {
            cachedNodeId = splitCacheMap.getCachedNodeId(splitKey);
        }

        boolean cacheable = split.getConnectorSplit().isCacheable() && splitKey != null;
        if (!cacheable) {
            // uncacheable splits will be scheduled using default node selector
            uncacheableSplits.add(split);
            continue;
        }

        Map<String, InternalNode> activeNodes = activeNodesByCatalog.computeIfAbsent(split.getCatalogName(), catalogName -> nodeManager.getActiveConnectorNodes(catalogName).stream().collect(Collectors.toMap(InternalNode::getNodeIdentifier, Function.identity())));
        // a cached node id only counts if it still resolves to an active node
        InternalNode previousNode = cachedNodeId.isPresent() ? activeNodes.get(cachedNodeId.get()) : null;
        if (previousNode == null) {
            // never cached before, or the node it was cached on is no longer active
            newCacheableSplits.add(split);
            continue;
        }
        // keep the split on the node it was assigned to earlier
        assignment.put(previousNode, split);
        assignmentStats.addAssignedSplit(previousNode);
    }
    log.info("%d out of %d splits already cached. %d new splits to be cached. %d splits cannot be cached.", assignment.size(), splits.size(), newCacheableSplits.size(), uncacheableSplits.size());

    // Everything not placed above goes through the default selector: uncacheable splits,
    // newly cacheable splits, and splits whose cached worker is inactive now.
    Set<Split> unassignedSplits = new HashSet<>();
    unassignedSplits.addAll(newCacheableSplits);
    unassignedSplits.addAll(uncacheableSplits);
    SplitPlacementResult defaultSplitPlacementResult = defaultNodeSelector.computeAssignments(unassignedSplits, existingTasks, stage);

    for (Map.Entry<InternalNode, Split> placed : defaultSplitPlacementResult.getAssignments().entries()) {
        InternalNode internalNode = placed.getKey();
        Split split = placed.getValue();
        // Set or update the cached node id only if the split is cacheable
        if (newCacheableSplits.contains(split)) {
            SplitKey splitKey = createSplitKey(split);
            if (splitKey != null) {
                splitCacheMap.addCachedNode(splitKey, internalNode.getNodeIdentifier());
            }
        }
        assignmentStats.addAssignedSplit(internalNode);
    }
    assignment.putAll(defaultSplitPlacementResult.getAssignments());

    // Check if its CTE node and its feeder
    boolean feederStage = stage.isPresent() && stage.get().getFragment().getFeederCTEId().isPresent();
    if (feederStage) {
        updateFeederNodeAndSplitCount(stage.get(), assignment);
    }
    return new SplitPlacementResult(defaultSplitPlacementResult.getBlocked(), assignment);
}
Also used : SplitCacheMap(io.prestosql.execution.SplitCacheMap) NodeScheduler.randomizedNodes(io.prestosql.execution.scheduler.NodeScheduler.randomizedNodes) Logger(io.airlift.log.Logger) Supplier(com.google.common.base.Supplier) HashMap(java.util.HashMap) Split(io.prestosql.metadata.Split) Multimap(com.google.common.collect.Multimap) AtomicReference(java.util.concurrent.atomic.AtomicReference) Function(java.util.function.Function) SplitCacheMap(io.prestosql.execution.SplitCacheMap) HashSet(java.util.HashSet) HashMultimap(com.google.common.collect.HashMultimap) ImmutableList(com.google.common.collect.ImmutableList) Map(java.util.Map) Objects.requireNonNull(java.util.Objects.requireNonNull) Suppliers(com.google.common.base.Suppliers) NodeTaskMap(io.prestosql.execution.NodeTaskMap) InternalNodeManager(io.prestosql.metadata.InternalNodeManager) PlanNodeId(io.prestosql.spi.plan.PlanNodeId) InternalNode(io.prestosql.metadata.InternalNode) CatalogName(io.prestosql.spi.connector.CatalogName) Set(java.util.Set) Collectors(java.util.stream.Collectors) NodeScheduler.selectNodes(io.prestosql.execution.scheduler.NodeScheduler.selectNodes) List(java.util.List) SplitKey(io.prestosql.execution.SplitKey) Optional(java.util.Optional) NodeScheduler.selectDistributionNodes(io.prestosql.execution.scheduler.NodeScheduler.selectDistributionNodes) SqlStageExecution(io.prestosql.execution.SqlStageExecution) RemoteTask(io.prestosql.execution.RemoteTask) SplitKey(io.prestosql.execution.SplitKey) HashMap(java.util.HashMap) CatalogName(io.prestosql.spi.connector.CatalogName) InternalNode(io.prestosql.metadata.InternalNode) Split(io.prestosql.metadata.Split) HashMap(java.util.HashMap) SplitCacheMap(io.prestosql.execution.SplitCacheMap) Map(java.util.Map) NodeTaskMap(io.prestosql.execution.NodeTaskMap) HashSet(java.util.HashSet)

Example 18 with InternalNode

use of io.prestosql.metadata.InternalNode in project hetu-core by openlookeng.

In class TableSplitAssignmentInfo, method setPerTablesplitKeyNodeAssignment:

/**
 * Stores the inverted assignment information [Split-Node mapping] for a given reuseTableScanMappingId.
 *
 * <p>Each assigned split is turned into a {@link SplitKey} (built from the split plus the table's
 * catalog, schema and object name) and mapped to the node it was assigned to. A split whose
 * connector split reports a split count greater than one is unwrapped via {@code getSplits()} and
 * each contained split is recorded individually. Entries are added to the mapping already stored
 * for the id, if there is one.
 *
 * @param qualifiedTableName name of the table which acts as a producer (reads data from disk for the first time)
 * @param reuseTableScanMappingId unique identifier for the producer-consumer pair of a reused table
 * @param assignmentInformation node-split assignment multimap created as part of the stage that processes this table
 * @throws UnsupportedOperationException if a {@code NotImplementedException} is raised while building
 *         the mapping; the original exception is attached as the cause
 * NOTE: Works only with Hive data as other connectors don't support SplitKey currently
 */
private void setPerTablesplitKeyNodeAssignment(QualifiedObjectName qualifiedTableName, UUID reuseTableScanMappingId, Multimap<InternalNode, Split> assignmentInformation) {
    String catalog = qualifiedTableName.getCatalogName();
    String schema = qualifiedTableName.getSchemaName();
    String table = qualifiedTableName.getObjectName();
    HashMap<SplitKey, InternalNode> splitKeyNodeAssignment;
    try {
        // extend the mapping already recorded for this id, or start a new one
        splitKeyNodeAssignment = perTableReuseTableScanMappingIdSplitKeyNodeAssignment.get(reuseTableScanMappingId);
        if (splitKeyNodeAssignment == null) {
            splitKeyNodeAssignment = new HashMap<>();
        }
        for (InternalNode node : assignmentInformation.keySet()) {
            Collection<Split> assignedSplits = assignmentInformation.get(node);
            for (Split assignedSplit : assignedSplits) {
                if (assignedSplit.getConnectorSplit().getSplitCount() > 1) {
                    // a wrapped split: record each contained split under its own key
                    for (Split unwrappedSplit : assignedSplit.getSplits()) {
                        SplitKey splitKey = new SplitKey(unwrappedSplit, catalog, schema, table);
                        splitKeyNodeAssignment.put(splitKey, node);
                    }
                } else {
                    SplitKey splitKey = new SplitKey(assignedSplit, catalog, schema, table);
                    splitKeyNodeAssignment.put(splitKey, node);
                }
            }
        }
        perTableReuseTableScanMappingIdSplitKeyNodeAssignment.put(reuseTableScanMappingId, splitKeyNodeAssignment);
    } catch (NotImplementedException e) {
        log.error("Unsupported split type: " + e);
        // keep the original exception as the cause so its stack trace is not lost
        throw new UnsupportedOperationException("Unsupported split type: " + e, e);
    }
}
Also used : SplitKey(io.prestosql.execution.SplitKey) NotImplementedException(sun.reflect.generics.reflectiveObjects.NotImplementedException) InternalNode(io.prestosql.metadata.InternalNode) Split(io.prestosql.metadata.Split)

Example 19 with InternalNode

use of io.prestosql.metadata.InternalNode in project hetu-core by openlookeng.

In class TopologyAwareNodeSelector, method bestNodeSplitCount:

/**
 * Picks a node from {@code candidates} for the next split.
 *
 * <p>The first candidate whose total split count is below {@code maxSplitsPerNode} is returned
 * immediately. Otherwise the method tracks the candidate with the fewest queued splits for the
 * stage (and fewer than {@code maxPendingSplitsPerTask}), and stops scanning once at least
 * {@code minCandidatesWhenFull} candidates were inspected and such a fallback exists.
 *
 * @param candidates nodes to consider, consumed in iteration order
 * @param minCandidatesWhenFull minimum number of full candidates to inspect before settling on a fallback
 * @param maxPendingSplitsPerTask exclusive upper bound on queued splits for a fallback node
 * @param assignmentStats source of the per-node split counts
 * @return the chosen node, or {@code null} if no candidate qualifies
 */
@Nullable
private InternalNode bestNodeSplitCount(Iterator<InternalNode> candidates, int minCandidatesWhenFull, int maxPendingSplitsPerTask, NodeAssignmentStats assignmentStats) {
    InternalNode leastQueuedNode = null;
    int leastQueuedSplits = Integer.MAX_VALUE;
    int fullNodesSeen = 0;
    while (candidates.hasNext()) {
        // enough full nodes inspected and a fallback is already known: stop scanning
        if (fullNodesSeen >= minCandidatesWhenFull && leastQueuedNode != null) {
            break;
        }
        InternalNode candidate = candidates.next();
        if (assignmentStats.getTotalSplitCount(candidate) < maxSplitsPerNode) {
            return candidate;
        }
        fullNodesSeen++;
        int queuedForStage = assignmentStats.getQueuedSplitCountForStage(candidate);
        if (queuedForStage < Math.min(leastQueuedSplits, maxPendingSplitsPerTask)) {
            leastQueuedSplits = queuedForStage;
            leastQueuedNode = candidate;
        }
    }
    return leastQueuedNode;
}
Also used : InternalNode(io.prestosql.metadata.InternalNode) Nullable(javax.annotation.Nullable)

Example 20 with InternalNode

use of io.prestosql.metadata.InternalNode in project hetu-core by openlookeng.

In class DynamicLifespanScheduler, method schedule:

/**
 * Starts one new driver group for each recently completed one, while unscheduled driver groups
 * remain, placing the new group on the node that ran the completed group.
 *
 * @param scheduler receives {@code startLifespan} for each new driver group and
 *        {@code noMoreLifespans} once no driver groups remain
 * @return the {@code newDriverGroupReady} future, replaced with a fresh instance on every call
 * @throws IllegalStateException if {@code initialScheduled} is false, or if a completed driver
 *         group has no assigned node
 */
@Override
public SettableFuture<?> schedule(SourceScheduler scheduler) {
    // Return a new future even if newDriverGroupReady has not finished.
    // Returning the same SettableFuture instance could lead to ListenableFuture retaining too many listener objects.
    checkState(initialScheduled);
    List<Lifespan> finishedGroups;
    synchronized (this) {
        // snapshot and reset the completed list, and swap in a fresh future, under the lock
        finishedGroups = ImmutableList.copyOf(this.recentlyCompletedDriverGroups);
        this.recentlyCompletedDriverGroups.clear();
        newDriverGroupReady = SettableFuture.create();
    }
    for (int i = 0; i < finishedGroups.size() && driverGroups.hasNext(); i++) {
        Lifespan finishedGroup = finishedGroups.get(i);
        int nextGroupId = driverGroups.nextInt();
        InternalNode freedNode = bucketNodeMap.getAssignedNode(finishedGroup.getId()).orElseThrow(() -> new IllegalStateException());
        bucketNodeMap.assignBucketToNode(nextGroupId, freedNode);
        Lifespan nextLifespan = Lifespan.driverGroup(nextGroupId);
        scheduler.startLifespan(nextLifespan, partitionHandles.get(nextGroupId));
    }
    if (!driverGroups.hasNext()) {
        scheduler.noMoreLifespans();
    }
    return newDriverGroupReady;
}
Also used : InternalNode(io.prestosql.metadata.InternalNode) Lifespan(io.prestosql.execution.Lifespan)

Aggregations

InternalNode (io.prestosql.metadata.InternalNode)61 Split (io.prestosql.metadata.Split)33 ConnectorSplit (io.prestosql.spi.connector.ConnectorSplit)23 Test (org.testng.annotations.Test)22 TestingSplit (io.prestosql.testing.TestingSplit)20 HashSet (java.util.HashSet)17 MockSplit (io.prestosql.MockSplit)16 PlanNodeId (io.prestosql.spi.plan.PlanNodeId)16 ImmutableList (com.google.common.collect.ImmutableList)15 HashMap (java.util.HashMap)15 RemoteTask (io.prestosql.execution.RemoteTask)14 LinkedHashSet (java.util.LinkedHashSet)14 NodeTaskMap (io.prestosql.execution.NodeTaskMap)13 ArrayList (java.util.ArrayList)12 Map (java.util.Map)12 MockRemoteTaskFactory (io.prestosql.execution.MockRemoteTaskFactory)11 SqlStageExecution (io.prestosql.execution.SqlStageExecution)10 ImmutableSet (com.google.common.collect.ImmutableSet)9 TaskId (io.prestosql.execution.TaskId)9 PrestoException (io.prestosql.spi.PrestoException)9