Use of io.prestosql.metadata.InternalNode in project hetu-core by openlookeng.
Class SimpleNodeSelector, method computeAssignments:
@Override
public SplitPlacementResult computeAssignments(Set<Split> splits, List<RemoteTask> existingTasks, Optional<SqlStageExecution> stage) {
    Multimap<InternalNode, Split> assignment = HashMultimap.create();
    NodeMap nodeMapSlice = this.nodeMap.get().get();
    NodeAssignmentStats assignmentStats = new NodeAssignmentStats(nodeTaskMap, nodeMapSlice, existingTasks);
    ResettableRandomizedIterator<InternalNode> randomCandidates = randomizedNodes(nodeMapSlice, ImmutableSet.of());
    Set<InternalNode> blockedExactNodes = new HashSet<>();
    boolean splitWaitingForAnyNode = false;
    // splitsToBeRedistributed becomes true only when splits go through locality-based assignment
    boolean splitsToBeRedistributed = false;
    Set<Split> remainingSplits = new HashSet<>();
    // Check if the current stage has a TableScanNode which is reading the table for the 2nd time or beyond
    if (stage.isPresent() && stage.get().getStateMachine().getConsumerScanNode() != null) {
        try {
            // if the node exists, get the TableScanNode and cast it as the consumer
            TableScanNode consumer = stage.get().getStateMachine().getConsumerScanNode();
            // all tables that are part of this stage
            Map<PlanNodeId, TableInfo> tables = stage.get().getStageInfo().getTables();
            QualifiedObjectName tableName;
            for (Map.Entry<PlanNodeId, TableInfo> entry : tables.entrySet()) {
                tableName = entry.getValue().getTableName();
                if (tableSplitAssignmentInfo.getReuseTableScanMappingIdSplitAssignmentMap().containsKey(consumer.getReuseTableScanMappingId())) {
                    // compare split keys using equals and then assign nodes accordingly
                    HashMap<SplitKey, InternalNode> splitKeyNodeAssignment = tableSplitAssignmentInfo.getSplitKeyNodeAssignment(consumer.getReuseTableScanMappingId());
                    Set<SplitKey> splitKeySet = splitKeyNodeAssignment.keySet();
                    assignment.putAll(createConsumerScanNodeAssignment(tableName, splits, splitKeySet, splitKeyNodeAssignment));
                    for (Map.Entry<InternalNode, Split> nodeAssignmentEntry : assignment.entries()) {
                        InternalNode node = nodeAssignmentEntry.getKey();
                        assignmentStats.addAssignedSplit(node);
                    }
                }
            }
            log.debug("Consumer:: Assignment size is " + assignment.size() + " ,Assignment is " + assignment + " ,Assignment Stats is " + assignmentStats);
        } catch (NotImplementedException e) {
            log.error("Not a Hive Split! Other Connector Splits not supported currently. Error: " + e);
            throw new UnsupportedOperationException("Not a Hive Split! Other Connector Splits not supported currently. Error: " + e);
        }
    } else {
        // optimizedLocalScheduling enables prioritized assignment of splits to local nodes when splits contain locality information
        if (optimizedLocalScheduling) {
            // should not be hit for the consumer case
            for (Split split : splits) {
                if (split.isRemotelyAccessible() && !split.getAddresses().isEmpty()) {
                    List<InternalNode> candidateNodes = selectExactNodes(nodeMapSlice, split.getAddresses(), includeCoordinator);
                    Optional<InternalNode> chosenNode = candidateNodes.stream()
                            .filter(ownerNode -> assignmentStats.getTotalSplitCount(ownerNode) < maxSplitsPerNode)
                            .min(comparingInt(assignmentStats::getTotalSplitCount));
                    if (chosenNode.isPresent()) {
                        assignment.put(chosenNode.get(), split);
                        // check later
                        assignmentStats.addAssignedSplit(chosenNode.get());
                        splitsToBeRedistributed = true;
                        continue;
                    }
                }
                remainingSplits.add(split);
            }
        } else {
            remainingSplits = splits;
        }
        for (Split split : remainingSplits) {
            randomCandidates.reset();
            List<InternalNode> candidateNodes;
            if (!split.isRemotelyAccessible()) {
                candidateNodes = selectExactNodes(nodeMapSlice, split.getAddresses(), includeCoordinator);
            } else {
                candidateNodes = selectNodes(minCandidates, randomCandidates);
            }
            if (candidateNodes.isEmpty()) {
                log.debug("No nodes available to schedule %s. Available nodes %s", split, nodeMapSlice.getNodesByHost().keys());
                throw new PrestoException(NO_NODES_AVAILABLE, "No nodes available to run query");
            }
            InternalNode chosenNode = null;
            int min = Integer.MAX_VALUE;
            for (InternalNode node : candidateNodes) {
                int totalSplitCount = assignmentStats.getTotalSplitCount(node);
                if (totalSplitCount < min && totalSplitCount < maxSplitsPerNode) {
                    chosenNode = node;
                    min = totalSplitCount;
                }
            }
            if (chosenNode == null) {
                // min is guaranteed to be MAX_VALUE at this line
                for (InternalNode node : candidateNodes) {
                    int totalSplitCount = assignmentStats.getQueuedSplitCountForStage(node);
                    if (totalSplitCount < min && totalSplitCount < maxPendingSplitsPerTask) {
                        chosenNode = node;
                        min = totalSplitCount;
                    }
                }
            }
            if (chosenNode != null) {
                assignment.put(chosenNode, split);
                assignmentStats.addAssignedSplit(chosenNode);
            } else {
                if (split.isRemotelyAccessible()) {
                    splitWaitingForAnyNode = true;
                }
                // Exact node set won't matter, if a split is waiting for any node
                else if (!splitWaitingForAnyNode) {
                    blockedExactNodes.addAll(candidateNodes);
                }
            }
        }
    }
    ListenableFuture<?> blocked;
    if (splitWaitingForAnyNode) {
        blocked = toWhenHasSplitQueueSpaceFuture(existingTasks, calculateLowWatermark(maxPendingSplitsPerTask));
    } else {
        blocked = toWhenHasSplitQueueSpaceFuture(blockedExactNodes, existingTasks, calculateLowWatermark(maxPendingSplitsPerTask));
    }
    if (!stage.isPresent() || stage.get().getStateMachine().getConsumerScanNode() == null) {
        if (splitsToBeRedistributed) {
            // skip the redistribution for the consumer case
            equateDistribution(assignment, assignmentStats, nodeMapSlice);
        }
    }
    // Check if the current stage has a TableScanNode which is reading the table for the 1st time
    if (stage.isPresent() && stage.get().getStateMachine().getProducerScanNode() != null) {
        // if the node exists, get the TableScanNode and annotate it as the producer
        saveProducerScanNodeAssignment(stage, assignment, assignmentStats);
    }
    // Check if it's a CTE node and its feeder
    if (stage.isPresent() && stage.get().getFragment().getFeederCTEId().isPresent()) {
        updateFeederNodeAndSplitCount(stage.get(), assignment);
    }
    return new SplitPlacementResult(blocked, assignment);
}
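Stripped of the reuse-exchange handling, the fallback loop above boils down to a two-tier "least loaded candidate" rule: prefer the candidate with the fewest total assigned splits that is still under maxSplitsPerNode, and only when every candidate is full fall back to the shortest stage queue under maxPendingSplitsPerTask. The following standalone sketch illustrates just that rule; the class and parameter names are hypothetical, and plain maps stand in for NodeAssignmentStats.

import java.util.List;
import java.util.Map;

// Hypothetical, self-contained illustration of the selection policy used for
// remainingSplits above: first prefer the candidate with the fewest total
// assigned splits that is still under maxSplitsPerNode; if every candidate is
// full, fall back to the fewest queued splits under maxPendingSplitsPerTask;
// return null to signal that the split must wait (the "blocked" case).
public class SplitCountSelectionSketch {
    static String chooseNode(List<String> candidates,
                             Map<String, Integer> totalSplits,
                             Map<String, Integer> queuedSplits,
                             int maxSplitsPerNode,
                             int maxPendingSplitsPerTask) {
        String chosen = null;
        int min = Integer.MAX_VALUE;
        for (String node : candidates) {
            int total = totalSplits.getOrDefault(node, 0);
            if (total < min && total < maxSplitsPerNode) {
                chosen = node;
                min = total;
            }
        }
        if (chosen == null) {
            // every candidate is at maxSplitsPerNode; min is still MAX_VALUE here
            for (String node : candidates) {
                int queued = queuedSplits.getOrDefault(node, 0);
                if (queued < min && queued < maxPendingSplitsPerTask) {
                    chosen = node;
                    min = queued;
                }
            }
        }
        return chosen; // null means the split stays blocked until queue space frees up
    }

    public static void main(String[] args) {
        List<String> nodes = List.of("worker-1", "worker-2");
        System.out.println(chooseNode(nodes,
                Map.of("worker-1", 100, "worker-2", 100),   // both at the per-node cap
                Map.of("worker-1", 7, "worker-2", 3),       // worker-2 has the shorter queue
                100, 10));                                  // prints "worker-2"
    }
}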
Use of io.prestosql.metadata.InternalNode in project hetu-core by openlookeng.
Class SplitCacheAwareNodeSelector, method computeAssignments:
@Override
public SplitPlacementResult computeAssignments(Set<Split> splits, List<RemoteTask> existingTasks, Optional<SqlStageExecution> stage) {
    Multimap<InternalNode, Split> assignment = HashMultimap.create();
    NodeMap nodeMapSlice = this.nodeMap.get().get();
    Map<CatalogName, Map<String, InternalNode>> activeNodesByCatalog = new HashMap<>();
    NodeAssignmentStats assignmentStats = new NodeAssignmentStats(nodeTaskMap, nodeMapSlice, existingTasks);
    Set<Split> uncacheableSplits = new HashSet<>();
    Set<Split> newCacheableSplits = new HashSet<>();
    SplitCacheMap splitCacheMap = SplitCacheMap.getInstance();
    for (Split split : splits) {
        Optional<String> assignedNodeId = Optional.empty();
        SplitKey splitKey = createSplitKey(split);
        if (splitKey != null) {
            assignedNodeId = splitCacheMap.getCachedNodeId(splitKey);
        }
        if (!split.getConnectorSplit().isCacheable() || splitKey == null) {
            // uncacheable splits will be scheduled using the default node selector
            uncacheableSplits.add(split);
            continue;
        }
        Map<String, InternalNode> activeNodes = activeNodesByCatalog.computeIfAbsent(split.getCatalogName(),
                catalogName -> nodeManager.getActiveConnectorNodes(catalogName).stream()
                        .collect(Collectors.toMap(InternalNode::getNodeIdentifier, Function.identity())));
        InternalNode assignedNode = assignedNodeId.map(activeNodes::get).orElse(null);
        // check if a node has been assigned and ensure it is still active before scheduling
        if (assignedNode != null) {
            // the split was previously assigned to a node; assign it to the same node as before
            assignment.put(assignedNode, split);
            assignmentStats.addAssignedSplit(assignedNode);
        } else {
            // splits that have not been previously cached, or whose assigned node is now inactive
            newCacheableSplits.add(split);
        }
    }
    log.info("%d out of %d splits already cached. %d new splits to be cached. %d splits cannot be cached.", assignment.size(), splits.size(), newCacheableSplits.size(), uncacheableSplits.size());
    Set<Split> unassignedSplits = new HashSet<>();
    unassignedSplits.addAll(newCacheableSplits);
    unassignedSplits.addAll(uncacheableSplits);
    // Compute split assignments for splits that cannot be cached, are newly cacheable, or were cached on a worker that is now inactive.
    SplitPlacementResult defaultSplitPlacementResult = defaultNodeSelector.computeAssignments(unassignedSplits, existingTasks, stage);
    defaultSplitPlacementResult.getAssignments().forEach(((internalNode, split) -> {
        // Set or update the cached node id only if the split is cacheable
        if (newCacheableSplits.contains(split)) {
            SplitKey splitKey = createSplitKey(split);
            if (splitKey != null) {
                splitCacheMap.addCachedNode(splitKey, internalNode.getNodeIdentifier());
            }
        }
        assignmentStats.addAssignedSplit(internalNode);
    }));
    assignment.putAll(defaultSplitPlacementResult.getAssignments());
    // Check if it's a CTE node and its feeder
    if (stage.isPresent() && stage.get().getFragment().getFeederCTEId().isPresent()) {
        updateFeederNodeAndSplitCount(stage.get(), assignment);
    }
    return new SplitPlacementResult(defaultSplitPlacementResult.getBlocked(), assignment);
}
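The selector above adds cache affinity on top of the default selector: a split key is remembered together with the worker that cached it, and the split is pinned to that worker for as long as it stays active. A minimal sketch of that pattern follows, with hypothetical names and strings standing in for SplitKey and InternalNode; it is an illustration of the idea, not hetu-core's API.

import java.util.HashMap;
import java.util.Map;
import java.util.Set;

// Hypothetical sketch of the cache-affinity pattern: remember which worker cached
// a given split key, reuse that worker while it is still active, and fall back to
// normal scheduling (recording the new owner) otherwise.
public class CacheAffinitySketch {
    private final Map<String, String> cachedNodeBySplitKey = new HashMap<>();

    String assign(String splitKey, Set<String> activeNodes, String fallbackNode) {
        String cachedNode = cachedNodeBySplitKey.get(splitKey);
        if (cachedNode != null && activeNodes.contains(cachedNode)) {
            return cachedNode;                             // reuse the worker that already holds the cached data
        }
        cachedNodeBySplitKey.put(splitKey, fallbackNode);  // remember the new owner for next time
        return fallbackNode;
    }

    public static void main(String[] args) {
        CacheAffinitySketch sketch = new CacheAffinitySketch();
        Set<String> active = Set.of("worker-1", "worker-2");
        System.out.println(sketch.assign("orders#part-0", active, "worker-1"));             // worker-1 (first seen)
        System.out.println(sketch.assign("orders#part-0", active, "worker-2"));             // worker-1 (cache affinity)
        System.out.println(sketch.assign("orders#part-0", Set.of("worker-2"), "worker-2")); // worker-2 (old owner inactive)
    }
}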
Use of io.prestosql.metadata.InternalNode in project hetu-core by openlookeng.
Class TableSplitAssignmentInfo, method setPerTablesplitKeyNodeAssignment:
/**
 * Store the inverted assignment information [Split-Node mapping] for a given reuseTableScanMappingId
 * @param qualifiedTableName name of the table which acts as the producer (reads data from disk for the first time)
 * @param reuseTableScanMappingId unique identifier for the producer-consumer pair of a reused table
 * @param assignmentInformation node-split assignment multimap created as part of the stage that processes this table
 * NOTE: Works only with Hive data as other connectors don't support SplitKey currently
 */
private void setPerTablesplitKeyNodeAssignment(QualifiedObjectName qualifiedTableName, UUID reuseTableScanMappingId, Multimap<InternalNode, Split> assignmentInformation) {
    String catalog = qualifiedTableName.getCatalogName();
    String schema = qualifiedTableName.getSchemaName();
    String table = qualifiedTableName.getObjectName();
    HashMap<SplitKey, InternalNode> splitKeyNodeAssignment;
    try {
        splitKeyNodeAssignment = perTableReuseTableScanMappingIdSplitKeyNodeAssignment.get(reuseTableScanMappingId);
        if (splitKeyNodeAssignment == null) {
            splitKeyNodeAssignment = new HashMap<>();
        }
        for (InternalNode node : assignmentInformation.keySet()) {
            Collection<Split> assigmentSplits = assignmentInformation.get(node);
            for (Split assigmentSplit : assigmentSplits) {
                if (assigmentSplit.getConnectorSplit().getSplitCount() > 1) {
                    for (Split unwrappedSplit : assigmentSplit.getSplits()) {
                        SplitKey splitKey = new SplitKey(unwrappedSplit, catalog, schema, table);
                        splitKeyNodeAssignment.put(splitKey, node);
                    }
                } else {
                    SplitKey splitKey = new SplitKey(assigmentSplit, catalog, schema, table);
                    splitKeyNodeAssignment.put(splitKey, node);
                }
            }
        }
        perTableReuseTableScanMappingIdSplitKeyNodeAssignment.put(reuseTableScanMappingId, splitKeyNodeAssignment);
    } catch (NotImplementedException e) {
        log.error("Unsupported split type: " + e);
        throw new UnsupportedOperationException("Unsupported split type: " + e);
    }
}
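The method inverts the node-to-splits multimap produced during scheduling into a per-split-key lookup, which is what the consumer stage consults when the same table is read again. A minimal, dependency-free sketch of that inversion follows; the names are hypothetical and strings stand in for SplitKey and InternalNode.

import java.util.HashMap;
import java.util.List;
import java.util.Map;

// Hypothetical sketch of the inversion above: turn a node -> splits mapping
// (as produced by computeAssignments) into a splitKey -> node lookup so that a
// later consumer stage can place each split on the same worker as before.
public class SplitKeyInversionSketch {
    static Map<String, String> invert(Map<String, List<String>> splitsByNode) {
        Map<String, String> nodeBySplitKey = new HashMap<>();
        for (Map.Entry<String, List<String>> entry : splitsByNode.entrySet()) {
            for (String splitKey : entry.getValue()) {
                nodeBySplitKey.put(splitKey, entry.getKey());
            }
        }
        return nodeBySplitKey;
    }

    public static void main(String[] args) {
        Map<String, List<String>> assignment = Map.of(
                "worker-1", List.of("orders#part-0", "orders#part-1"),
                "worker-2", List.of("orders#part-2"));
        // {orders#part-0=worker-1, orders#part-1=worker-1, orders#part-2=worker-2}
        System.out.println(invert(assignment));
    }
}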
Use of io.prestosql.metadata.InternalNode in project hetu-core by openlookeng.
Class TopologyAwareNodeSelector, method bestNodeSplitCount:
@Nullable
private InternalNode bestNodeSplitCount(Iterator<InternalNode> candidates, int minCandidatesWhenFull, int maxPendingSplitsPerTask, NodeAssignmentStats assignmentStats) {
    InternalNode bestQueueNotFull = null;
    int min = Integer.MAX_VALUE;
    int fullCandidatesConsidered = 0;
    while (candidates.hasNext() && (fullCandidatesConsidered < minCandidatesWhenFull || bestQueueNotFull == null)) {
        InternalNode node = candidates.next();
        if (assignmentStats.getTotalSplitCount(node) < maxSplitsPerNode) {
            return node;
        }
        fullCandidatesConsidered++;
        int totalSplitCount = assignmentStats.getQueuedSplitCountForStage(node);
        if (totalSplitCount < min && totalSplitCount < maxPendingSplitsPerTask) {
            min = totalSplitCount;
            bestQueueNotFull = node;
        }
    }
    return bestQueueNotFull;
}
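Two details distinguish this helper from the simple selection in SimpleNodeSelector: a node with spare total-split capacity wins immediately, and once a queue-based fallback has been found the scan stops after minCandidatesWhenFull full nodes instead of walking the whole topology-ordered candidate stream. A self-contained sketch of that bounded scan follows, with hypothetical names and plain maps standing in for NodeAssignmentStats.

import java.util.Iterator;
import java.util.List;
import java.util.Map;

// Hypothetical illustration of the bounded scan in bestNodeSplitCount: return the
// first candidate with spare total-split capacity; otherwise track the shortest
// stage queue, but examine at most minCandidatesWhenFull "full" nodes once a
// fallback has been found.
public class BoundedCandidateScanSketch {
    static String bestNode(Iterator<String> candidates, int minCandidatesWhenFull,
                           int maxSplitsPerNode, int maxPendingSplitsPerTask,
                           Map<String, Integer> totalSplits, Map<String, Integer> queuedSplits) {
        String bestQueueNotFull = null;
        int min = Integer.MAX_VALUE;
        int fullCandidatesConsidered = 0;
        while (candidates.hasNext() && (fullCandidatesConsidered < minCandidatesWhenFull || bestQueueNotFull == null)) {
            String node = candidates.next();
            if (totalSplits.getOrDefault(node, 0) < maxSplitsPerNode) {
                return node;  // a node with free capacity wins immediately
            }
            fullCandidatesConsidered++;
            int queued = queuedSplits.getOrDefault(node, 0);
            if (queued < min && queued < maxPendingSplitsPerTask) {
                min = queued;
                bestQueueNotFull = node;
            }
        }
        return bestQueueNotFull;
    }

    public static void main(String[] args) {
        List<String> nodes = List.of("rack1-w1", "rack1-w2", "rack2-w1");
        // all nodes are full; only two full candidates are examined before returning,
        // so rack2-w1 (shortest queue) is never even considered
        String chosen = bestNode(nodes.iterator(), 2, 100, 10,
                Map.of("rack1-w1", 100, "rack1-w2", 100, "rack2-w1", 100),
                Map.of("rack1-w1", 9, "rack1-w2", 4, "rack2-w1", 1));
        System.out.println(chosen); // rack1-w2
    }
}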
Use of io.prestosql.metadata.InternalNode in project hetu-core by openlookeng.
Class DynamicLifespanScheduler, method schedule:
@Override
public SettableFuture<?> schedule(SourceScheduler scheduler) {
    // Return a new future even if newDriverGroupReady has not finished.
    // Returning the same SettableFuture instance could lead to ListenableFuture retaining too many listener objects.
    checkState(initialScheduled);
    List<Lifespan> tmpRecentlyCompletedDriverGroups;
    synchronized (this) {
        tmpRecentlyCompletedDriverGroups = ImmutableList.copyOf(this.recentlyCompletedDriverGroups);
        this.recentlyCompletedDriverGroups.clear();
        newDriverGroupReady = SettableFuture.create();
    }
    for (Lifespan driverGroup : tmpRecentlyCompletedDriverGroups) {
        if (!driverGroups.hasNext()) {
            break;
        }
        int driverGroupId = driverGroups.nextInt();
        InternalNode nodeForCompletedDriverGroup = bucketNodeMap.getAssignedNode(driverGroup.getId()).orElseThrow(IllegalStateException::new);
        bucketNodeMap.assignBucketToNode(driverGroupId, nodeForCompletedDriverGroup);
        scheduler.startLifespan(Lifespan.driverGroup(driverGroupId), partitionHandles.get(driverGroupId));
    }
    if (!driverGroups.hasNext()) {
        scheduler.noMoreLifespans();
    }
    return newDriverGroupReady;
}
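The scheduling policy here is a rotation: whenever a driver group finishes on a worker, the next unscheduled bucket is assigned to that same worker, so each node works through buckets one at a time until none remain. A simplified, self-contained sketch of that rotation follows; the names are hypothetical, and it omits the futures and synchronization of the real scheduler.

import java.util.ArrayDeque;
import java.util.HashMap;
import java.util.Map;
import java.util.Queue;

// Hypothetical sketch of dynamic lifespan rotation: when a driver group (bucket)
// completes on a worker, the next pending bucket is pinned to that same worker.
public class DynamicLifespanSketch {
    private final Queue<Integer> pendingBuckets = new ArrayDeque<>();
    private final Map<Integer, String> nodeByBucket = new HashMap<>();

    DynamicLifespanSketch(int bucketCount) {
        for (int bucket = 0; bucket < bucketCount; bucket++) {
            pendingBuckets.add(bucket);
        }
    }

    // give each worker its first bucket
    void initialSchedule(String... nodes) {
        for (String node : nodes) {
            Integer bucket = pendingBuckets.poll();
            if (bucket != null) {
                nodeByBucket.put(bucket, node);
                System.out.println("start bucket " + bucket + " on " + node);
            }
        }
    }

    // called when a driver group (bucket) completes on its node
    void onDriverGroupCompleted(int completedBucket) {
        String node = nodeByBucket.get(completedBucket);
        Integer nextBucket = pendingBuckets.poll();
        if (nextBucket != null) {
            nodeByBucket.put(nextBucket, node);            // reuse the freed-up worker
            System.out.println("start bucket " + nextBucket + " on " + node);
        }
    }

    public static void main(String[] args) {
        DynamicLifespanSketch sketch = new DynamicLifespanSketch(4);
        sketch.initialSchedule("worker-1", "worker-2");    // buckets 0 and 1
        sketch.onDriverGroupCompleted(0);                  // bucket 2 on worker-1
        sketch.onDriverGroupCompleted(1);                  // bucket 3 on worker-2
    }
}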