Use of com.facebook.presto.metadata.InternalNode in project presto by prestodb.
In the class NodeScheduler, the method selectExactNodes:
public static List<InternalNode> selectExactNodes(NodeMap nodeMap, List<HostAddress> hosts, boolean includeCoordinator)
{
    Set<InternalNode> chosen = new LinkedHashSet<>();
    Set<String> coordinatorIds = nodeMap.getCoordinatorNodeIds();
    for (HostAddress host : hosts) {
        nodeMap.getAllNodesByHostAndPort().get(host).stream()
                .filter(node -> includeCoordinator || !coordinatorIds.contains(node.getNodeIdentifier()))
                .forEach(chosen::add);
        InetAddress address;
        try {
            address = host.toInetAddress();
        }
        catch (UnknownHostException e) {
            // skip hosts that don't resolve
            continue;
        }
        // consider a split with a host without a port as being accessible by all nodes in that host
        if (!host.hasPort()) {
            nodeMap.getAllNodesByHost().get(address).stream()
                    .filter(node -> includeCoordinator || !coordinatorIds.contains(node.getNodeIdentifier()))
                    .forEach(chosen::add);
        }
    }
    // if the chosen set is empty and the host is the coordinator, force pick the coordinator
    if (chosen.isEmpty() && !includeCoordinator) {
        for (HostAddress host : hosts) {
            // In the code below, before calling `chosen::add`, it could have been checked that
            // `coordinatorIds.contains(node.getNodeIdentifier())`. But checking the condition isn't necessary
            // because every node satisfies it. Otherwise, `chosen` wouldn't have been empty.
            nodeMap.getAllNodesByHostAndPort().get(host).forEach(chosen::add);
            InetAddress address;
            try {
                address = host.toInetAddress();
            }
            catch (UnknownHostException e) {
                // skip hosts that don't resolve
                continue;
            }
            // consider a split with a host without a port as being accessible by all nodes in that host
            if (!host.hasPort()) {
                nodeMap.getAllNodesByHost().get(address).forEach(chosen::add);
            }
        }
    }
    return ImmutableList.copyOf(chosen);
}
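Both passes share one pattern: match hosts while excluding coordinator nodes, then fall back to force-picking the previously excluded nodes when nothing else matched. A minimal standalone sketch of that fallback pattern, using plain strings in place of Presto's InternalNode and NodeMap (all names here are hypothetical, not Presto APIs):

import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

class ExactNodeFallbackSketch
{
    // nodesByHost: host name -> ids of the nodes running on that host
    // coordinatorIds: ids of coordinator nodes, normally excluded from scheduling
    static List<String> selectExact(Map<String, List<String>> nodesByHost, Set<String> coordinatorIds, List<String> hosts, boolean includeCoordinator)
    {
        Set<String> chosen = new LinkedHashSet<>();
        for (String host : hosts) {
            for (String node : nodesByHost.getOrDefault(host, Collections.emptyList())) {
                if (includeCoordinator || !coordinatorIds.contains(node)) {
                    chosen.add(node);
                }
            }
        }
        // every candidate was filtered out as a coordinator: force-pick them
        // rather than leave the split with no eligible node
        if (chosen.isEmpty() && !includeCoordinator) {
            for (String host : hosts) {
                chosen.addAll(nodesByHost.getOrDefault(host, Collections.emptyList()));
            }
        }
        return new ArrayList<>(chosen);
    }
}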
Use of com.facebook.presto.metadata.InternalNode in project presto by prestodb.
In the class NodeScheduler, the method selectDistributionNodes:
public static SplitPlacementResult selectDistributionNodes(
        NodeMap nodeMap,
        NodeTaskMap nodeTaskMap,
        long maxSplitsWeightPerNode,
        long maxPendingSplitsWeightPerTask,
        int maxUnacknowledgedSplitsPerTask,
        Set<Split> splits,
        List<RemoteTask> existingTasks,
        BucketNodeMap bucketNodeMap,
        NodeSelectionStats nodeSelectionStats)
{
    Multimap<InternalNode, Split> assignments = HashMultimap.create();
    NodeAssignmentStats assignmentStats = new NodeAssignmentStats(nodeTaskMap, nodeMap, existingTasks);
    Set<InternalNode> blockedNodes = new HashSet<>();
    for (Split split : splits) {
        // node placement is forced by the bucket to node map
        InternalNode node = bucketNodeMap.getAssignedNode(split).get();
        boolean isCacheable = bucketNodeMap.isSplitCacheable(split);
        SplitWeight splitWeight = split.getSplitWeight();
        // if node is full, don't schedule now, which will push back on the scheduling of splits
        if (canAssignSplitToDistributionNode(assignmentStats, node, maxSplitsWeightPerNode, maxPendingSplitsWeightPerTask, maxUnacknowledgedSplitsPerTask, splitWeight)) {
            if (isCacheable) {
                split = new Split(split.getConnectorId(), split.getTransactionHandle(), split.getConnectorSplit(), split.getLifespan(), new SplitContext(true));
                nodeSelectionStats.incrementBucketedPreferredNodeSelectedCount();
            }
            else {
                nodeSelectionStats.incrementBucketedNonPreferredNodeSelectedCount();
            }
            assignments.put(node, split);
            assignmentStats.addAssignedSplit(node, splitWeight);
        }
        else {
            blockedNodes.add(node);
        }
    }
    ListenableFuture<?> blocked = toWhenHasSplitQueueSpaceFuture(blockedNodes, existingTasks, calculateLowWatermark(maxPendingSplitsWeightPerTask));
    return new SplitPlacementResult(blocked, ImmutableMultimap.copyOf(assignments));
}
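The push-back here hinges on canAssignSplitToDistributionNode, whose body isn't shown above. A simplified illustration of this style of weight-based admission check, not the exact Presto predicate (the method and parameter names below are assumptions):

class AdmissionSketch
{
    // A node accepts a split only while its unacknowledged-split count and its
    // split weights stay under the configured caps; otherwise the split is held
    // back and the node lands in blockedNodes.
    static boolean canAssignSketch(
            int unacknowledgedSplits, int maxUnacknowledgedSplitsPerTask,
            long assignedWeight, long maxSplitsWeightPerNode,
            long queuedWeight, long maxPendingSplitsWeightPerTask,
            long splitWeight)
    {
        return unacknowledgedSplits < maxUnacknowledgedSplitsPerTask
                && assignedWeight + splitWeight <= maxSplitsWeightPerNode
                && queuedWeight + splitWeight <= maxPendingSplitsWeightPerTask;
    }
}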
Use of com.facebook.presto.metadata.InternalNode in project presto by prestodb.
In the class SimpleNodeSelector, the method computeAssignments:
@Override
public SplitPlacementResult computeAssignments(Set<Split> splits, List<RemoteTask> existingTasks)
{
    Multimap<InternalNode, Split> assignment = HashMultimap.create();
    NodeMap nodeMap = this.nodeMap.get().get();
    NodeAssignmentStats assignmentStats = new NodeAssignmentStats(nodeTaskMap, nodeMap, existingTasks);
    List<InternalNode> eligibleNodes = getEligibleNodes(maxTasksPerStage, nodeMap, existingTasks);
    NodeSelection randomNodeSelection = new RandomNodeSelection(eligibleNodes, minCandidates);
    Set<InternalNode> blockedExactNodes = new HashSet<>();
    boolean splitWaitingForAnyNode = false;
    NodeProvider nodeProvider = nodeMap.getActiveNodeProvider(nodeSelectionHashStrategy);
    OptionalInt preferredNodeCount = OptionalInt.empty();
    for (Split split : splits) {
        List<InternalNode> candidateNodes;
        switch (split.getNodeSelectionStrategy()) {
            case HARD_AFFINITY:
                candidateNodes = selectExactNodes(nodeMap, split.getPreferredNodes(nodeProvider), includeCoordinator);
                preferredNodeCount = OptionalInt.of(candidateNodes.size());
                break;
            case SOFT_AFFINITY:
                // Use all nodes for soft affinity scheduling with modular hashing, because otherwise temporarily down nodes would trigger too much rehashing
                if (nodeSelectionHashStrategy == MODULAR_HASHING) {
                    nodeProvider = new ModularHashingNodeProvider(nodeMap.getAllNodes());
                }
                candidateNodes = selectExactNodes(nodeMap, split.getPreferredNodes(nodeProvider), includeCoordinator);
                preferredNodeCount = OptionalInt.of(candidateNodes.size());
                candidateNodes = ImmutableList.<InternalNode>builder()
                        .addAll(candidateNodes)
                        .addAll(randomNodeSelection.pickNodes(split))
                        .build();
                break;
            case NO_PREFERENCE:
                candidateNodes = randomNodeSelection.pickNodes(split);
                break;
            default:
                throw new PrestoException(NODE_SELECTION_NOT_SUPPORTED, format("Unsupported node selection strategy %s", split.getNodeSelectionStrategy()));
        }
        if (candidateNodes.isEmpty()) {
            log.debug("No nodes available to schedule %s. Available nodes %s", split, nodeMap.getActiveNodes());
            throw new PrestoException(NO_NODES_AVAILABLE, "No nodes available to run query");
        }
        SplitWeight splitWeight = split.getSplitWeight();
        Optional<InternalNodeInfo> chosenNodeInfo = chooseLeastBusyNode(splitWeight, candidateNodes, assignmentStats::getTotalSplitsWeight, preferredNodeCount, maxSplitsWeightPerNode, assignmentStats);
        if (!chosenNodeInfo.isPresent()) {
            chosenNodeInfo = chooseLeastBusyNode(splitWeight, candidateNodes, assignmentStats::getQueuedSplitsWeightForStage, preferredNodeCount, maxPendingSplitsWeightPerTask, assignmentStats);
        }
        if (chosenNodeInfo.isPresent()) {
            split = new Split(split.getConnectorId(), split.getTransactionHandle(), split.getConnectorSplit(), split.getLifespan(), new SplitContext(chosenNodeInfo.get().isCacheable()));
            InternalNode chosenNode = chosenNodeInfo.get().getInternalNode();
            assignment.put(chosenNode, split);
            assignmentStats.addAssignedSplit(chosenNode, splitWeight);
        }
        else {
            if (split.getNodeSelectionStrategy() != HARD_AFFINITY) {
                splitWaitingForAnyNode = true;
            }
            // The exact node set won't matter if a split is already waiting for any node
            else if (!splitWaitingForAnyNode) {
                blockedExactNodes.addAll(candidateNodes);
            }
        }
    }
    ListenableFuture<?> blocked;
    if (splitWaitingForAnyNode) {
        blocked = toWhenHasSplitQueueSpaceFuture(existingTasks, calculateLowWatermark(maxPendingSplitsWeightPerTask));
    }
    else {
        blocked = toWhenHasSplitQueueSpaceFuture(blockedExactNodes, existingTasks, calculateLowWatermark(maxPendingSplitsWeightPerTask));
    }
    return new SplitPlacementResult(blocked, assignment);
}
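chooseLeastBusyNode runs twice with different weight accessors: first against each node's total assigned weight, then, if no node qualifies, against its queued weight for the stage. A standalone sketch of that "least busy under a cap" selection (the generic type and names are hypothetical, not the Presto signature):

import java.util.List;
import java.util.Optional;
import java.util.function.ToLongFunction;

class LeastBusySketch
{
    // Pick the candidate with the smallest current weight, but only if adding
    // the split's weight keeps it under the cap; empty means every node is full.
    static <N> Optional<N> chooseLeastBusy(List<N> candidates, ToLongFunction<N> weightOf, long splitWeight, long maxWeight)
    {
        N best = null;
        long bestWeight = Long.MAX_VALUE;
        for (N node : candidates) {
            long weight = weightOf.applyAsLong(node);
            if (weight + splitWeight <= maxWeight && weight < bestWeight) {
                best = node;
                bestWeight = weight;
            }
        }
        return Optional.ofNullable(best);
    }
}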
Use of com.facebook.presto.metadata.InternalNode in project presto by prestodb.
In the class SimpleTtlNodeSelector, the method getEligibleNodes:
private List<InternalNode> getEligibleNodes(int limit, NodeMap nodeMap, List<RemoteTask> existingTasks)
{
    Map<InternalNode, NodeTtl> nodeTtlInfo = nodeTtlFetcherManager.getAllTtls();
    Map<InternalNode, Optional<ConfidenceBasedTtlInfo>> ttlInfo = nodeTtlInfo.entrySet().stream()
            .collect(toImmutableMap(
                    Map.Entry::getKey,
                    e -> e.getValue().getTtlInfo().stream().min(Comparator.comparing(ConfidenceBasedTtlInfo::getExpiryInstant))));
    Duration estimatedExecutionTimeRemaining = getEstimatedExecutionTimeRemaining();
    // Of the nodes that already have existing tasks, pick only those whose TTL is long enough
    List<InternalNode> existingEligibleNodes = existingTasks.stream()
            .map(remoteTask -> nodeMap.getActiveNodesByNodeId().get(remoteTask.getNodeId()))
            .filter(Objects::nonNull)
            .filter(ttlInfo::containsKey)
            .filter(node -> ttlInfo.get(node).isPresent())
            .filter(node -> isTtlEnough(ttlInfo.get(node).get(), estimatedExecutionTimeRemaining))
            .collect(toList());
    int alreadySelectedNodeCount = existingEligibleNodes.size();
    List<InternalNode> activeNodes = nodeMap.getActiveNodes();
    List<InternalNode> newEligibleNodes = filterNodesByTtl(activeNodes, ImmutableSet.copyOf(existingEligibleNodes), ttlInfo, estimatedExecutionTimeRemaining);
    if (alreadySelectedNodeCount < limit && !newEligibleNodes.isEmpty()) {
        List<InternalNode> moreNodes = selectNodes(limit - alreadySelectedNodeCount, new ResettableRandomizedIterator<>(newEligibleNodes));
        existingEligibleNodes.addAll(moreNodes);
    }
    verify(existingEligibleNodes.stream().allMatch(Objects::nonNull), "existingNodes list must not contain any nulls");
    return existingEligibleNodes;
}
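The filters rely on isTtlEnough, which presumably compares a node's time left before expiry against the query's estimated remaining execution time. A hedged standalone sketch, assuming the expiry instant is exposed as epoch seconds (an assumption about ConfidenceBasedTtlInfo.getExpiryInstant(), not confirmed by the snippet):

import java.time.Duration;
import java.time.Instant;

class TtlSketch
{
    // A node's TTL is "enough" when the window until its expiry covers the
    // estimated remaining execution time of the query.
    static boolean isTtlEnough(long expiryEpochSeconds, Duration estimatedExecutionTimeRemaining)
    {
        Duration timeLeft = Duration.between(Instant.now(), Instant.ofEpochSecond(expiryEpochSeconds));
        return timeLeft.compareTo(estimatedExecutionTimeRemaining) >= 0;
    }
}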
Use of com.facebook.presto.metadata.InternalNode in project presto by prestodb.
In the class SqlStageExecution, the method scheduleTask:
private synchronized RemoteTask scheduleTask(InternalNode node, TaskId taskId, Multimap<PlanNodeId, Split> sourceSplits)
{
    checkArgument(!allTasks.contains(taskId), "A task with id %s already exists", taskId);
    ImmutableMultimap.Builder<PlanNodeId, Split> initialSplits = ImmutableMultimap.builder();
    initialSplits.putAll(sourceSplits);
    sourceTasks.forEach((planNodeId, task) -> {
        TaskStatus status = task.getTaskStatus();
        if (status.getState() != TaskState.FINISHED) {
            initialSplits.put(planNodeId, createRemoteSplitFor(taskId, task.getRemoteTaskLocation(), task.getTaskId()));
        }
    });
    OutputBuffers outputBuffers = this.outputBuffers.get();
    checkState(outputBuffers != null, "Initial output buffers must be set before a task can be scheduled");
    RemoteTask task = remoteTaskFactory.createRemoteTask(
            session,
            taskId,
            node,
            planFragment,
            initialSplits.build(),
            outputBuffers,
            nodeTaskMap.createTaskStatsTracker(node, taskId),
            summarizeTaskInfo,
            tableWriteInfo);
    completeSources.forEach(task::noMoreSplits);
    allTasks.add(taskId);
    tasks.computeIfAbsent(node, key -> newConcurrentHashSet()).add(task);
    nodeTaskMap.addTask(node, task);
    task.addStateChangeListener(new StageTaskListener(taskId));
    task.addFinalTaskInfoListener(this::updateFinalTaskInfo);
    if (!stateMachine.getState().isDone()) {
        task.start();
    }
    else {
        // stage finished while we were scheduling this task
        task.abort();
    }
    return task;
}
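The final start-or-abort branch guards a race: the stage can reach a terminal state while the task is still being built and registered. A minimal sketch of the same check-then-undo pattern, detached from Presto's state machine (types and names are hypothetical):

import java.util.concurrent.atomic.AtomicBoolean;

class StartOrAbortSketch
{
    // If the stage finished while the task was being scheduled, abort the task
    // instead of starting it, so nothing is leaked during shutdown.
    static void startOrAbort(AtomicBoolean stageDone, Runnable start, Runnable abort)
    {
        if (!stageDone.get()) {
            start.run();
        }
        else {
            // stage finished while we were scheduling this task
            abort.run();
        }
    }
}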