use of io.prestosql.spi.StandardErrorCode.NO_NODES_AVAILABLE in project hetu-core by openlookeng.
the class NodePartitioningManager method getNodePartitioningMap.
public NodePartitionMap getNodePartitioningMap(Session session, PartitioningHandle partitioningHandle, boolean isSnapshotEnabled, Integer nodeCount) {
requireNonNull(session, "session is null");
requireNonNull(partitioningHandle, "partitioningHandle is null");
if (partitioningHandle.getConnectorHandle() instanceof SystemPartitioningHandle) {
return ((SystemPartitioningHandle) partitioningHandle.getConnectorHandle()).getNodePartitionMap(session, nodeScheduler, isSnapshotEnabled, nodeCount);
}
CatalogName catalogName = partitioningHandle.getConnectorId().orElseThrow(() -> new IllegalArgumentException("No connector ID for partitioning handle: " + partitioningHandle));
ConnectorNodePartitioningProvider partitioningProvider = partitioningProviders.get(catalogName);
checkArgument(partitioningProvider != null, "No partitioning provider for connector %s", catalogName);
ConnectorBucketNodeMap connectorBucketNodeMap = getConnectorBucketNodeMap(session, partitioningHandle);
// safety check for crazy partitioning
checkArgument(connectorBucketNodeMap.getBucketCount() < 1_000_000, "Too many buckets in partitioning: %s", connectorBucketNodeMap.getBucketCount());
List<InternalNode> bucketToNode;
if (connectorBucketNodeMap.hasFixedMapping()) {
bucketToNode = getFixedMapping(connectorBucketNodeMap);
} else {
NodeSelector nodeSelector = nodeScheduler.createNodeSelector(catalogName, false, null);
List<InternalNode> nodes;
if (isSnapshotEnabled) {
Integer count = nodeCount;
if (count == null) {
// Initial schedule: reserve some nodes
count = calculateTaskCount(nodeSelector.selectableNodeCount());
}
nodes = nodeSelector.selectRandomNodes(count);
checkCondition(nodes.size() == count, NO_NODES_AVAILABLE, "Snapshot: not enough worker nodes to resume expected number of tasks: " + count);
} else {
nodes = nodeSelector.allNodes();
}
bucketToNode = createArbitraryBucketToNode(nodes, connectorBucketNodeMap.getBucketCount());
}
int[] bucketToPartition = new int[connectorBucketNodeMap.getBucketCount()];
BiMap<InternalNode, Integer> nodeToPartition = HashBiMap.create();
int nextPartitionId = 0;
for (int bucket = 0; bucket < bucketToNode.size(); bucket++) {
InternalNode node = bucketToNode.get(bucket);
Integer partitionId = nodeToPartition.get(node);
if (partitionId == null) {
partitionId = nextPartitionId++;
nodeToPartition.put(node, partitionId);
}
bucketToPartition[bucket] = partitionId;
}
List<InternalNode> partitionToNode = IntStream.range(0, nodeToPartition.size()).mapToObj(partitionId -> nodeToPartition.inverse().get(partitionId)).collect(toImmutableList());
return new NodePartitionMap(partitionToNode, bucketToPartition, getSplitToBucket(session, partitioningHandle));
}
use of io.prestosql.spi.StandardErrorCode.NO_NODES_AVAILABLE in project hetu-core by openlookeng.
the class SimpleNodeSelector method computeAssignments.
@Override
public SplitPlacementResult computeAssignments(Set<Split> splits, List<RemoteTask> existingTasks, Optional<SqlStageExecution> stage) {
Multimap<InternalNode, Split> assignment = HashMultimap.create();
NodeMap nodeMapSlice = this.nodeMap.get().get();
NodeAssignmentStats assignmentStats = new NodeAssignmentStats(nodeTaskMap, nodeMapSlice, existingTasks);
ResettableRandomizedIterator<InternalNode> randomCandidates = randomizedNodes(nodeMapSlice, ImmutableSet.of());
Set<InternalNode> blockedExactNodes = new HashSet<>();
boolean splitWaitingForAnyNode = false;
// splitsToBeRedistributed becomes true only when splits go through locality-based assignment
boolean splitsToBeRedistributed = false;
Set<Split> remainingSplits = new HashSet<>();
// Check if the current stage has a TableScanNode which is reading the table for the 2nd time or beyond
if (stage.isPresent() && stage.get().getStateMachine().getConsumerScanNode() != null) {
try {
// if node exists, get the TableScanNode and cast it as consumer
TableScanNode consumer = stage.get().getStateMachine().getConsumerScanNode();
// all tables part of this stage
Map<PlanNodeId, TableInfo> tables = stage.get().getStageInfo().getTables();
QualifiedObjectName tableName;
for (Map.Entry<PlanNodeId, TableInfo> entry : tables.entrySet()) {
tableName = entry.getValue().getTableName();
if (tableSplitAssignmentInfo.getReuseTableScanMappingIdSplitAssignmentMap().containsKey(consumer.getReuseTableScanMappingId())) {
// compare splitkey using equals and then assign nodes accordingly.
HashMap<SplitKey, InternalNode> splitKeyNodeAssignment = tableSplitAssignmentInfo.getSplitKeyNodeAssignment(consumer.getReuseTableScanMappingId());
Set<SplitKey> splitKeySet = splitKeyNodeAssignment.keySet();
assignment.putAll(createConsumerScanNodeAssignment(tableName, splits, splitKeySet, splitKeyNodeAssignment));
for (Map.Entry<InternalNode, Split> nodeAssignmentEntry : assignment.entries()) {
InternalNode node = nodeAssignmentEntry.getKey();
assignmentStats.addAssignedSplit(node);
}
}
}
log.debug("Consumer:: Assignment size is " + assignment.size() + " ,Assignment is " + assignment + " ,Assignment Stats is " + assignmentStats);
} catch (NotImplementedException e) {
log.error("Not a Hive Split! Other Connector Splits not supported currently. Error: " + e);
throw new UnsupportedOperationException("Not a Hive Split! Other Connector Splits not supported currently. Error: " + e);
}
} else {
// optimizedLocalScheduling enables prioritized assignment of splits to local nodes when splits contain locality information
if (optimizedLocalScheduling) {
// should not hit for consumer case
for (Split split : splits) {
if (split.isRemotelyAccessible() && !split.getAddresses().isEmpty()) {
List<InternalNode> candidateNodes = selectExactNodes(nodeMapSlice, split.getAddresses(), includeCoordinator);
Optional<InternalNode> chosenNode = candidateNodes.stream().filter(ownerNode -> assignmentStats.getTotalSplitCount(ownerNode) < maxSplitsPerNode).min(comparingInt(assignmentStats::getTotalSplitCount));
if (chosenNode.isPresent()) {
assignment.put(chosenNode.get(), split);
// check later
assignmentStats.addAssignedSplit(chosenNode.get());
splitsToBeRedistributed = true;
continue;
}
}
remainingSplits.add(split);
}
} else {
remainingSplits = splits;
}
for (Split split : remainingSplits) {
randomCandidates.reset();
List<InternalNode> candidateNodes;
if (!split.isRemotelyAccessible()) {
candidateNodes = selectExactNodes(nodeMapSlice, split.getAddresses(), includeCoordinator);
} else {
candidateNodes = selectNodes(minCandidates, randomCandidates);
}
if (candidateNodes.isEmpty()) {
log.debug("No nodes available to schedule %s. Available nodes %s", split, nodeMapSlice.getNodesByHost().keys());
throw new PrestoException(NO_NODES_AVAILABLE, "No nodes available to run query");
}
InternalNode chosenNode = null;
int min = Integer.MAX_VALUE;
for (InternalNode node : candidateNodes) {
int totalSplitCount = assignmentStats.getTotalSplitCount(node);
if (totalSplitCount < min && totalSplitCount < maxSplitsPerNode) {
chosenNode = node;
min = totalSplitCount;
}
}
if (chosenNode == null) {
// min is guaranteed to be MAX_VALUE at this line
for (InternalNode node : candidateNodes) {
int totalSplitCount = assignmentStats.getQueuedSplitCountForStage(node);
if (totalSplitCount < min && totalSplitCount < maxPendingSplitsPerTask) {
chosenNode = node;
min = totalSplitCount;
}
}
}
if (chosenNode != null) {
assignment.put(chosenNode, split);
assignmentStats.addAssignedSplit(chosenNode);
} else {
if (split.isRemotelyAccessible()) {
splitWaitingForAnyNode = true;
} else // Exact node set won't matter, if a split is waiting for any node
if (!splitWaitingForAnyNode) {
blockedExactNodes.addAll(candidateNodes);
}
}
}
}
ListenableFuture<?> blocked;
if (splitWaitingForAnyNode) {
blocked = toWhenHasSplitQueueSpaceFuture(existingTasks, calculateLowWatermark(maxPendingSplitsPerTask));
} else {
blocked = toWhenHasSplitQueueSpaceFuture(blockedExactNodes, existingTasks, calculateLowWatermark(maxPendingSplitsPerTask));
}
if (!stage.isPresent() || stage.get().getStateMachine().getConsumerScanNode() == null) {
if (splitsToBeRedistributed) {
// skip for consumer
equateDistribution(assignment, assignmentStats, nodeMapSlice);
}
}
// Check if the current stage has a TableScanNode which is reading the table for the 1st time
if (stage.isPresent() && stage.get().getStateMachine().getProducerScanNode() != null) {
// if node exists, get the TableScanNode and annotate it as producer
saveProducerScanNodeAssignment(stage, assignment, assignmentStats);
}
// Check if its CTE node and its feeder
if (stage.isPresent() && stage.get().getFragment().getFeederCTEId().isPresent()) {
updateFeederNodeAndSplitCount(stage.get(), assignment);
}
return new SplitPlacementResult(blocked, assignment);
}
use of io.prestosql.spi.StandardErrorCode.NO_NODES_AVAILABLE in project hetu-core by openlookeng.
the class SqlQueryScheduler method createStages.
private List<SqlStageExecution> createStages(ExchangeLocationsConsumer parent, AtomicInteger nextStageId, LocationFactory locationFactory, StageExecutionPlan plan, NodeScheduler nodeScheduler, RemoteTaskFactory remoteTaskFactory, Session session, int splitBatchSize, BiFunction<PartitioningHandle, Integer, NodePartitionMap> partitioningCache, NodePartitioningManager nodePartitioningManager, ExecutorService queryExecutor, ScheduledExecutorService schedulerExecutor, FailureDetector failureDetector, NodeTaskMap nodeTaskMap, ImmutableMap.Builder<StageId, StageScheduler> stageSchedulers, ImmutableMap.Builder<StageId, StageLinkage> stageLinkages, boolean isSnapshotEnabled, QuerySnapshotManager snapshotManager, Map<StageId, Integer> stageTaskCounts) {
ImmutableList.Builder<SqlStageExecution> localStages = ImmutableList.builder();
StageId stageId = new StageId(queryStateMachine.getQueryId(), nextStageId.getAndIncrement());
SqlStageExecution stageExecution = createSqlStageExecution(stageId, locationFactory.createStageLocation(stageId), plan.getFragment(), plan.getTables(), remoteTaskFactory, session, summarizeTaskInfo, nodeTaskMap, queryExecutor, failureDetector, schedulerStats, dynamicFilterService, snapshotManager);
localStages.add(stageExecution);
Optional<int[]> bucketToPartition;
PartitioningHandle partitioningHandle = plan.getFragment().getPartitioning();
boolean keepConsumerOnFeederNodes = !plan.getFragment().getFeederCTEId().isPresent() && plan.getFragment().getFeederCTEParentId().isPresent();
if (partitioningHandle.equals(SOURCE_DISTRIBUTION)) {
// nodes are selected dynamically based on the constraints of the splits and the system load
Entry<PlanNodeId, SplitSource> entry = Iterables.getOnlyElement(plan.getSplitSources().entrySet());
PlanNodeId planNodeId = entry.getKey();
SplitSource splitSource = entry.getValue();
CatalogName catalogName = splitSource.getCatalogName();
if (isInternalSystemConnector(catalogName)) {
catalogName = null;
}
NodeSelector nodeSelector = nodeScheduler.createNodeSelector(catalogName, keepConsumerOnFeederNodes, feederScheduledNodes);
if (isSnapshotEnabled) {
// When snapshot is enabled, then no task can be added after the query started running,
// otherwise assumptions about how many "input channels" may be broken.
nodeSelector.lockDownNodes();
}
SplitPlacementPolicy placementPolicy = new DynamicSplitPlacementPolicy(nodeSelector, stageExecution::getAllTasks);
checkArgument(!plan.getFragment().getStageExecutionDescriptor().isStageGroupedExecution());
stageSchedulers.put(stageId, newSourcePartitionedSchedulerAsStageScheduler(stageExecution, planNodeId, splitSource, placementPolicy, splitBatchSize, session, heuristicIndexerManager));
bucketToPartition = Optional.of(new int[1]);
} else if (partitioningHandle.equals(SCALED_WRITER_DISTRIBUTION)) {
bucketToPartition = Optional.of(new int[1]);
} else {
Map<PlanNodeId, SplitSource> splitSources = plan.getSplitSources();
if (!splitSources.isEmpty()) {
// contains local source
List<PlanNodeId> schedulingOrder = plan.getFragment().getPartitionedSources();
CatalogName catalogName = partitioningHandle.getConnectorId().orElseThrow(IllegalStateException::new);
List<ConnectorPartitionHandle> connectorPartitionHandles;
boolean groupedExecutionForStage = plan.getFragment().getStageExecutionDescriptor().isStageGroupedExecution();
if (groupedExecutionForStage) {
connectorPartitionHandles = nodePartitioningManager.listPartitionHandles(session, partitioningHandle);
checkState(!ImmutableList.of(NOT_PARTITIONED).equals(connectorPartitionHandles));
} else {
connectorPartitionHandles = ImmutableList.of(NOT_PARTITIONED);
}
BucketNodeMap bucketNodeMap;
List<InternalNode> stageNodeList;
if (plan.getFragment().getRemoteSourceNodes().stream().allMatch(node -> node.getExchangeType() == REPLICATE)) {
// no remote source
boolean dynamicLifespanSchedule = plan.getFragment().getStageExecutionDescriptor().isDynamicLifespanSchedule();
if (isSnapshotEnabled) {
NodeSelector nodeSelector = nodeScheduler.createNodeSelector(catalogName, keepConsumerOnFeederNodes, feederScheduledNodes);
int nodeCount;
if (stageTaskCounts != null) {
// Resuming: need to create same number of tasks as old stage.
nodeCount = stageTaskCounts.get(stageId);
} else {
// Scheduling: reserve some nodes for resuming
nodeCount = calculateTaskCount(nodeSelector.selectableNodeCount());
}
stageNodeList = new ArrayList<>(nodeSelector.selectRandomNodes(nodeCount));
checkCondition(stageNodeList.size() == nodeCount, NO_NODES_AVAILABLE, "Snapshot: not enough worker nodes to resume expected number of tasks: " + nodeCount);
// Make sure bucketNodeMap uses the same node list
bucketNodeMap = nodePartitioningManager.getBucketNodeMap(session, partitioningHandle, dynamicLifespanSchedule, stageNodeList);
} else {
bucketNodeMap = nodePartitioningManager.getBucketNodeMap(session, partitioningHandle, dynamicLifespanSchedule);
stageNodeList = new ArrayList<>(nodeScheduler.createNodeSelector(catalogName, keepConsumerOnFeederNodes, feederScheduledNodes).allNodes());
}
// verify execution is consistent with planner's decision on dynamic lifespan schedule
verify(bucketNodeMap.isDynamic() == dynamicLifespanSchedule);
Collections.shuffle(stageNodeList);
bucketToPartition = Optional.empty();
} else {
// cannot use dynamic lifespan schedule
verify(!plan.getFragment().getStageExecutionDescriptor().isDynamicLifespanSchedule());
// remote source requires nodePartitionMap
NodePartitionMap nodePartitionMap = partitioningCache.apply(plan.getFragment().getPartitioning(), stageTaskCounts == null ? null : stageTaskCounts.get(stageId));
if (groupedExecutionForStage) {
checkState(connectorPartitionHandles.size() == nodePartitionMap.getBucketToPartition().length);
}
stageNodeList = nodePartitionMap.getPartitionToNode();
bucketNodeMap = nodePartitionMap.asBucketNodeMap();
bucketToPartition = Optional.of(nodePartitionMap.getBucketToPartition());
}
stageSchedulers.put(stageId, new FixedSourcePartitionedScheduler(stageExecution, splitSources, plan.getFragment().getStageExecutionDescriptor(), schedulingOrder, stageNodeList, bucketNodeMap, splitBatchSize, getConcurrentLifespansPerNode(session), nodeScheduler.createNodeSelector(catalogName, keepConsumerOnFeederNodes, feederScheduledNodes), connectorPartitionHandles, session, heuristicIndexerManager));
} else {
// all sources are remote
NodePartitionMap nodePartitionMap = partitioningCache.apply(plan.getFragment().getPartitioning(), stageTaskCounts == null ? null : stageTaskCounts.get(stageId));
List<InternalNode> partitionToNode = nodePartitionMap.getPartitionToNode();
// todo this should asynchronously wait a standard timeout period before failing
checkCondition(!partitionToNode.isEmpty(), NO_NODES_AVAILABLE, "No worker nodes available");
stageSchedulers.put(stageId, new FixedCountScheduler(stageExecution, partitionToNode));
bucketToPartition = Optional.of(nodePartitionMap.getBucketToPartition());
}
}
ImmutableSet.Builder<SqlStageExecution> childStagesBuilder = ImmutableSet.builder();
for (StageExecutionPlan subStagePlan : plan.getSubStages()) {
if (visitedPlanFrags.contains(subStagePlan.getFragment().getId())) {
continue;
}
visitedPlanFrags.add(subStagePlan.getFragment().getId());
List<SqlStageExecution> subTree = createStages(stageExecution::addExchangeLocations, nextStageId, locationFactory, subStagePlan.withBucketToPartition(bucketToPartition), nodeScheduler, remoteTaskFactory, session, splitBatchSize, partitioningCache, nodePartitioningManager, queryExecutor, schedulerExecutor, failureDetector, nodeTaskMap, stageSchedulers, stageLinkages, isSnapshotEnabled, snapshotManager, stageTaskCounts);
localStages.addAll(subTree);
SqlStageExecution childStage = subTree.get(0);
childStagesBuilder.add(childStage);
Optional<RemoteSourceNode> parentNode = plan.getFragment().getRemoteSourceNodes().stream().filter(x -> x.getSourceFragmentIds().contains(childStage.getFragment().getId())).findAny();
checkArgument(parentNode.isPresent(), "Couldn't find parent of a CTE node");
childStage.setParentId(parentNode.get().getId());
}
Set<SqlStageExecution> childStages = childStagesBuilder.build();
stageExecution.addStateChangeListener(newState -> {
if (newState.isDone() && newState != StageState.RESCHEDULING) {
// Snapshot: For "rescheduling", tasks are already cancelled (for resume)
childStages.forEach(SqlStageExecution::cancel);
}
});
stageLinkages.put(stageId, new StageLinkage(plan.getFragment().getId(), parent, childStages));
if (partitioningHandle.equals(SCALED_WRITER_DISTRIBUTION)) {
Supplier<Collection<TaskStatus>> sourceTasksProvider = () -> childStages.stream().map(SqlStageExecution::getAllTasks).flatMap(Collection::stream).map(RemoteTask::getTaskStatus).collect(toList());
Supplier<Collection<TaskStatus>> writerTasksProvider = () -> stageExecution.getAllTasks().stream().map(RemoteTask::getTaskStatus).collect(toList());
ScaledWriterScheduler scheduler = new ScaledWriterScheduler(stageExecution, sourceTasksProvider, writerTasksProvider, nodeScheduler.createNodeSelector(null, keepConsumerOnFeederNodes, feederScheduledNodes), schedulerExecutor, getWriterMinSize(session), isSnapshotEnabled, stageTaskCounts != null ? stageTaskCounts.get(stageId) : null);
whenAllStages(childStages, StageState::isDone).addListener(scheduler::finish, directExecutor());
stageSchedulers.put(stageId, scheduler);
}
return localStages.build();
}
Aggregations