Use of io.prestosql.sql.planner.PartitioningHandle in project hetu-core by openlookeng.
From class SqlQueryExecution, method planDistribution.
/**
 * Produces the distributed stage execution plan for {@code plan} and builds the
 * query scheduler for it. The scheduler is only created and published through
 * {@code queryScheduler}; it is not started here.
 *
 * @param plan the root of the fragmented query plan
 */
private void planDistribution(PlanRoot plan) {
    // bracket the distribution planning so the state machine can time it
    stateMachine.beginDistributedPlanning();

    // plan the execution on the active nodes
    DistributedExecutionPlanner planner = new DistributedExecutionPlanner(splitManager, metadata);
    Session querySession = stateMachine.getSession();
    final StageExecutionPlan rootStagePlan;
    if (!SystemSessionProperties.isSnapshotEnabled(querySession)) {
        rootStagePlan = planner.plan(plan.getRoot(), querySession, NORMAL, null, 0);
    } else {
        // Snapshot: planning differs when snapshot is enabled.
        // See the "plan" method for the difference between the modes.
        MarkerAnnouncer markerAnnouncer = splitManager.getMarkerAnnouncer(querySession);
        markerAnnouncer.setSnapshotManager(snapshotManager);
        rootStagePlan = planner.plan(
                plan.getRoot(),
                querySession,
                SNAPSHOT,
                null,
                markerAnnouncer.currentSnapshotId());
    }

    stateMachine.endDistributedPlanning();

    // make sure the split sources get closed once the query reaches a done state
    stateMachine.addStateChangeListener(newState -> {
        if (!newState.isDone()) {
            return;
        }
        closeSplitSources(rootStagePlan);
    });

    // nothing more to do if the query was canceled in the meantime
    if (stateMachine.isDone()) {
        return;
    }

    // record output field names and types
    stateMachine.setColumns(
            rootStagePlan.getFieldNames(),
            rootStagePlan.getFragment().getTypes());

    PartitioningHandle rootPartitioning = plan.getRoot()
            .getFragment()
            .getPartitioningScheme()
            .getPartitioning()
            .getHandle();
    OutputBuffers rootOutputBuffers = createInitialEmptyOutputBuffers(rootPartitioning)
            .withBuffer(OUTPUT_BUFFER_ID, BROADCAST_PARTITION_ID)
            .withNoMoreBufferIds();

    // build the stage execution objects (this doesn't schedule execution)
    SqlQueryScheduler newScheduler = createSqlQueryScheduler(
            stateMachine,
            locationFactory,
            rootStagePlan,
            nodePartitioningManager,
            nodeScheduler,
            remoteTaskFactory,
            stateMachine.getSession(),
            plan.isSummarizeTaskInfos(),
            scheduleSplitBatchSize,
            queryExecutor,
            schedulerExecutor,
            failureDetector,
            rootOutputBuffers,
            nodeTaskMap,
            executionPolicy,
            schedulerStats,
            dynamicFilterService,
            heuristicIndexerManager,
            snapshotManager,
            null);
    queryScheduler.set(newScheduler);

    // if the query became done while the scheduler was being created, abort the
    // scheduler directly since the state-change callback may have already fired
    if (stateMachine.isDone()) {
        newScheduler.abort();
        queryScheduler.set(null);
    }
}
Use of io.prestosql.sql.planner.PartitioningHandle in project hetu-core by openlookeng.
From class SqlQueryExecution, method resumeQuery.
/**
 * Replaces the current query scheduler with a freshly created "resume" scheduler
 * and starts it, after waiting for the previous scheduler to finish scheduling.
 *
 * <p>If creating the resume scheduler fails with {@code NO_NODES_AVAILABLE}, all
 * snapshots are invalidated and scheduler creation is attempted once more; any
 * other {@link PrestoException} is rethrown unchanged.
 *
 * @param plan the root of the fragmented query plan to reschedule
 * @throws RuntimeException wrapping any failure (including interruption) raised
 *         while waiting for the previous scheduler
 */
private void resumeQuery(PlanRoot plan) {
    SqlQueryScheduler oldScheduler = queryScheduler.get();
    try {
        // Wait for previous scheduler to finish.
        // This is important, otherwise the old scheduler may close split sources after the new scheduler has started.
        oldScheduler.doneScheduling().get();
    } catch (Exception e) {
        if (e instanceof InterruptedException) {
            // Restore the interrupt flag so code further up the stack can still observe the interruption.
            Thread.currentThread().interrupt();
        }
        throw new RuntimeException(e);
    }
    log.debug("Rescheduling query %s from a resumable task failure.", getQueryId());

    PartitioningHandle partitioningHandle = plan.getRoot().getFragment().getPartitioningScheme().getPartitioning().getHandle();
    OutputBuffers rootOutputBuffers = createInitialEmptyOutputBuffers(partitioningHandle)
            .withBuffer(OUTPUT_BUFFER_ID, BROADCAST_PARTITION_ID)
            .withNoMoreBufferIds();

    // build the stage execution objects (this doesn't schedule execution)
    SqlQueryScheduler scheduler;
    try {
        scheduler = createResumeScheduler(plan, rootOutputBuffers);
    } catch (PrestoException e) {
        // Compare error codes by value rather than by reference identity.
        if (!NO_NODES_AVAILABLE.toErrorCode().equals(e.getErrorCode())) {
            throw e;
        }
        // Not enough workers to resume all tasks. Retrying from any saved snapshot likely won't work either.
        // Clear ongoing and existing snapshots and restart.
        snapshotManager.invalidateAllSnapshots();
        scheduler = createResumeScheduler(plan, rootOutputBuffers);
    }
    queryScheduler.set(scheduler);

    log.debug("Restarting query %s from a resumable task failure.", getQueryId());
    scheduler.start();
    stateMachine.transitionToStarting();
}
Use of io.prestosql.sql.planner.PartitioningHandle in project hetu-core by openlookeng.
From class SqlQueryScheduler, method createStages.
/**
 * Creates the stage execution for {@code plan}'s fragment, registers a stage scheduler
 * and a stage linkage for it, and then recursively does the same for every sub-stage
 * plan whose fragment has not been visited yet.
 *
 * <p>The scheduler type is chosen from the fragment's partitioning handle:
 * source-distributed fragments, scaled-writer fragments, fragments with local split
 * sources, and fragments whose sources are all remote are each handled separately.
 *
 * @param parent exchange-locations consumer recorded in this stage's {@code StageLinkage}
 * @param nextStageId counter supplying the numeric part of each new {@code StageId}
 * @param plan the stage execution plan to materialize (including its sub-stages)
 * @param partitioningCache function applied to (partitioning handle, task count or {@code null})
 *        to obtain a {@code NodePartitionMap}
 * @param stageSchedulers builder that receives one scheduler per created stage
 * @param stageLinkages builder that receives one linkage per created stage
 * @param isSnapshotEnabled whether snapshot-specific node selection is applied
 * @param stageTaskCounts per-stage task counts, or {@code null}; when non-null the counts are
 *        looked up by stage id (the inline comments describe this as the resume case)
 * @return the created stages, with this stage first followed by the stages of each sub-tree
 */
private List<SqlStageExecution> createStages(ExchangeLocationsConsumer parent, AtomicInteger nextStageId, LocationFactory locationFactory, StageExecutionPlan plan, NodeScheduler nodeScheduler, RemoteTaskFactory remoteTaskFactory, Session session, int splitBatchSize, BiFunction<PartitioningHandle, Integer, NodePartitionMap> partitioningCache, NodePartitioningManager nodePartitioningManager, ExecutorService queryExecutor, ScheduledExecutorService schedulerExecutor, FailureDetector failureDetector, NodeTaskMap nodeTaskMap, ImmutableMap.Builder<StageId, StageScheduler> stageSchedulers, ImmutableMap.Builder<StageId, StageLinkage> stageLinkages, boolean isSnapshotEnabled, QuerySnapshotManager snapshotManager, Map<StageId, Integer> stageTaskCounts) {
ImmutableList.Builder<SqlStageExecution> localStages = ImmutableList.builder();
StageId stageId = new StageId(queryStateMachine.getQueryId(), nextStageId.getAndIncrement());
SqlStageExecution stageExecution = createSqlStageExecution(stageId, locationFactory.createStageLocation(stageId), plan.getFragment(), plan.getTables(), remoteTaskFactory, session, summarizeTaskInfo, nodeTaskMap, queryExecutor, failureDetector, schedulerStats, dynamicFilterService, snapshotManager);
// this stage is always the first element of the returned list; callers rely on that (see subTree.get(0) below)
localStages.add(stageExecution);
Optional<int[]> bucketToPartition;
PartitioningHandle partitioningHandle = plan.getFragment().getPartitioning();
// true only when the fragment has a feeder CTE parent id but no feeder CTE id of its own
boolean keepConsumerOnFeederNodes = !plan.getFragment().getFeederCTEId().isPresent() && plan.getFragment().getFeederCTEParentId().isPresent();
if (partitioningHandle.equals(SOURCE_DISTRIBUTION)) {
// nodes are selected dynamically based on the constraints of the splits and the system load
// a source-distributed fragment is expected to have exactly one split source
Entry<PlanNodeId, SplitSource> entry = Iterables.getOnlyElement(plan.getSplitSources().entrySet());
PlanNodeId planNodeId = entry.getKey();
SplitSource splitSource = entry.getValue();
CatalogName catalogName = splitSource.getCatalogName();
// NOTE(review): presumably a null catalog means node selection is not tied to a connector catalog - confirm
if (isInternalSystemConnector(catalogName)) {
catalogName = null;
}
NodeSelector nodeSelector = nodeScheduler.createNodeSelector(catalogName, keepConsumerOnFeederNodes, feederScheduledNodes);
if (isSnapshotEnabled) {
// When snapshot is enabled, then no task can be added after the query started running,
// otherwise assumptions about how many "input channels" may be broken.
nodeSelector.lockDownNodes();
}
SplitPlacementPolicy placementPolicy = new DynamicSplitPlacementPolicy(nodeSelector, stageExecution::getAllTasks);
// grouped execution is not allowed for a source-distributed stage
checkArgument(!plan.getFragment().getStageExecutionDescriptor().isStageGroupedExecution());
stageSchedulers.put(stageId, newSourcePartitionedSchedulerAsStageScheduler(stageExecution, planNodeId, splitSource, placementPolicy, splitBatchSize, session, heuristicIndexerManager));
// single bucket mapped to partition 0
bucketToPartition = Optional.of(new int[1]);
} else if (partitioningHandle.equals(SCALED_WRITER_DISTRIBUTION)) {
// the scaled-writer scheduler is registered at the end of this method, once the child stages exist
bucketToPartition = Optional.of(new int[1]);
} else {
Map<PlanNodeId, SplitSource> splitSources = plan.getSplitSources();
if (!splitSources.isEmpty()) {
// contains local source
List<PlanNodeId> schedulingOrder = plan.getFragment().getPartitionedSources();
CatalogName catalogName = partitioningHandle.getConnectorId().orElseThrow(IllegalStateException::new);
List<ConnectorPartitionHandle> connectorPartitionHandles;
boolean groupedExecutionForStage = plan.getFragment().getStageExecutionDescriptor().isStageGroupedExecution();
if (groupedExecutionForStage) {
connectorPartitionHandles = nodePartitioningManager.listPartitionHandles(session, partitioningHandle);
// with grouped execution the listed handles must not be just the single NOT_PARTITIONED handle
checkState(!ImmutableList.of(NOT_PARTITIONED).equals(connectorPartitionHandles));
} else {
connectorPartitionHandles = ImmutableList.of(NOT_PARTITIONED);
}
BucketNodeMap bucketNodeMap;
List<InternalNode> stageNodeList;
if (plan.getFragment().getRemoteSourceNodes().stream().allMatch(node -> node.getExchangeType() == REPLICATE)) {
// no remote source, or every remote source is a REPLICATE exchange (allMatch is true for an empty stream)
boolean dynamicLifespanSchedule = plan.getFragment().getStageExecutionDescriptor().isDynamicLifespanSchedule();
if (isSnapshotEnabled) {
NodeSelector nodeSelector = nodeScheduler.createNodeSelector(catalogName, keepConsumerOnFeederNodes, feederScheduledNodes);
int nodeCount;
if (stageTaskCounts != null) {
// Resuming: need to create same number of tasks as old stage.
// NOTE(review): this unboxes the map value; it would throw if stageTaskCounts has no entry for stageId - confirm callers always populate it
nodeCount = stageTaskCounts.get(stageId);
} else {
// Scheduling: reserve some nodes for resuming
nodeCount = calculateTaskCount(nodeSelector.selectableNodeCount());
}
stageNodeList = new ArrayList<>(nodeSelector.selectRandomNodes(nodeCount));
checkCondition(stageNodeList.size() == nodeCount, NO_NODES_AVAILABLE, "Snapshot: not enough worker nodes to resume expected number of tasks: " + nodeCount);
// Make sure bucketNodeMap uses the same node list
bucketNodeMap = nodePartitioningManager.getBucketNodeMap(session, partitioningHandle, dynamicLifespanSchedule, stageNodeList);
} else {
bucketNodeMap = nodePartitioningManager.getBucketNodeMap(session, partitioningHandle, dynamicLifespanSchedule);
stageNodeList = new ArrayList<>(nodeScheduler.createNodeSelector(catalogName, keepConsumerOnFeederNodes, feederScheduledNodes).allNodes());
}
// verify execution is consistent with planner's decision on dynamic lifespan schedule
verify(bucketNodeMap.isDynamic() == dynamicLifespanSchedule);
// randomize the node order in place; stageNodeList is a fresh ArrayList in both branches above
Collections.shuffle(stageNodeList);
bucketToPartition = Optional.empty();
} else {
// cannot use dynamic lifespan schedule
verify(!plan.getFragment().getStageExecutionDescriptor().isDynamicLifespanSchedule());
// remote source requires nodePartitionMap
NodePartitionMap nodePartitionMap = partitioningCache.apply(plan.getFragment().getPartitioning(), stageTaskCounts == null ? null : stageTaskCounts.get(stageId));
if (groupedExecutionForStage) {
// one connector partition handle is expected per bucket
checkState(connectorPartitionHandles.size() == nodePartitionMap.getBucketToPartition().length);
}
stageNodeList = nodePartitionMap.getPartitionToNode();
bucketNodeMap = nodePartitionMap.asBucketNodeMap();
bucketToPartition = Optional.of(nodePartitionMap.getBucketToPartition());
}
stageSchedulers.put(stageId, new FixedSourcePartitionedScheduler(stageExecution, splitSources, plan.getFragment().getStageExecutionDescriptor(), schedulingOrder, stageNodeList, bucketNodeMap, splitBatchSize, getConcurrentLifespansPerNode(session), nodeScheduler.createNodeSelector(catalogName, keepConsumerOnFeederNodes, feederScheduledNodes), connectorPartitionHandles, session, heuristicIndexerManager));
} else {
// all sources are remote
NodePartitionMap nodePartitionMap = partitioningCache.apply(plan.getFragment().getPartitioning(), stageTaskCounts == null ? null : stageTaskCounts.get(stageId));
List<InternalNode> partitionToNode = nodePartitionMap.getPartitionToNode();
// todo this should asynchronously wait a standard timeout period before failing
checkCondition(!partitionToNode.isEmpty(), NO_NODES_AVAILABLE, "No worker nodes available");
stageSchedulers.put(stageId, new FixedCountScheduler(stageExecution, partitionToNode));
bucketToPartition = Optional.of(nodePartitionMap.getBucketToPartition());
}
}
ImmutableSet.Builder<SqlStageExecution> childStagesBuilder = ImmutableSet.builder();
for (StageExecutionPlan subStagePlan : plan.getSubStages()) {
// create each fragment's stage only once; visitedPlanFrags is state declared outside this method
if (visitedPlanFrags.contains(subStagePlan.getFragment().getId())) {
continue;
}
visitedPlanFrags.add(subStagePlan.getFragment().getId());
// recurse with this stage as the exchange-locations consumer and with this stage's bucket-to-partition mapping
List<SqlStageExecution> subTree = createStages(stageExecution::addExchangeLocations, nextStageId, locationFactory, subStagePlan.withBucketToPartition(bucketToPartition), nodeScheduler, remoteTaskFactory, session, splitBatchSize, partitioningCache, nodePartitioningManager, queryExecutor, schedulerExecutor, failureDetector, nodeTaskMap, stageSchedulers, stageLinkages, isSnapshotEnabled, snapshotManager, stageTaskCounts);
localStages.addAll(subTree);
// the first element of a sub-tree is the direct child stage
SqlStageExecution childStage = subTree.get(0);
childStagesBuilder.add(childStage);
// find the remote source node of this fragment that reads from the child's fragment and record it as the child's parent
Optional<RemoteSourceNode> parentNode = plan.getFragment().getRemoteSourceNodes().stream().filter(x -> x.getSourceFragmentIds().contains(childStage.getFragment().getId())).findAny();
checkArgument(parentNode.isPresent(), "Couldn't find parent of a CTE node");
childStage.setParentId(parentNode.get().getId());
}
Set<SqlStageExecution> childStages = childStagesBuilder.build();
// once this stage reaches a done state (other than RESCHEDULING), cancel its direct children
stageExecution.addStateChangeListener(newState -> {
if (newState.isDone() && newState != StageState.RESCHEDULING) {
// Snapshot: For "rescheduling", tasks are already cancelled (for resume)
childStages.forEach(SqlStageExecution::cancel);
}
});
stageLinkages.put(stageId, new StageLinkage(plan.getFragment().getId(), parent, childStages));
if (partitioningHandle.equals(SCALED_WRITER_DISTRIBUTION)) {
// the scaled-writer scheduler reads task statuses of the child stages and of this (writer) stage on demand
Supplier<Collection<TaskStatus>> sourceTasksProvider = () -> childStages.stream().map(SqlStageExecution::getAllTasks).flatMap(Collection::stream).map(RemoteTask::getTaskStatus).collect(toList());
Supplier<Collection<TaskStatus>> writerTasksProvider = () -> stageExecution.getAllTasks().stream().map(RemoteTask::getTaskStatus).collect(toList());
ScaledWriterScheduler scheduler = new ScaledWriterScheduler(stageExecution, sourceTasksProvider, writerTasksProvider, nodeScheduler.createNodeSelector(null, keepConsumerOnFeederNodes, feederScheduledNodes), schedulerExecutor, getWriterMinSize(session), isSnapshotEnabled, stageTaskCounts != null ? stageTaskCounts.get(stageId) : null);
// finish the writer scheduler once every child stage is done
whenAllStages(childStages, StageState::isDone).addListener(scheduler::finish, directExecutor());
stageSchedulers.put(stageId, scheduler);
}
return localStages.build();
}
Use of io.prestosql.sql.planner.PartitioningHandle in project hetu-core by openlookeng.
From class ExchangeNode, method mergingExchange.
/**
 * Creates a GATHER exchange over {@code child} that carries the given ordering scheme.
 *
 * <p>For a REMOTE scope, if {@code child} is itself an exchange whose only source is a
 * {@code CTEScanNode}, that inner exchange is skipped and the new exchange is placed
 * directly on top of the CTE scan.
 *
 * @param id plan node id of the new exchange
 * @param scope exchange scope; LOCAL uses {@code FIXED_PASSTHROUGH_DISTRIBUTION}, anything else {@code SINGLE_DISTRIBUTION}
 * @param child the node to gather from
 * @param orderingScheme ordering the merged output must follow
 * @return the new merging exchange node
 */
public static ExchangeNode mergingExchange(PlanNodeId id, Scope scope, PlanNode child, OrderingScheme orderingScheme) {
    // CTEScanNode adds one exchange node on top of it, so when another
    // ExchangeNode is about to be stacked above, the earlier one is omitted.
    boolean wrapsCteScan = scope == REMOTE
            && child instanceof ExchangeNode
            && child.getSources().size() == 1
            && child.getSources().get(0) instanceof CTEScanNode;
    PlanNode source = child;
    if (wrapsCteScan) {
        source = child.getSources().get(0);
    }

    PartitioningHandle handle;
    if (scope == LOCAL) {
        handle = FIXED_PASSTHROUGH_DISTRIBUTION;
    } else {
        handle = SINGLE_DISTRIBUTION;
    }

    PartitioningScheme scheme = new PartitioningScheme(
            Partitioning.create(handle, ImmutableList.of()),
            source.getOutputSymbols());
    return new ExchangeNode(
            id,
            Type.GATHER,
            scope,
            scheme,
            ImmutableList.of(source),
            ImmutableList.of(source.getOutputSymbols()),
            Optional.of(orderingScheme),
            AggregationNode.AggregationType.HASH);
}
Use of io.prestosql.sql.planner.PartitioningHandle in project hetu-core by openlookeng.
From class MetadataManager, method getCommonPartitioning.
/**
 * Asks the connector for a partitioning handle common to {@code left} and {@code right}.
 *
 * <p>An empty result is returned without consulting the connector when either handle has
 * no connector id, when the connector ids differ, or when the transaction handles differ.
 *
 * @param session the current session
 * @param left the first partitioning handle
 * @param right the second partitioning handle
 * @return the common partitioning handle reported by the connector, wrapped with
 *         {@code left}'s catalog and transaction handle, or empty if there is none
 */
@Override
public Optional<PartitioningHandle> getCommonPartitioning(Session session, PartitioningHandle left, PartitioningHandle right) {
    Optional<CatalogName> leftCatalog = left.getConnectorId();
    Optional<CatalogName> rightCatalog = right.getConnectorId();

    // both sides must belong to the same, known catalog
    boolean sameCatalog = leftCatalog.isPresent()
            && rightCatalog.isPresent()
            && leftCatalog.equals(rightCatalog);
    if (!sameCatalog) {
        return Optional.empty();
    }

    // and they must share the transaction handle
    boolean sameTransaction = left.getTransactionHandle().equals(right.getTransactionHandle());
    if (!sameTransaction) {
        return Optional.empty();
    }

    CatalogName catalogName = leftCatalog.get();
    ConnectorMetadata connectorMetadata = getCatalogMetadata(session, catalogName).getMetadataFor(catalogName);
    Optional<ConnectorPartitioningHandle> common = connectorMetadata.getCommonPartitioningHandle(
            session.toConnectorSession(catalogName),
            left.getConnectorHandle(),
            right.getConnectorHandle());
    if (!common.isPresent()) {
        return Optional.empty();
    }
    return Optional.of(new PartitioningHandle(Optional.of(catalogName), left.getTransactionHandle(), common.get()));
}
Aggregations