Use of io.trino.metadata.Split in project trino by trinodb.
In class UniformNodeSelector, the method equateDistribution:
/**
* The method tries to make the distribution of splits more uniform. All nodes are arranged into a maxHeap and a minHeap
* based on the number of splits that are assigned to them. Splits are redistributed, one at a time, from a maxNode to a
* minNode until we have as uniform a distribution as possible.
*
* @param assignment the node-splits multimap after the first and the second stage
* @param assignmentStats required to obtain info regarding splits assigned to a node outside the current batch of assignment
* @param nodeMap to get a list of all nodes to which splits can be assigned
*/
private void equateDistribution(Multimap<InternalNode, Split> assignment, NodeAssignmentStats assignmentStats, NodeMap nodeMap, boolean includeCoordinator) {
if (assignment.isEmpty()) {
return;
}
Collection<InternalNode> allNodes = nodeMap.getNodesByHostAndPort().values().stream()
        .filter(node -> includeCoordinator || !nodeMap.getCoordinatorNodeIds().contains(node.getNodeIdentifier()))
        .collect(toImmutableList());
if (allNodes.size() < 2) {
return;
}
IndexedPriorityQueue<InternalNode> maxNodes = new IndexedPriorityQueue<>();
for (InternalNode node : assignment.keySet()) {
maxNodes.addOrUpdate(node, assignmentStats.getTotalSplitsWeight(node));
}
IndexedPriorityQueue<InternalNode> minNodes = new IndexedPriorityQueue<>();
for (InternalNode node : allNodes) {
minNodes.addOrUpdate(node, Long.MAX_VALUE - assignmentStats.getTotalSplitsWeight(node));
}
while (true) {
if (maxNodes.isEmpty()) {
return;
}
// fetch min and max node
InternalNode maxNode = maxNodes.poll();
InternalNode minNode = minNodes.poll();
// allow a small amount of non-uniformity: stop once the most and least loaded nodes
// differ by at most a few standard splits, so we don't keep moving splits for
// negligible gains (in absolute values)
if (assignmentStats.getTotalSplitsWeight(maxNode) - assignmentStats.getTotalSplitsWeight(minNode) <= SplitWeight.rawValueForStandardSplitCount(5)) {
return;
}
// move split from max to min
Split redistributed = redistributeSplit(assignment, maxNode, minNode, nodeMap.getNodesByHost());
assignmentStats.removeAssignedSplit(maxNode, redistributed.getSplitWeight());
assignmentStats.addAssignedSplit(minNode, redistributed.getSplitWeight());
// add max back into maxNodes only if it still has assignments
if (assignment.containsKey(maxNode)) {
maxNodes.addOrUpdate(maxNode, assignmentStats.getTotalSplitsWeight(maxNode));
}
// Add or update both the Priority Queues with the updated node priorities
maxNodes.addOrUpdate(minNode, assignmentStats.getTotalSplitsWeight(minNode));
minNodes.addOrUpdate(minNode, Long.MAX_VALUE - assignmentStats.getTotalSplitsWeight(minNode));
minNodes.addOrUpdate(maxNode, Long.MAX_VALUE - assignmentStats.getTotalSplitsWeight(maxNode));
}
}
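The core of the method is a two-heap rebalancing loop: repeatedly take the most and least loaded nodes and move one split between them until their total weights differ by at most a fixed tolerance. Below is a minimal, self-contained sketch of that loop, with made-up node names and weights, and plain java.util collections with linear scans standing in for Trino's IndexedPriorityQueue:

import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Map;

public class EquateDistributionSketch {
    public static void main(String[] args) {
        // node -> total assigned split weight (illustrative values)
        Map<String, Long> weights = new HashMap<>(Map.of("node-a", 100L, "node-b", 10L, "node-c", 40L));
        // stand-in for SplitWeight.rawValueForStandardSplitCount(5)
        long tolerance = 5;
        Comparator<String> byWeight = Comparator.comparing(weights::get);
        while (true) {
            String maxNode = Collections.max(weights.keySet(), byWeight);
            String minNode = Collections.min(weights.keySet(), byWeight);
            if (weights.get(maxNode) - weights.get(minNode) <= tolerance) {
                break; // as uniform as the tolerance allows
            }
            // move one unit of work from the most loaded node to the least loaded one
            weights.merge(maxNode, -1L, Long::sum);
            weights.merge(minNode, 1L, Long::sum);
        }
        System.out.println(weights); // each weight ends up close to 50
    }
}

The real method moves whole splits via redistributeSplit and, since IndexedPriorityQueue pops its highest-priority element first, emulates a min-heap by inserting each node with priority Long.MAX_VALUE minus its weight.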
Use of io.trino.metadata.Split in project trino by trinodb.
In class SourcePartitionedScheduler, the method schedule:
@Override
public synchronized ScheduleResult schedule() {
dropListenersFromWhenFinishedOrNewLifespansAdded();
int overallSplitAssignmentCount = 0;
ImmutableSet.Builder<RemoteTask> overallNewTasks = ImmutableSet.builder();
List<ListenableFuture<?>> overallBlockedFutures = new ArrayList<>();
boolean anyBlockedOnPlacements = false;
boolean anyBlockedOnNextSplitBatch = false;
boolean anyNotBlocked = false;
for (Entry<Lifespan, ScheduleGroup> entry : scheduleGroups.entrySet()) {
Lifespan lifespan = entry.getKey();
ScheduleGroup scheduleGroup = entry.getValue();
Set<Split> pendingSplits = scheduleGroup.pendingSplits;
if (scheduleGroup.state == ScheduleGroupState.NO_MORE_SPLITS || scheduleGroup.state == ScheduleGroupState.DONE) {
verify(scheduleGroup.nextSplitBatchFuture == null);
} else if (pendingSplits.isEmpty()) {
// try to get the next batch
if (scheduleGroup.nextSplitBatchFuture == null) {
scheduleGroup.nextSplitBatchFuture = splitSource.getNextBatch(scheduleGroup.partitionHandle, lifespan, splitBatchSize - pendingSplits.size());
long start = System.nanoTime();
addSuccessCallback(scheduleGroup.nextSplitBatchFuture, () -> stageExecution.recordGetSplitTime(start));
}
if (scheduleGroup.nextSplitBatchFuture.isDone()) {
SplitBatch nextSplits = getFutureValue(scheduleGroup.nextSplitBatchFuture);
scheduleGroup.nextSplitBatchFuture = null;
pendingSplits.addAll(nextSplits.getSplits());
if (nextSplits.isLastBatch()) {
if (scheduleGroup.state == ScheduleGroupState.INITIALIZED && pendingSplits.isEmpty()) {
// Add an empty split in case no splits have been produced for the source.
// For source operators, they never take input, but they may produce output.
// This is well handled by the execution engine.
// However, there are certain non-source operators that may produce output without any input,
// for example, 1) an AggregationOperator, 2) a HashAggregationOperator where one of the grouping sets is ().
// Scheduling an empty split kicks off necessary driver instantiation to make this work.
pendingSplits.add(new Split(splitSource.getCatalogName(), new EmptySplit(splitSource.getCatalogName()), lifespan));
}
scheduleGroup.state = ScheduleGroupState.NO_MORE_SPLITS;
}
} else {
overallBlockedFutures.add(scheduleGroup.nextSplitBatchFuture);
anyBlockedOnNextSplitBatch = true;
continue;
}
}
Multimap<InternalNode, Split> splitAssignment = ImmutableMultimap.of();
if (!pendingSplits.isEmpty()) {
if (!scheduleGroup.placementFuture.isDone()) {
anyBlockedOnPlacements = true;
continue;
}
if (scheduleGroup.state == ScheduleGroupState.INITIALIZED) {
scheduleGroup.state = ScheduleGroupState.SPLITS_ADDED;
}
if (state == State.INITIALIZED) {
state = State.SPLITS_ADDED;
}
// calculate placements for splits
SplitPlacementResult splitPlacementResult = splitPlacementPolicy.computeAssignments(pendingSplits);
splitAssignment = splitPlacementResult.getAssignments();
// remove splits with successful placements
// AbstractSet.removeAll performs terribly here.
splitAssignment.values().forEach(pendingSplits::remove);
overallSplitAssignmentCount += splitAssignment.size();
// if the splits were not all successfully placed, mark the scheduleGroup as blocked on placement
if (!pendingSplits.isEmpty()) {
scheduleGroup.placementFuture = splitPlacementResult.getBlocked();
overallBlockedFutures.add(scheduleGroup.placementFuture);
anyBlockedOnPlacements = true;
}
}
// if no new splits will be assigned, update state and attach completion event
Multimap<InternalNode, Lifespan> noMoreSplitsNotification = ImmutableMultimap.of();
if (pendingSplits.isEmpty() && scheduleGroup.state == ScheduleGroupState.NO_MORE_SPLITS) {
scheduleGroup.state = ScheduleGroupState.DONE;
if (!lifespan.isTaskWide()) {
InternalNode node = ((BucketedSplitPlacementPolicy) splitPlacementPolicy).getNodeForBucket(lifespan.getId());
noMoreSplitsNotification = ImmutableMultimap.of(node, lifespan);
}
}
// assign the splits with successful placements
overallNewTasks.addAll(assignSplits(splitAssignment, noMoreSplitsNotification));
// placementFuture and pendingSplits can be out of sync: placementFuture may already be done while splits remain pending.
// As a result, to avoid busy loops, we check pendingSplits.isEmpty() instead of placementFuture.isDone() here.
if (scheduleGroup.nextSplitBatchFuture == null && scheduleGroup.pendingSplits.isEmpty() && scheduleGroup.state != ScheduleGroupState.DONE) {
anyNotBlocked = true;
}
}
// The next getNextBatch invocation would observe that the source is finished; however, that invocation will fail if we tear down splitSource now.
if ((state == State.NO_MORE_SPLITS || state == State.FINISHED) || (noMoreScheduleGroups && scheduleGroups.isEmpty() && splitSource.isFinished())) {
switch(state) {
case INITIALIZED:
// No split has been scheduled so far, but this shouldn't be possible. See the usage of EmptySplit in this method.
throw new IllegalStateException("At least 1 split should have been scheduled for this plan node");
case SPLITS_ADDED:
state = State.NO_MORE_SPLITS;
Optional<List<Object>> tableExecuteSplitsInfo = splitSource.getTableExecuteSplitsInfo();
// Here we assume that a non-empty tableExecuteSplitsInfo is only possible for queries that use a single split source.
// TODO support grouped execution
tableExecuteSplitsInfo.ifPresent(info -> {
TableExecuteContext tableExecuteContext = tableExecuteContextManager.getTableExecuteContextForQuery(stageExecution.getStageId().getQueryId());
tableExecuteContext.setSplitsInfo(info);
});
splitSource.close();
// fall through
case NO_MORE_SPLITS:
state = State.FINISHED;
whenFinishedOrNewLifespanAdded.set(null);
// fall through
case FINISHED:
return new ScheduleResult(true, overallNewTasks.build(), overallSplitAssignmentCount);
}
throw new IllegalStateException("Unknown state");
}
if (anyNotBlocked) {
return new ScheduleResult(false, overallNewTasks.build(), overallSplitAssignmentCount);
}
boolean anySourceTaskBlocked = this.anySourceTaskBlocked.getAsBoolean();
if (anySourceTaskBlocked) {
// Dynamic filters might not be collected due to build side source tasks being blocked on full buffer.
// In such case probe split generation that is waiting for dynamic filters should be unblocked to prevent deadlock.
dynamicFilterService.unblockStageDynamicFilters(stageExecution.getStageId().getQueryId(), stageExecution.getAttemptId(), stageExecution.getFragment());
}
if (groupedExecution) {
overallNewTasks.addAll(finalizeTaskCreationIfNecessary());
} else if (anyBlockedOnPlacements && anySourceTaskBlocked) {
// In a broadcast join, output buffers of the tasks in build source stage have to
// hold onto all data produced before probe side task scheduling finishes,
// even if the data is acknowledged by all known consumers. This is because
// new consumers may be added until the probe side task scheduling finishes.
//
// As a result, the following line is necessary to prevent a deadlock
// where neither build nor probe can make any progress:
// the build side blocks on a full output buffer, while the probe side
// split cannot be consumed because the build-side hash table
// construction has not finished.
overallNewTasks.addAll(finalizeTaskCreationIfNecessary());
}
ScheduleResult.BlockedReason blockedReason;
if (anyBlockedOnNextSplitBatch) {
blockedReason = anyBlockedOnPlacements ? MIXED_SPLIT_QUEUES_FULL_AND_WAITING_FOR_SOURCE : WAITING_FOR_SOURCE;
} else {
blockedReason = anyBlockedOnPlacements ? SPLIT_QUEUES_FULL : NO_ACTIVE_DRIVER_GROUP;
}
overallBlockedFutures.add(whenFinishedOrNewLifespanAdded);
return new ScheduleResult(false, overallNewTasks.build(), nonCancellationPropagating(asVoid(whenAnyComplete(overallBlockedFutures))), blockedReason, overallSplitAssignmentCount);
}
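schedule() is designed to be re-invoked: a non-finished ScheduleResult carries a blocked future that completes as soon as any of the collected futures does (a split batch arrives, a split queue drains, or whenFinishedOrNewLifespanAdded fires). A hypothetical caller loop, purely as a sketch and assuming ScheduleResult exposes isFinished() and getBlocked() (real stage scheduling adds error handling, cancellation, and event-driven re-entry):

// Sketch only: drive a scheduler until it reports completion.
static void driveToCompletion(SourceScheduler scheduler) throws Exception {
    while (true) {
        ScheduleResult result = scheduler.schedule();
        if (result.isFinished()) {
            return;
        }
        // Block until the scheduler has a reason to run again, then reschedule.
        result.getBlocked().get();
    }
}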
Use of io.trino.metadata.Split in project trino by trinodb.
In class SourcePartitionedScheduler, the method assignSplits:
private Set<RemoteTask> assignSplits(Multimap<InternalNode, Split> splitAssignment, Multimap<InternalNode, Lifespan> noMoreSplitsNotification) {
ImmutableSet.Builder<RemoteTask> newTasks = ImmutableSet.builder();
ImmutableSet<InternalNode> nodes = ImmutableSet.<InternalNode>builder()
        .addAll(splitAssignment.keySet())
        .addAll(noMoreSplitsNotification.keySet())
        .build();
for (InternalNode node : nodes) {
// source partitioned tasks can only receive broadcast data; otherwise it would have a different distribution
ImmutableMultimap<PlanNodeId, Split> splits = ImmutableMultimap.<PlanNodeId, Split>builder()
        .putAll(partitionedNode, splitAssignment.get(node))
        .build();
ImmutableMultimap.Builder<PlanNodeId, Lifespan> noMoreSplits = ImmutableMultimap.builder();
if (noMoreSplitsNotification.containsKey(node)) {
noMoreSplits.putAll(partitionedNode, noMoreSplitsNotification.get(node));
}
RemoteTask task = scheduledTasks.get(node);
if (task != null) {
task.addSplits(splits);
noMoreSplits.build().forEach(task::noMoreSplits);
} else {
scheduleTask(node, splits, noMoreSplits.build()).ifPresent(newTasks::add);
}
}
return newTasks.build();
}
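The union over both key sets matters: a node may receive new splits, only a no-more-splits notification, or both, and each node must be visited exactly once. A toy standalone illustration of the same pattern with plain strings (hypothetical values, Guava collections):

import com.google.common.collect.ImmutableMultimap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Multimap;
import java.util.Set;

public class UnionDispatchSketch {
    public static void main(String[] args) {
        Multimap<String, String> splitAssignment = ImmutableMultimap.of("node1", "split-a");
        Multimap<String, String> noMoreSplits = ImmutableMultimap.of("node2", "driver-group-1");
        // node1 receives a new split, node2 only the notification; the union visits both once
        Set<String> nodes = ImmutableSet.<String>builder()
                .addAll(splitAssignment.keySet())
                .addAll(noMoreSplits.keySet())
                .build();
        nodes.forEach(System.out::println); // prints node1 then node2
    }
}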
Use of io.trino.metadata.Split in project trino by trinodb.
In class TestScanFilterAndProjectOperator, the method testPageSource:
@Test
public void testPageSource() {
Page input = SequencePageBuilder.createSequencePage(ImmutableList.of(VARCHAR), 10_000, 0);
DriverContext driverContext = newDriverContext();
List<RowExpression> projections = ImmutableList.of(field(0, VARCHAR));
Supplier<CursorProcessor> cursorProcessor = functionAssertions.getExpressionCompiler().compileCursorProcessor(Optional.empty(), projections, "key");
Supplier<PageProcessor> pageProcessor = functionAssertions.getExpressionCompiler().compilePageProcessor(Optional.empty(), projections);
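// The trailing DataSize.ofBytes(0) and 0 arguments are the minimum output page size and row
// count; with both at zero, projected pages are emitted as-is rather than buffered for merging.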
ScanFilterAndProjectOperator.ScanFilterAndProjectOperatorFactory factory = new ScanFilterAndProjectOperator.ScanFilterAndProjectOperatorFactory(
        0,
        new PlanNodeId("test"),
        new PlanNodeId("0"),
        (session, split, table, columns, dynamicFilter) -> new FixedPageSource(ImmutableList.of(input)),
        cursorProcessor,
        pageProcessor,
        TEST_TABLE_HANDLE,
        ImmutableList.of(),
        DynamicFilter.EMPTY,
        ImmutableList.of(VARCHAR),
        DataSize.ofBytes(0),
        0);
SourceOperator operator = factory.createOperator(driverContext);
operator.addSplit(new Split(new CatalogName("test"), TestingSplit.createLocalSplit(), Lifespan.taskWide()));
operator.noMoreSplits();
MaterializedResult expected = toMaterializedResult(driverContext.getSession(), ImmutableList.of(VARCHAR), ImmutableList.of(input));
MaterializedResult actual = toMaterializedResult(driverContext.getSession(), ImmutableList.of(VARCHAR), toPages(operator));
assertEquals(actual.getRowCount(), expected.getRowCount());
assertEquals(actual, expected);
}
Use of io.trino.metadata.Split in project trino by trinodb.
In class TestScanFilterAndProjectOperator, the method testPageSourceMergeOutput:
@Test
public void testPageSourceMergeOutput() {
List<Page> input = rowPagesBuilder(BIGINT)
        .addSequencePage(100, 0)
        .addSequencePage(100, 0)
        .addSequencePage(100, 0)
        .addSequencePage(100, 0)
        .build();
RowExpression filter = call(
        functionAssertions.getTestingFunctionResolution().resolveOperator(EQUAL, ImmutableList.of(BIGINT, BIGINT)),
        field(0, BIGINT),
        constant(10L, BIGINT));
List<RowExpression> projections = ImmutableList.of(field(0, BIGINT));
Supplier<CursorProcessor> cursorProcessor = functionAssertions.getExpressionCompiler().compileCursorProcessor(Optional.of(filter), projections, "key");
Supplier<PageProcessor> pageProcessor = functionAssertions.getExpressionCompiler().compilePageProcessor(Optional.of(filter), projections);
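// The trailing DataSize.of(64, KILOBYTE) and 2 arguments are the minimum output page size and
// row count: small filtered pages are buffered and merged, which is why the four matching rows
// (one per input page) come back as a single output page.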
ScanFilterAndProjectOperator.ScanFilterAndProjectOperatorFactory factory = new ScanFilterAndProjectOperator.ScanFilterAndProjectOperatorFactory(
        0,
        new PlanNodeId("test"),
        new PlanNodeId("0"),
        (session, split, table, columns, dynamicFilter) -> new FixedPageSource(input),
        cursorProcessor,
        pageProcessor,
        TEST_TABLE_HANDLE,
        ImmutableList.of(),
        DynamicFilter.EMPTY,
        ImmutableList.of(BIGINT),
        DataSize.of(64, KILOBYTE),
        2);
SourceOperator operator = factory.createOperator(newDriverContext());
operator.addSplit(new Split(new CatalogName("test"), TestingSplit.createLocalSplit(), Lifespan.taskWide()));
operator.noMoreSplits();
List<Page> actual = toPages(operator);
assertEquals(actual.size(), 1);
List<Page> expected = rowPagesBuilder(BIGINT).row(10L).row(10L).row(10L).row(10L).build();
assertPageEquals(ImmutableList.of(BIGINT), actual.get(0), expected.get(0));
}