Use of com.facebook.presto.execution.scheduler.nodeSelection.NodeSelector in project presto by prestodb.
The class SectionExecutionFactory, method createStageScheduler.
private StageScheduler createStageScheduler(
        SplitSourceFactory splitSourceFactory,
        Session session,
        StreamingSubPlan plan,
        Function<PartitioningHandle, NodePartitionMap> partitioningCache,
        Optional<SqlStageExecution> parentStageExecution,
        StageId stageId,
        SqlStageExecution stageExecution,
        PartitioningHandle partitioningHandle,
        TableWriteInfo tableWriteInfo,
        Set<SqlStageExecution> childStageExecutions)
{
    Map<PlanNodeId, SplitSource> splitSources = splitSourceFactory.createSplitSources(plan.getFragment(), session, tableWriteInfo);
    int maxTasksPerStage = getMaxTasksPerStage(session);
    if (partitioningHandle.equals(SOURCE_DISTRIBUTION)) {
        // nodes are selected dynamically based on the constraints of the splits and the system load
        Map.Entry<PlanNodeId, SplitSource> entry = getOnlyElement(splitSources.entrySet());
        PlanNodeId planNodeId = entry.getKey();
        SplitSource splitSource = entry.getValue();
        ConnectorId connectorId = splitSource.getConnectorId();
        if (isInternalSystemConnector(connectorId)) {
            connectorId = null;
        }
        NodeSelector nodeSelector = nodeScheduler.createNodeSelector(session, connectorId, maxTasksPerStage);
        SplitPlacementPolicy placementPolicy = new DynamicSplitPlacementPolicy(nodeSelector, stageExecution::getAllTasks);
        checkArgument(!plan.getFragment().getStageExecutionDescriptor().isStageGroupedExecution());
        return newSourcePartitionedSchedulerAsStageScheduler(stageExecution, planNodeId, splitSource, placementPolicy, splitBatchSize);
    }
    else if (partitioningHandle.equals(SCALED_WRITER_DISTRIBUTION)) {
        Supplier<Collection<TaskStatus>> sourceTasksProvider = () -> childStageExecutions.stream()
                .map(SqlStageExecution::getAllTasks)
                .flatMap(Collection::stream)
                .map(RemoteTask::getTaskStatus)
                .collect(toList());
        Supplier<Collection<TaskStatus>> writerTasksProvider = () -> stageExecution.getAllTasks().stream()
                .map(RemoteTask::getTaskStatus)
                .collect(toList());
        ScaledWriterScheduler scheduler = new ScaledWriterScheduler(
                stageExecution,
                sourceTasksProvider,
                writerTasksProvider,
                nodeScheduler.createNodeSelector(session, null),
                scheduledExecutor,
                getWriterMinSize(session),
                isOptimizedScaleWriterProducerBuffer(session));
        whenAllStages(childStageExecutions, StageExecutionState::isDone).addListener(scheduler::finish, directExecutor());
        return scheduler;
    }
    else {
        if (!splitSources.isEmpty()) {
            // contains local source
            List<PlanNodeId> schedulingOrder = plan.getFragment().getTableScanSchedulingOrder();
            ConnectorId connectorId = partitioningHandle.getConnectorId().orElseThrow(IllegalStateException::new);
            List<ConnectorPartitionHandle> connectorPartitionHandles;
            boolean groupedExecutionForStage = plan.getFragment().getStageExecutionDescriptor().isStageGroupedExecution();
            if (groupedExecutionForStage) {
                connectorPartitionHandles = nodePartitioningManager.listPartitionHandles(session, partitioningHandle);
                checkState(!ImmutableList.of(NOT_PARTITIONED).equals(connectorPartitionHandles));
            }
            else {
                connectorPartitionHandles = ImmutableList.of(NOT_PARTITIONED);
            }
            BucketNodeMap bucketNodeMap;
            List<InternalNode> stageNodeList;
            if (plan.getFragment().getRemoteSourceNodes().stream().allMatch(node -> node.getExchangeType() == REPLICATE)) {
                // no non-replicated remote source
                boolean dynamicLifespanSchedule = plan.getFragment().getStageExecutionDescriptor().isDynamicLifespanSchedule();
                bucketNodeMap = nodePartitioningManager.getBucketNodeMap(session, partitioningHandle, dynamicLifespanSchedule);
                // verify execution is consistent with planner's decision on dynamic lifespan schedule
                verify(bucketNodeMap.isDynamic() == dynamicLifespanSchedule);
                if (bucketNodeMap.hasInitialMap()) {
                    stageNodeList = bucketNodeMap.getBucketToNode().get().stream().distinct().collect(toImmutableList());
                }
                else {
                    stageNodeList = new ArrayList<>(nodeScheduler.createNodeSelector(session, connectorId).selectRandomNodes(maxTasksPerStage));
                }
            }
            else {
                // cannot use dynamic lifespan schedule
                verify(!plan.getFragment().getStageExecutionDescriptor().isDynamicLifespanSchedule());
                // remote source requires nodePartitionMap
                NodePartitionMap nodePartitionMap = partitioningCache.apply(plan.getFragment().getPartitioning());
                if (groupedExecutionForStage) {
                    checkState(connectorPartitionHandles.size() == nodePartitionMap.getBucketToPartition().length);
                }
                stageNodeList = nodePartitionMap.getPartitionToNode();
                bucketNodeMap = nodePartitionMap.asBucketNodeMap();
            }
            FixedSourcePartitionedScheduler fixedSourcePartitionedScheduler = new FixedSourcePartitionedScheduler(
                    stageExecution,
                    splitSources,
                    plan.getFragment().getStageExecutionDescriptor(),
                    schedulingOrder,
                    stageNodeList,
                    bucketNodeMap,
                    splitBatchSize,
                    getConcurrentLifespansPerNode(session),
                    nodeScheduler.createNodeSelector(session, connectorId),
                    connectorPartitionHandles);
            if (plan.getFragment().getStageExecutionDescriptor().isRecoverableGroupedExecution()) {
                stageExecution.registerStageTaskRecoveryCallback(taskId -> {
                    checkArgument(taskId.getStageExecutionId().getStageId().equals(stageId), "The task did not execute this stage");
                    checkArgument(parentStageExecution.isPresent(), "Parent stage execution must exist");
                    checkArgument(parentStageExecution.get().getAllTasks().size() == 1, "Parent stage should only have one task for recoverable grouped execution");
                    parentStageExecution.get().removeRemoteSourceIfSingleTaskStage(taskId);
                    fixedSourcePartitionedScheduler.recover(taskId);
                });
            }
            return fixedSourcePartitionedScheduler;
        }
        else {
            // all sources are remote
            NodePartitionMap nodePartitionMap = partitioningCache.apply(plan.getFragment().getPartitioning());
            List<InternalNode> partitionToNode = nodePartitionMap.getPartitionToNode();
            // todo this should asynchronously wait a standard timeout period before failing
            checkCondition(!partitionToNode.isEmpty(), NO_NODES_AVAILABLE, "No worker nodes available");
            return new FixedCountScheduler(stageExecution, partitionToNode);
        }
    }
}
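In the SOURCE_DISTRIBUTION branch above, the NodeSelector is consulted lazily through the DynamicSplitPlacementPolicy as splits arrive. The following is a rough sketch, not part of the Presto sources, of how the same selector can be driven directly; it reuses the variables from the method above, while pendingSplits and enqueueSplit are hypothetical stand-ins rather than Presto APIs.

NodeSelector nodeSelector = nodeScheduler.createNodeSelector(session, connectorId, maxTasksPerStage);
// ask the selector where each pending split should run, given the tasks that already exist for the stage
SplitPlacementResult placement = nodeSelector.computeAssignments(pendingSplits, ImmutableList.copyOf(stageExecution.getAllTasks()));
for (Map.Entry<InternalNode, Split> assignment : placement.getAssignments().entries()) {
    // hypothetical helper: hand the split to the task running on the chosen node
    enqueueSplit(assignment.getKey(), assignment.getValue());
}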
Use of com.facebook.presto.execution.scheduler.nodeSelection.NodeSelector in project presto by prestodb.
The class TestNodeScheduler, method testAffinityAssignmentWithModularHashing.
@Test
public void testAffinityAssignmentWithModularHashing()
{
    NodeTaskMap nodeTaskMap = new NodeTaskMap(finalizerService);
    TestingTransactionHandle transactionHandle = TestingTransactionHandle.create();
    NodeSchedulerConfig nodeSchedulerConfig = new NodeSchedulerConfig()
            .setMaxSplitsPerNode(20)
            .setIncludeCoordinator(false)
            .setMaxPendingSplitsPerTask(10);
    NodeScheduler nodeScheduler = new NodeScheduler(new LegacyNetworkTopology(), nodeManager, new NodeSelectionStats(), nodeSchedulerConfig, nodeTaskMap, new ThrowingNodeTtlFetcherManager(), new NoOpQueryManager(), new SimpleTtlNodeSelectorConfig());
    NodeSelector nodeSelector = nodeScheduler.createNodeSelector(session, CONNECTOR_ID, 3);
    Set<Split> splits = new HashSet<>();
    // Add the first split (1 % 3 = 1): 1 split is distributed to 1 node
    splits.add(new Split(CONNECTOR_ID, transactionHandle, new TestAffinitySplitRemote(1)));
    SplitPlacementResult splitPlacementResult = nodeSelector.computeAssignments(splits, ImmutableList.of());
    Set<InternalNode> internalNodes = splitPlacementResult.getAssignments().keySet();
    assertEquals(internalNodes.size(), 1);
    // Add one more split (2 % 3 = 2): the 2 splits are distributed to 2 nodes
    splits.add(new Split(CONNECTOR_ID, transactionHandle, new TestAffinitySplitRemote(2)));
    splitPlacementResult = nodeSelector.computeAssignments(splits, getRemoteTableScanTask(splitPlacementResult));
    Set<InternalNode> internalNodesSecondCall = splitPlacementResult.getAssignments().keySet();
    assertEquals(internalNodesSecondCall.size(), 2);
    // Add one more split (4 % 3 = 1) that falls into an already-used slot: the 3 splits are still distributed to 2 nodes
    splits.add(new Split(CONNECTOR_ID, transactionHandle, new TestAffinitySplitRemote(4)));
    splitPlacementResult = nodeSelector.computeAssignments(splits, getRemoteTableScanTask(splitPlacementResult));
    assertEquals(splitPlacementResult.getAssignments().keySet().size(), 2);
    // Add one more split (3 % 3 = 0) that falls into a new slot: the 4 splits are distributed to 3 nodes
    splits.add(new Split(CONNECTOR_ID, transactionHandle, new TestAffinitySplitRemote(3)));
    splitPlacementResult = nodeSelector.computeAssignments(splits, getRemoteTableScanTask(splitPlacementResult));
    assertEquals(splitPlacementResult.getAssignments().keySet().size(), 3);
}
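A minimal sketch of the modular-hashing affinity the comments above describe: a split's identifier modulo the node count picks its slot, so identifiers 1, 2, 4, 3 over 3 nodes land on slots 1, 2, 1, 0. This is an illustration only, not the actual ModularHashingNodeProvider implementation.

final class ModularHashingSketch
{
    private ModularHashingSketch() {}

    // slot (and hence preferred node index) for a split with the given affinity identifier
    static int slotFor(int splitIdentifier, int nodeCount)
    {
        return Math.floorMod(splitIdentifier, nodeCount);
    }
}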
Use of com.facebook.presto.execution.scheduler.nodeSelection.NodeSelector in project presto by prestodb.
The class TestNodeScheduler, method testTopologyAwareScheduling.
@Test(timeOut = 60 * 1000)
public void testTopologyAwareScheduling()
        throws Exception
{
    TestingTransactionHandle transactionHandle = TestingTransactionHandle.create();
    NodeTaskMap nodeTaskMap = new NodeTaskMap(finalizerService);
    InMemoryNodeManager nodeManager = new InMemoryNodeManager();
    ImmutableList.Builder<InternalNode> nodeBuilder = ImmutableList.builder();
    nodeBuilder.add(new InternalNode("node1", URI.create("http://host1.rack1:11"), NodeVersion.UNKNOWN, false));
    nodeBuilder.add(new InternalNode("node2", URI.create("http://host2.rack1:12"), NodeVersion.UNKNOWN, false));
    nodeBuilder.add(new InternalNode("node3", URI.create("http://host3.rack2:13"), NodeVersion.UNKNOWN, false));
    List<InternalNode> nodes = nodeBuilder.build();
    nodeManager.addNode(CONNECTOR_ID, nodes);
    // contents of taskMap indicate the node-task map for the current stage
    Map<InternalNode, RemoteTask> taskMap = new HashMap<>();
    NodeSchedulerConfig nodeSchedulerConfig = new NodeSchedulerConfig()
            .setMaxSplitsPerNode(25)
            .setIncludeCoordinator(false)
            .setNetworkTopology("test")
            .setMaxPendingSplitsPerTask(20);
    TestNetworkTopology topology = new TestNetworkTopology();
    NetworkLocationCache locationCache = new NetworkLocationCache(topology)
    {
        @Override
        public NetworkLocation get(HostAddress host)
        {
            // Bypass the cache for workers, since we only look them up once and they would all be unresolved otherwise
            if (host.getHostText().startsWith("host")) {
                return topology.locate(host);
            }
            else {
                return super.get(host);
            }
        }
    };
    NodeScheduler nodeScheduler = new NodeScheduler(locationCache, topology, nodeManager, new NodeSelectionStats(), nodeSchedulerConfig, nodeTaskMap, new Duration(5, SECONDS), new ThrowingNodeTtlFetcherManager(), new NoOpQueryManager(), new SimpleTtlNodeSelectorConfig());
    NodeSelector nodeSelector = nodeScheduler.createNodeSelector(session, CONNECTOR_ID);
    // Fill up the nodes with non-local data
    ImmutableSet.Builder<Split> nonRackLocalBuilder = ImmutableSet.builder();
    for (int i = 0; i < (25 + 11) * 3; i++) {
        nonRackLocalBuilder.add(new Split(CONNECTOR_ID, transactionHandle, new TestSplitRemote(HostAddress.fromParts("data.other_rack", 1))));
    }
    Set<Split> nonRackLocalSplits = nonRackLocalBuilder.build();
    Multimap<InternalNode, Split> assignments = nodeSelector.computeAssignments(nonRackLocalSplits, ImmutableList.copyOf(taskMap.values())).getAssignments();
    MockRemoteTaskFactory remoteTaskFactory = new MockRemoteTaskFactory(remoteTaskExecutor, remoteTaskScheduledExecutor);
    int task = 0;
    for (InternalNode node : assignments.keySet()) {
        TaskId taskId = new TaskId("test", 1, 0, task);
        task++;
        MockRemoteTaskFactory.MockRemoteTask remoteTask = remoteTaskFactory.createTableScanTask(taskId, node, ImmutableList.copyOf(assignments.get(node)), nodeTaskMap.createTaskStatsTracker(node, taskId));
        remoteTask.startSplits(25);
        nodeTaskMap.addTask(node, remoteTask);
        taskMap.put(node, remoteTask);
    }
    // Continue assigning to fill up part of the queue
    nonRackLocalSplits = Sets.difference(nonRackLocalSplits, new HashSet<>(assignments.values()));
    assignments = nodeSelector.computeAssignments(nonRackLocalSplits, ImmutableList.copyOf(taskMap.values())).getAssignments();
    for (InternalNode node : assignments.keySet()) {
        RemoteTask remoteTask = taskMap.get(node);
        remoteTask.addSplits(ImmutableMultimap.<PlanNodeId, Split>builder()
                .putAll(new PlanNodeId("sourceId"), assignments.get(node))
                .build());
    }
    nonRackLocalSplits = Sets.difference(nonRackLocalSplits, new HashSet<>(assignments.values()));
    // Check that 3 of the splits were rejected, since they're non-local
    assertEquals(nonRackLocalSplits.size(), 3);
    // Assign rack-local splits
    ImmutableSet.Builder<Split> rackLocalSplits = ImmutableSet.builder();
    HostAddress dataHost1 = HostAddress.fromParts("data.rack1", 1);
    HostAddress dataHost2 = HostAddress.fromParts("data.rack2", 1);
    for (int i = 0; i < 6 * 2; i++) {
        rackLocalSplits.add(new Split(CONNECTOR_ID, transactionHandle, new TestSplitRemote(dataHost1)));
    }
    for (int i = 0; i < 6; i++) {
        rackLocalSplits.add(new Split(CONNECTOR_ID, transactionHandle, new TestSplitRemote(dataHost2)));
    }
    assignments = nodeSelector.computeAssignments(rackLocalSplits.build(), ImmutableList.copyOf(taskMap.values())).getAssignments();
    for (InternalNode node : assignments.keySet()) {
        RemoteTask remoteTask = taskMap.get(node);
        remoteTask.addSplits(ImmutableMultimap.<PlanNodeId, Split>builder()
                .putAll(new PlanNodeId("sourceId"), assignments.get(node))
                .build());
    }
    Set<Split> unassigned = Sets.difference(rackLocalSplits.build(), new HashSet<>(assignments.values()));
    // Compute the assignments a second time to account for the fact that some splits may not have been assigned due to asynchronous
    // loading of the NetworkLocationCache
    boolean cacheRefreshed = false;
    while (!cacheRefreshed) {
        cacheRefreshed = true;
        if (locationCache.get(dataHost1).equals(ROOT_LOCATION)) {
            cacheRefreshed = false;
        }
        if (locationCache.get(dataHost2).equals(ROOT_LOCATION)) {
            cacheRefreshed = false;
        }
        MILLISECONDS.sleep(10);
    }
    assignments = nodeSelector.computeAssignments(unassigned, ImmutableList.copyOf(taskMap.values())).getAssignments();
    for (InternalNode node : assignments.keySet()) {
        RemoteTask remoteTask = taskMap.get(node);
        remoteTask.addSplits(ImmutableMultimap.<PlanNodeId, Split>builder()
                .putAll(new PlanNodeId("sourceId"), assignments.get(node))
                .build());
    }
    unassigned = Sets.difference(unassigned, new HashSet<>(assignments.values()));
    assertEquals(unassigned.size(), 3);
    int rack1 = 0;
    int rack2 = 0;
    for (Split split : unassigned) {
        String rack = topology.locate(split.getPreferredNodes(new ModularHashingNodeProvider(nodeSelector.getAllNodes())).get(0)).getSegments().get(0);
        switch (rack) {
            case "rack1":
                rack1++;
                break;
            case "rack2":
                rack2++;
                break;
            default:
                fail();
        }
    }
    assertEquals(rack1, 2);
    assertEquals(rack2, 1);
    // Assign local splits
    ImmutableSet.Builder<Split> localSplits = ImmutableSet.builder();
    localSplits.add(new Split(CONNECTOR_ID, transactionHandle, new TestSplitRemote(HostAddress.fromParts("host1.rack1", 1))));
    localSplits.add(new Split(CONNECTOR_ID, transactionHandle, new TestSplitRemote(HostAddress.fromParts("host2.rack1", 1))));
    localSplits.add(new Split(CONNECTOR_ID, transactionHandle, new TestSplitRemote(HostAddress.fromParts("host3.rack2", 1))));
    assignments = nodeSelector.computeAssignments(localSplits.build(), ImmutableList.copyOf(taskMap.values())).getAssignments();
    assertEquals(assignments.size(), 3);
    assertEquals(assignments.keySet().size(), 3);
}
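The assertions above read the rack name out of the first segment of a split's network location, so hostnames like "host1.rack1" or "data.rack2" must resolve to a location whose leading segment is the rack. The following is only a sketch of that kind of mapping under the stated assumption; the real TestNetworkTopology in the Presto test sources may locate hosts differently.

import java.util.Arrays;
import java.util.List;

final class RackLocationSketch
{
    private RackLocationSketch() {}

    // "host1.rack1" -> ["rack1", "host1"]; hosts without a rack suffix fall into an "unknown" segment
    static List<String> locate(String hostname)
    {
        String[] parts = hostname.split("\\.", 2);
        String host = parts[0];
        String rack = parts.length > 1 ? parts[1] : "unknown";
        return Arrays.asList(rack, host);
    }
}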
Use of com.facebook.presto.execution.scheduler.nodeSelection.NodeSelector in project presto by prestodb.
The class TestNodeScheduler, method testAffinityAssignmentWithConsistentHashing.
@Test
public void testAffinityAssignmentWithConsistentHashing()
{
    NodeTaskMap nodeTaskMap = new NodeTaskMap(finalizerService);
    TestingTransactionHandle transactionHandle = TestingTransactionHandle.create();
    NodeSchedulerConfig nodeSchedulerConfig = new NodeSchedulerConfig()
            .setNodeSelectionHashStrategy(CONSISTENT_HASHING)
            .setMinVirtualNodeCount(3)
            .setMaxSplitsPerNode(20)
            .setIncludeCoordinator(false)
            .setMaxPendingSplitsPerTask(10);
    NodeScheduler nodeScheduler = new NodeScheduler(new LegacyNetworkTopology(), nodeManager, new NodeSelectionStats(), nodeSchedulerConfig, nodeTaskMap, new ThrowingNodeTtlFetcherManager(), new NoOpQueryManager(), new SimpleTtlNodeSelectorConfig());
    NodeSelector nodeSelector = nodeScheduler.createNodeSelector(session, CONNECTOR_ID, 3);
    Set<Split> splits = new HashSet<>();
    IntStream.range(0, 10).forEach(i -> splits.add(new Split(CONNECTOR_ID, transactionHandle, new TestAffinitySplitRemote(i))));
    InternalNode node1 = new InternalNode("other1", URI.create("http://127.0.0.1:11"), NodeVersion.UNKNOWN, false);
    InternalNode node2 = new InternalNode("other2", URI.create("http://127.0.0.1:12"), NodeVersion.UNKNOWN, false);
    InternalNode node3 = new InternalNode("other3", URI.create("http://127.0.0.1:13"), NodeVersion.UNKNOWN, false);
    InternalNode node4 = new InternalNode("other4", URI.create("http://127.0.0.1:14"), NodeVersion.UNKNOWN, false);
    // In setup, nodes 1-3 are added to the node manager
    SplitPlacementResult splitPlacementResult = nodeSelector.computeAssignments(splits, ImmutableList.of());
    assertEquals(splitPlacementResult.getAssignments().keySet().size(), 3);
    // node1: splits 1, 3, 4, 5, 6, 7, 8, 9
    Collection<ConnectorSplit> node1Splits = splitPlacementResult.getAssignments().get(node1).stream().map(Split::getConnectorSplit).collect(toImmutableSet());
    // node2: split 0
    Collection<Object> node2Splits = splitPlacementResult.getAssignments().get(node2).stream().map(Split::getConnectorSplit).collect(toImmutableSet());
    // node3: split 2
    Collection<Object> node3Splits = splitPlacementResult.getAssignments().get(node3).stream().map(Split::getConnectorSplit).collect(toImmutableSet());
    // Scheduling the same splits on the same set of nodes should give the same assignment
    nodeSelector = nodeScheduler.createNodeSelector(session, CONNECTOR_ID, 3);
    splitPlacementResult = nodeSelector.computeAssignments(splits, ImmutableList.of());
    assertEquals(splitPlacementResult.getAssignments().get(node1).stream().map(Split::getConnectorSplit).collect(toImmutableSet()), node1Splits);
    assertEquals(splitPlacementResult.getAssignments().get(node2).stream().map(Split::getConnectorSplit).collect(toImmutableSet()), node2Splits);
    assertEquals(splitPlacementResult.getAssignments().get(node3).stream().map(Split::getConnectorSplit).collect(toImmutableSet()), node3Splits);
    // Add node4. It hashes in between node3 and node1, so all splits previously assigned to node1 move to node4, while the assignments to node2 and node3 do not change.
    nodeManager.addNode(CONNECTOR_ID, node4);
    nodeSelector = nodeScheduler.createNodeSelector(session, CONNECTOR_ID, 3);
    splitPlacementResult = nodeSelector.computeAssignments(splits, ImmutableList.of());
    assertEquals(splitPlacementResult.getAssignments().keySet().size(), 3);
    assertEquals(splitPlacementResult.getAssignments().get(node1), ImmutableList.of());
    assertEquals(splitPlacementResult.getAssignments().get(node2).stream().map(Split::getConnectorSplit).collect(toImmutableSet()), node2Splits);
    assertEquals(splitPlacementResult.getAssignments().get(node3).stream().map(Split::getConnectorSplit).collect(toImmutableSet()), node3Splits);
    assertEquals(splitPlacementResult.getAssignments().get(node4).stream().map(Split::getConnectorSplit).collect(toImmutableSet()), node1Splits);
}
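The final assertions rely on the defining property of consistent hashing: each node owns the arc of the hash ring that ends at its position, so inserting node4 between node3 and node1 only takes over the splits that previously wrapped around to node1, while node2 and node3 keep their assignments. The sketch below is a self-contained illustration of that property with explicit ring positions; the production selector additionally places several virtual positions per node (presumably why the config above sets setMinVirtualNodeCount(3)), and its hash function differs.

import java.util.SortedMap;
import java.util.TreeMap;

final class ConsistentHashingSketch
{
    // ring position -> node; a real implementation would insert multiple virtual positions per node
    private final TreeMap<Integer, String> ring = new TreeMap<>();

    void addNode(int position, String node)
    {
        ring.put(position, node);
    }

    String nodeFor(int splitPosition)
    {
        // the owner is the first node clockwise from the split's position; wrap around at the end of the ring
        SortedMap<Integer, String> tail = ring.tailMap(splitPosition);
        return tail.isEmpty() ? ring.firstEntry().getValue() : tail.get(tail.firstKey());
    }
}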
Use of com.facebook.presto.execution.scheduler.nodeSelection.NodeSelector in project presto by prestodb.
The class TestNodeScheduler, method testMaxTasksPerStageAddingNewNodes.
@Test
public void testMaxTasksPerStageAddingNewNodes()
{
    InMemoryNodeManager nodeManager = new InMemoryNodeManager();
    NodeTaskMap nodeTaskMap = new NodeTaskMap(finalizerService);
    TestingTransactionHandle transactionHandle = TestingTransactionHandle.create();
    NodeSchedulerConfig nodeSchedulerConfig = new NodeSchedulerConfig()
            .setMaxSplitsPerNode(20)
            .setIncludeCoordinator(false)
            .setMaxPendingSplitsPerTask(10);
    LegacyNetworkTopology networkTopology = new LegacyNetworkTopology();
    // refresh interval is 0 seconds, so newly added nodes become visible to the selector immediately
    NodeScheduler nodeScheduler = new NodeScheduler(new NetworkLocationCache(networkTopology), networkTopology, nodeManager, new NodeSelectionStats(), nodeSchedulerConfig, nodeTaskMap, Duration.valueOf("0s"), new ThrowingNodeTtlFetcherManager(), new NoOpQueryManager(), new SimpleTtlNodeSelectorConfig());
    NodeSelector nodeSelector = nodeScheduler.createNodeSelector(session, CONNECTOR_ID, 2);
    Set<Split> splits = new HashSet<>();
    splits.add(new Split(CONNECTOR_ID, transactionHandle, new TestSplitRemote()));
    splits.add(new Split(CONNECTOR_ID, transactionHandle, new TestSplitRemote()));
    splits.add(new Split(CONNECTOR_ID, transactionHandle, new TestSplitRemote()));
    nodeManager.addNode(CONNECTOR_ID, ImmutableList.of(new InternalNode("node1", URI.create("http://127.0.0.1:11"), NodeVersion.UNKNOWN, false)));
    SplitPlacementResult splitPlacementResult = nodeSelector.computeAssignments(splits, ImmutableList.of());
    Set<InternalNode> internalNodes = splitPlacementResult.getAssignments().keySet();
    assertEquals(internalNodes.size(), 1);
    nodeManager.addNode(CONNECTOR_ID, ImmutableList.of(new InternalNode("node2", URI.create("http://127.0.0.1:12"), NodeVersion.UNKNOWN, false)));
    splitPlacementResult = nodeSelector.computeAssignments(splits, getRemoteTableScanTask(splitPlacementResult));
    Set<InternalNode> internalNodesSecondCall = splitPlacementResult.getAssignments().keySet();
    assertEquals(internalNodesSecondCall.size(), 2);
    assertTrue(internalNodesSecondCall.containsAll(internalNodes));
    nodeManager.addNode(CONNECTOR_ID, ImmutableList.of(new InternalNode("node2", URI.create("http://127.0.0.1:13"), NodeVersion.UNKNOWN, false)));
    internalNodes = splitPlacementResult.getAssignments().keySet();
    assertEquals(internalNodes.size(), 2);
    assertTrue(internalNodesSecondCall.containsAll(internalNodes));
}
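The test checks that a selector created with a limit of 2 keeps using the nodes it has already chosen and never grows past that limit as new nodes are registered. The sketch below is an illustration of that behavior only, not the Presto implementation; the class and method names are hypothetical.

import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;

final class LimitedNodeSetSketch
{
    private final int maxTasksPerStage;
    private final Set<String> chosen = new LinkedHashSet<>();

    LimitedNodeSetSketch(int maxTasksPerStage)
    {
        this.maxTasksPerStage = maxTasksPerStage;
    }

    Set<String> select(List<String> activeNodes)
    {
        // keep previously chosen nodes, then top up from newly discovered ones until the cap is reached
        for (String node : activeNodes) {
            if (chosen.size() >= maxTasksPerStage) {
                break;
            }
            chosen.add(node);
        }
        return chosen;
    }
}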