Use of com.facebook.presto.metadata.InternalNode in project presto by prestodb.
In class TestRaptorSplitManager, method setup().
@BeforeMethod
public void setup() throws Exception {
FunctionAndTypeManager functionAndTypeManager = createTestFunctionAndTypeManager();
DBI dbi = new DBI("jdbc:h2:mem:test" + System.nanoTime() + "_" + ThreadLocalRandom.current().nextInt());
dbi.registerMapper(new TableColumn.Mapper(functionAndTypeManager));
dummyHandle = dbi.open();
createTablesWithRetry(dbi);
temporary = createTempDir();
AssignmentLimiter assignmentLimiter = new AssignmentLimiter(ImmutableSet::of, systemTicker(), new MetadataConfig());
shardManager = new DatabaseShardManager(dbi, new DaoSupplier<>(dbi, ShardDao.class), ImmutableSet::of, assignmentLimiter, systemTicker(), new Duration(0, MINUTES));
TestingNodeManager nodeManager = new TestingNodeManager();
NodeSupplier nodeSupplier = nodeManager::getWorkerNodes;
String nodeName = UUID.randomUUID().toString();
nodeManager.addNode(new InternalNode(nodeName, new URI("http://127.0.0.1/"), NodeVersion.UNKNOWN, false));
RaptorConnectorId connectorId = new RaptorConnectorId("raptor");
metadata = new RaptorMetadata(connectorId.toString(), dbi, shardManager, createTestFunctionAndTypeManager());
metadata.createTable(SESSION, TEST_TABLE, false);
tableHandle = metadata.getTableHandle(SESSION, TEST_TABLE.getTable());
List<ShardInfo> shards = ImmutableList.<ShardInfo>builder()
        .add(shardInfo(UUID.randomUUID(), nodeName))
        .add(shardInfo(UUID.randomUUID(), nodeName))
        .add(shardInfo(UUID.randomUUID(), nodeName))
        .add(shardInfo(UUID.randomUUID(), nodeName))
        .build();
tableId = ((RaptorTableHandle) tableHandle).getTableId();
List<ColumnInfo> columns = metadata.getColumnHandles(SESSION, tableHandle).values().stream()
        .map(RaptorColumnHandle.class::cast)
        .map(ColumnInfo::fromHandle)
        .collect(toList());
long transactionId = shardManager.beginTransaction();
shardManager.commitShards(transactionId, tableId, columns, shards, Optional.empty(), 0);
raptorSplitManager = new RaptorSplitManager(connectorId, nodeSupplier, shardManager, false);
}
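For reference, here is a minimal sketch isolating the node-registration pattern from the setup above: an InternalNode is built with the four-argument constructor (node identifier, internal URI, node version, coordinator flag), registered with the TestingNodeManager, and exposed to Raptor as a NodeSupplier. The standalone helper class and the import paths are assumptions for illustration; the individual calls mirror the test.

import java.net.URI;
import java.util.UUID;
import com.facebook.presto.client.NodeVersion;
import com.facebook.presto.metadata.InternalNode;
import com.facebook.presto.raptor.NodeSupplier;
import com.facebook.presto.testing.TestingNodeManager;

public final class InternalNodeRegistrationSketch {
    private InternalNodeRegistrationSketch() {}

    // Registers a single worker and returns the NodeSupplier that the Raptor components consume.
    public static NodeSupplier registerSingleWorker(TestingNodeManager nodeManager) {
        InternalNode worker = new InternalNode(
                UUID.randomUUID().toString(),    // node identifier
                URI.create("http://127.0.0.1/"), // internal URI
                NodeVersion.UNKNOWN,             // node version
                false);                          // not a coordinator
        nodeManager.addNode(worker);
        return nodeManager::getWorkerNodes;
    }
}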
Use of com.facebook.presto.metadata.InternalNode in project presto by prestodb.
In class TestPartialResultQueryTaskTracker, method testPartialResultQueryTaskTracker().
@Test
public void testPartialResultQueryTaskTracker() throws Exception {
PartialResultQueryTaskTracker tracker = new PartialResultQueryTaskTracker(partialResultQueryManager, 0.50, 2.0, warningCollector);
InternalNode node1 = new InternalNode(UUID.randomUUID().toString(), URI.create("https://192.0.2.8"), new NodeVersion("1"), false, false);
InternalNode node2 = new InternalNode(UUID.randomUUID().toString(), URI.create("https://192.0.2.9"), new NodeVersion("1"), false, false);
TaskId taskId1 = new TaskId("test1", 1, 0, 1);
TaskId taskId2 = new TaskId("test2", 2, 0, 1);
RemoteTask task1 = taskFactory.createTableScanTask(taskId1, node1, ImmutableList.of(), new NodeTaskMap.NodeStatsTracker(delta -> {
}, delta -> {
}, (age, delta) -> {
}));
RemoteTask task2 = taskFactory.createTableScanTask(taskId2, node2, ImmutableList.of(), new NodeTaskMap.NodeStatsTracker(delta -> {
}, delta -> {
}, (age, delta) -> {
}));
tracker.trackTask(task1);
tracker.trackTask(task2);
// Assert that completion ratio is 0.0 since the tasks did not complete yet
assertEquals(0.0, tracker.getTaskCompletionRatio());
tracker.completeTaskScheduling();
tracker.recordTaskFinish(task1.getTaskInfo());
// Assert that completion ratio is 0.5 since task1 was recorded as finished on the line above
assertEquals(0.5, tracker.getTaskCompletionRatio());
// Assert that the query was added to the partial result query manager (queue size = 1) since it reached the minCompletion ratio of 0.5 and is eligible for partial results
assertEquals(1, partialResultQueryManager.getQueueSize());
// Sleep for 2 seconds to give the query manager enough time to cancel tasks and complete the query with partial results
Thread.sleep(2000);
assertEquals(0, partialResultQueryManager.getQueueSize());
// Assert that partial result warning is set correctly
assertEquals(1, warningCollector.getWarnings().size());
PrestoWarning prestoWarning = warningCollector.getWarnings().get(0);
// Assert that warning code is set to PARTIAL_RESULT_WARNING
assertEquals(PARTIAL_RESULT_WARNING.toWarningCode(), prestoWarning.getWarningCode());
// Assert that completion percent of 50.00 is specified correctly in the warning message
assertEquals("Partial results are returned. Only 50.00 percent of the data is read.", prestoWarning.getMessage());
}
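The two tests above exercise two InternalNode constructors: the four-argument form ending in a coordinator flag, and a five-argument form whose extra boolean appears, judging only from these call sites, to mark a resource manager. Below is a hedged sketch of both call shapes; the meaning of the fifth argument and the import paths are assumptions, and getNodeIdentifier() is taken from the Node SPI.

import java.net.URI;
import java.util.UUID;
import com.facebook.presto.client.NodeVersion;
import com.facebook.presto.metadata.InternalNode;

public final class InternalNodeConstructionSketch {
    public static void main(String[] args) {
        // Four-argument form, as in TestRaptorSplitManager: identifier, URI, version, coordinator flag.
        InternalNode raptorWorker = new InternalNode(
                UUID.randomUUID().toString(), URI.create("http://127.0.0.1/"), NodeVersion.UNKNOWN, false);
        // Five-argument form, as in TestPartialResultQueryTaskTracker; the final boolean is assumed
        // to distinguish resource-manager nodes, based only on these call sites.
        InternalNode plainWorker = new InternalNode(
                UUID.randomUUID().toString(), URI.create("https://192.0.2.8"), new NodeVersion("1"), false, false);
        System.out.println(raptorWorker.getNodeIdentifier());
        System.out.println(plainWorker.getNodeIdentifier());
    }
}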
Use of com.facebook.presto.metadata.InternalNode in project presto by prestodb.
In class TestSourcePartitionedScheduler, method testBalancedSplitAssignment().
@Test
public void testBalancedSplitAssignment() {
// use private node manager so we can add a node later
InMemoryNodeManager nodeManager = new InMemoryNodeManager();
nodeManager.addNode(CONNECTOR_ID,
        new InternalNode("other1", URI.create("http://127.0.0.1:11"), NodeVersion.UNKNOWN, false),
        new InternalNode("other2", URI.create("http://127.0.0.1:12"), NodeVersion.UNKNOWN, false),
        new InternalNode("other3", URI.create("http://127.0.0.1:13"), NodeVersion.UNKNOWN, false));
NodeTaskMap nodeTaskMap = new NodeTaskMap(finalizerService);
// Schedule 15 splits - there are 3 nodes, each node should get 5 splits
SubPlan firstPlan = createPlan();
SqlStageExecution firstStage = createSqlStageExecution(firstPlan, nodeTaskMap);
StageScheduler firstScheduler = getSourcePartitionedScheduler(createFixedSplitSource(15, TestingSplit::createRemoteSplit), firstStage, nodeManager, nodeTaskMap, 200);
ScheduleResult scheduleResult = firstScheduler.schedule();
assertEffectivelyFinished(scheduleResult, firstScheduler);
assertTrue(scheduleResult.getBlocked().isDone());
assertEquals(scheduleResult.getNewTasks().size(), 3);
assertEquals(firstStage.getAllTasks().size(), 3);
for (RemoteTask remoteTask : firstStage.getAllTasks()) {
PartitionedSplitsInfo splitsInfo = remoteTask.getPartitionedSplitsInfo();
assertEquals(splitsInfo.getCount(), 5);
}
// Add new node
InternalNode additionalNode = new InternalNode("other4", URI.create("http://127.0.0.1:14"), NodeVersion.UNKNOWN, false);
nodeManager.addNode(CONNECTOR_ID, additionalNode);
// Schedule 5 splits in another query. Since the new node does not have any splits, all 5 splits are assigned to the new node
SubPlan secondPlan = createPlan();
SqlStageExecution secondStage = createSqlStageExecution(secondPlan, nodeTaskMap);
StageScheduler secondScheduler = getSourcePartitionedScheduler(createFixedSplitSource(5, TestingSplit::createRemoteSplit), secondStage, nodeManager, nodeTaskMap, 200);
scheduleResult = secondScheduler.schedule();
assertEffectivelyFinished(scheduleResult, secondScheduler);
assertTrue(scheduleResult.getBlocked().isDone());
assertEquals(scheduleResult.getNewTasks().size(), 1);
assertEquals(secondStage.getAllTasks().size(), 1);
RemoteTask task = secondStage.getAllTasks().get(0);
assertEquals(task.getPartitionedSplitsInfo().getCount(), 5);
firstStage.abort();
secondStage.abort();
}
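The balance check above reads split counts back from each remote task. As a complementary illustration, the sketch below performs the same kind of check directly on a Multimap<InternalNode, Split> of the shape the split placement policy produces (see SourcePartitionedScheduler.schedule() further down). The helper class, its name, and the exact definition of "roughly balanced" (no node above the ceiling of the even share) are assumptions made for illustration.

import java.util.Collection;
import com.facebook.presto.metadata.InternalNode;
import com.facebook.presto.metadata.Split;
import com.google.common.collect.Multimap;

public final class SplitBalanceCheck {
    private SplitBalanceCheck() {}

    // Returns true if no node holds more than the ceiling of (total splits / node count).
    public static boolean isRoughlyBalanced(Multimap<InternalNode, Split> assignments) {
        if (assignments.isEmpty()) {
            return true;
        }
        int totalSplits = assignments.size();
        int nodeCount = assignments.keySet().size();
        int evenShare = (totalSplits + nodeCount - 1) / nodeCount; // ceiling division
        for (Collection<Split> splitsForNode : assignments.asMap().values()) {
            if (splitsForNode.size() > evenShare) {
                return false;
            }
        }
        return true;
    }
}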
Use of com.facebook.presto.metadata.InternalNode in project presto by prestodb.
In class SourcePartitionedScheduler, method schedule().
@Override
public synchronized ScheduleResult schedule() {
dropListenersFromWhenFinishedOrNewLifespansAdded();
int overallSplitAssignmentCount = 0;
ImmutableSet.Builder<RemoteTask> overallNewTasks = ImmutableSet.builder();
List<ListenableFuture<?>> overallBlockedFutures = new ArrayList<>();
boolean anyBlockedOnPlacements = false;
boolean anyBlockedOnNextSplitBatch = false;
boolean anyNotBlocked = false;
for (Entry<Lifespan, ScheduleGroup> entry : scheduleGroups.entrySet()) {
Lifespan lifespan = entry.getKey();
ScheduleGroup scheduleGroup = entry.getValue();
if (scheduleGroup.state == ScheduleGroupState.NO_MORE_SPLITS || scheduleGroup.state == ScheduleGroupState.DONE) {
verify(scheduleGroup.nextSplitBatchFuture == null);
} else if (scheduleGroup.pendingSplits.isEmpty()) {
// try to get the next batch
if (scheduleGroup.nextSplitBatchFuture == null) {
scheduleGroup.nextSplitBatchFuture = splitSource.getNextBatch(scheduleGroup.partitionHandle, lifespan, splitBatchSize);
long start = System.nanoTime();
addSuccessCallback(scheduleGroup.nextSplitBatchFuture, () -> stage.recordGetSplitTime(start));
}
if (scheduleGroup.nextSplitBatchFuture.isDone()) {
SplitBatch nextSplits = getFutureValue(scheduleGroup.nextSplitBatchFuture);
scheduleGroup.nextSplitBatchFuture = null;
scheduleGroup.pendingSplits = new HashSet<>(nextSplits.getSplits());
if (nextSplits.isLastBatch()) {
if (scheduleGroup.state == ScheduleGroupState.INITIALIZED && scheduleGroup.pendingSplits.isEmpty()) {
// Add an empty split in case no splits have been produced for the source.
// For source operators, they never take input, but they may produce output.
// This is well handled by Presto execution engine.
// However, there are certain non-source operators that may produce output without any input,
// for example, 1) an AggregationOperator, 2) a HashAggregationOperator where one of the grouping sets is ().
// Scheduling an empty split kicks off necessary driver instantiation to make this work.
scheduleGroup.pendingSplits.add(new Split(splitSource.getConnectorId(), splitSource.getTransactionHandle(), new EmptySplit(splitSource.getConnectorId()), lifespan, NON_CACHEABLE));
}
scheduleGroup.state = ScheduleGroupState.NO_MORE_SPLITS;
}
} else {
overallBlockedFutures.add(scheduleGroup.nextSplitBatchFuture);
anyBlockedOnNextSplitBatch = true;
continue;
}
}
Multimap<InternalNode, Split> splitAssignment = ImmutableMultimap.of();
if (!scheduleGroup.pendingSplits.isEmpty()) {
if (!scheduleGroup.placementFuture.isDone()) {
anyBlockedOnPlacements = true;
continue;
}
if (scheduleGroup.state == ScheduleGroupState.INITIALIZED) {
scheduleGroup.state = ScheduleGroupState.SPLITS_ADDED;
}
if (state == State.INITIALIZED) {
state = State.SPLITS_ADDED;
}
// calculate placements for splits
SplitPlacementResult splitPlacementResult = splitPlacementPolicy.computeAssignments(scheduleGroup.pendingSplits);
splitAssignment = splitPlacementResult.getAssignments();
// remove splits with successful placements
// AbstractSet.removeAll performs terribly here.
splitAssignment.values().forEach(scheduleGroup.pendingSplits::remove);
overallSplitAssignmentCount += splitAssignment.size();
// if not completely placed, mark scheduleGroup as blocked on placement
if (!scheduleGroup.pendingSplits.isEmpty()) {
scheduleGroup.placementFuture = splitPlacementResult.getBlocked();
overallBlockedFutures.add(scheduleGroup.placementFuture);
anyBlockedOnPlacements = true;
}
}
// if no new splits will be assigned, update state and attach completion event
Multimap<InternalNode, Lifespan> noMoreSplitsNotification = ImmutableMultimap.of();
if (scheduleGroup.pendingSplits.isEmpty() && scheduleGroup.state == ScheduleGroupState.NO_MORE_SPLITS) {
scheduleGroup.state = ScheduleGroupState.DONE;
if (!lifespan.isTaskWide()) {
InternalNode node = ((BucketedSplitPlacementPolicy) splitPlacementPolicy).getNodeForBucket(lifespan.getId());
noMoreSplitsNotification = ImmutableMultimap.of(node, lifespan);
}
}
// assign the splits with successful placements
overallNewTasks.addAll(assignSplits(splitAssignment, noMoreSplitsNotification));
// computeAssignments may return an already-completed future even when nothing could be placed (for example, when there are no tasks yet).
// As a result, to avoid busy loops, we check pendingSplits.isEmpty() instead of placementFuture.isDone() here.
if (scheduleGroup.nextSplitBatchFuture == null && scheduleGroup.pendingSplits.isEmpty() && scheduleGroup.state != ScheduleGroupState.DONE) {
anyNotBlocked = true;
}
}
// The split schedule is only considered finished once all lifespan executions are done (by calling `notifyAllLifespansFinishedExecution`)
if ((state == State.NO_MORE_SPLITS || state == State.FINISHED) || (!groupedExecution && lifespanAdded && scheduleGroups.isEmpty() && splitSource.isFinished())) {
switch(state) {
case INITIALIZED:
// No split has been scheduled so far, but this shouldn't be possible. See usage of EmptySplit in this method.
throw new IllegalStateException("At least 1 split should have been scheduled for this plan node");
case SPLITS_ADDED:
state = State.NO_MORE_SPLITS;
splitSource.close();
// fall through
case NO_MORE_SPLITS:
state = State.FINISHED;
whenFinishedOrNewLifespanAdded.set(null);
// fall through
case FINISHED:
return ScheduleResult.nonBlocked(true, overallNewTasks.build(), overallSplitAssignmentCount);
default:
throw new IllegalStateException("Unknown state");
}
}
if (anyNotBlocked) {
return ScheduleResult.nonBlocked(false, overallNewTasks.build(), overallSplitAssignmentCount);
}
if (anyBlockedOnPlacements) {
// In a broadcast join, output buffers of the tasks in build source stage have to
// hold onto all data produced before probe side task scheduling finishes,
// even if the data is acknowledged by all known consumers. This is because
// new consumers may be added until the probe side task scheduling finishes.
//
// As a result, the following line is necessary to prevent deadlock
// due to neither build nor probe can make any progress.
// The build side blocks due to a full output buffer.
// In the meantime the probe side split cannot be consumed since
// builder side hash table construction has not finished.
//
// TODO: When SourcePartitionedScheduler is used as a SourceScheduler, it shouldn't need to worry about
// task scheduling and creation -- these are done by the StageScheduler.
overallNewTasks.addAll(finalizeTaskCreationIfNecessary());
}
ScheduleResult.BlockedReason blockedReason;
if (anyBlockedOnNextSplitBatch) {
blockedReason = anyBlockedOnPlacements ? MIXED_SPLIT_QUEUES_FULL_AND_WAITING_FOR_SOURCE : WAITING_FOR_SOURCE;
} else {
blockedReason = anyBlockedOnPlacements ? SPLIT_QUEUES_FULL : NO_ACTIVE_DRIVER_GROUP;
}
overallBlockedFutures.add(whenFinishedOrNewLifespanAdded);
return ScheduleResult.blocked(false, overallNewTasks.build(), nonCancellationPropagating(whenAnyComplete(overallBlockedFutures)), blockedReason, overallSplitAssignmentCount);
}
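One detail in the method above worth unpacking is the comment that AbstractSet.removeAll performs terribly: when the argument collection is at least as large as the set, removeAll iterates the set and calls contains() on the argument, which degrades to a linear scan per element for list-like collections. Removing the placed splits one at a time, as the code does, avoids that. The following standalone sketch contrasts the two patterns using plain strings in place of splits (illustration only):

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

public final class RemoveAllSketch {
    public static void main(String[] args) {
        Set<String> pendingSplits = new HashSet<>();
        List<String> placedSplits = new ArrayList<>();
        for (int i = 0; i < 10_000; i++) {
            pendingSplits.add("split-" + i);
            placedSplits.add("split-" + i);
        }
        placedSplits.add("extra"); // make the argument collection larger than the set

        // Pattern used by the scheduler: one hash removal per placed split.
        placedSplits.forEach(pendingSplits::remove);

        // Avoided pattern: with placedSplits.size() >= pendingSplits.size(), AbstractSet.removeAll
        // would iterate the set and call placedSplits.contains(...) per element, a linear scan each time.
        // pendingSplits.removeAll(placedSplits);

        System.out.println("remaining splits: " + pendingSplits.size());
    }
}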
Use of com.facebook.presto.metadata.InternalNode in project presto by prestodb.
In class SectionExecutionFactory, method createStageScheduler().
private StageScheduler createStageScheduler(
        SplitSourceFactory splitSourceFactory,
        Session session,
        StreamingSubPlan plan,
        Function<PartitioningHandle, NodePartitionMap> partitioningCache,
        Optional<SqlStageExecution> parentStageExecution,
        StageId stageId,
        SqlStageExecution stageExecution,
        PartitioningHandle partitioningHandle,
        TableWriteInfo tableWriteInfo,
        Set<SqlStageExecution> childStageExecutions) {
Map<PlanNodeId, SplitSource> splitSources = splitSourceFactory.createSplitSources(plan.getFragment(), session, tableWriteInfo);
int maxTasksPerStage = getMaxTasksPerStage(session);
if (partitioningHandle.equals(SOURCE_DISTRIBUTION)) {
// nodes are selected dynamically based on the constraints of the splits and the system load
Map.Entry<PlanNodeId, SplitSource> entry = getOnlyElement(splitSources.entrySet());
PlanNodeId planNodeId = entry.getKey();
SplitSource splitSource = entry.getValue();
ConnectorId connectorId = splitSource.getConnectorId();
if (isInternalSystemConnector(connectorId)) {
connectorId = null;
}
NodeSelector nodeSelector = nodeScheduler.createNodeSelector(session, connectorId, maxTasksPerStage);
SplitPlacementPolicy placementPolicy = new DynamicSplitPlacementPolicy(nodeSelector, stageExecution::getAllTasks);
checkArgument(!plan.getFragment().getStageExecutionDescriptor().isStageGroupedExecution());
return newSourcePartitionedSchedulerAsStageScheduler(stageExecution, planNodeId, splitSource, placementPolicy, splitBatchSize);
} else if (partitioningHandle.equals(SCALED_WRITER_DISTRIBUTION)) {
Supplier<Collection<TaskStatus>> sourceTasksProvider = () -> childStageExecutions.stream()
        .map(SqlStageExecution::getAllTasks)
        .flatMap(Collection::stream)
        .map(RemoteTask::getTaskStatus)
        .collect(toList());
Supplier<Collection<TaskStatus>> writerTasksProvider = () -> stageExecution.getAllTasks().stream()
        .map(RemoteTask::getTaskStatus)
        .collect(toList());
ScaledWriterScheduler scheduler = new ScaledWriterScheduler(
        stageExecution,
        sourceTasksProvider,
        writerTasksProvider,
        nodeScheduler.createNodeSelector(session, null),
        scheduledExecutor,
        getWriterMinSize(session),
        isOptimizedScaleWriterProducerBuffer(session));
whenAllStages(childStageExecutions, StageExecutionState::isDone).addListener(scheduler::finish, directExecutor());
return scheduler;
} else {
if (!splitSources.isEmpty()) {
// contains local source
List<PlanNodeId> schedulingOrder = plan.getFragment().getTableScanSchedulingOrder();
ConnectorId connectorId = partitioningHandle.getConnectorId().orElseThrow(IllegalStateException::new);
List<ConnectorPartitionHandle> connectorPartitionHandles;
boolean groupedExecutionForStage = plan.getFragment().getStageExecutionDescriptor().isStageGroupedExecution();
if (groupedExecutionForStage) {
connectorPartitionHandles = nodePartitioningManager.listPartitionHandles(session, partitioningHandle);
checkState(!ImmutableList.of(NOT_PARTITIONED).equals(connectorPartitionHandles));
} else {
connectorPartitionHandles = ImmutableList.of(NOT_PARTITIONED);
}
BucketNodeMap bucketNodeMap;
List<InternalNode> stageNodeList;
if (plan.getFragment().getRemoteSourceNodes().stream().allMatch(node -> node.getExchangeType() == REPLICATE)) {
// no non-replicated remote source
boolean dynamicLifespanSchedule = plan.getFragment().getStageExecutionDescriptor().isDynamicLifespanSchedule();
bucketNodeMap = nodePartitioningManager.getBucketNodeMap(session, partitioningHandle, dynamicLifespanSchedule);
// verify execution is consistent with planner's decision on dynamic lifespan schedule
verify(bucketNodeMap.isDynamic() == dynamicLifespanSchedule);
if (bucketNodeMap.hasInitialMap()) {
stageNodeList = bucketNodeMap.getBucketToNode().get().stream().distinct().collect(toImmutableList());
} else {
stageNodeList = new ArrayList<>(nodeScheduler.createNodeSelector(session, connectorId).selectRandomNodes(maxTasksPerStage));
}
} else {
// cannot use dynamic lifespan schedule
verify(!plan.getFragment().getStageExecutionDescriptor().isDynamicLifespanSchedule());
// remote source requires nodePartitionMap
NodePartitionMap nodePartitionMap = partitioningCache.apply(plan.getFragment().getPartitioning());
if (groupedExecutionForStage) {
checkState(connectorPartitionHandles.size() == nodePartitionMap.getBucketToPartition().length);
}
stageNodeList = nodePartitionMap.getPartitionToNode();
bucketNodeMap = nodePartitionMap.asBucketNodeMap();
}
FixedSourcePartitionedScheduler fixedSourcePartitionedScheduler = new FixedSourcePartitionedScheduler(
        stageExecution,
        splitSources,
        plan.getFragment().getStageExecutionDescriptor(),
        schedulingOrder,
        stageNodeList,
        bucketNodeMap,
        splitBatchSize,
        getConcurrentLifespansPerNode(session),
        nodeScheduler.createNodeSelector(session, connectorId),
        connectorPartitionHandles);
if (plan.getFragment().getStageExecutionDescriptor().isRecoverableGroupedExecution()) {
stageExecution.registerStageTaskRecoveryCallback(taskId -> {
checkArgument(taskId.getStageExecutionId().getStageId().equals(stageId), "The task did not execute this stage");
checkArgument(parentStageExecution.isPresent(), "Parent stage execution must exist");
checkArgument(parentStageExecution.get().getAllTasks().size() == 1, "Parent stage should only have one task for recoverable grouped execution");
parentStageExecution.get().removeRemoteSourceIfSingleTaskStage(taskId);
fixedSourcePartitionedScheduler.recover(taskId);
});
}
return fixedSourcePartitionedScheduler;
} else {
// all sources are remote
NodePartitionMap nodePartitionMap = partitioningCache.apply(plan.getFragment().getPartitioning());
List<InternalNode> partitionToNode = nodePartitionMap.getPartitionToNode();
// todo this should asynchronously wait a standard timeout period before failing
checkCondition(!partitionToNode.isEmpty(), NO_NODES_AVAILABLE, "No worker nodes available");
return new FixedCountScheduler(stageExecution, partitionToNode);
}
}
}
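To make the final branch concrete: the partition-to-node list obtained from NodePartitionMap maps each partition index to the InternalNode that will host that partition's task, one task per entry, which is what FixedCountScheduler consumes. Below is a hedged sketch of inspecting such a mapping; the helper class is illustrative, and getNodeIdentifier() is assumed from the Node SPI.

import java.util.List;
import com.facebook.presto.metadata.InternalNode;

public final class PartitionToNodeSketch {
    private PartitionToNodeSketch() {}

    // Formats a partition-to-node list: the list index is the partition id,
    // and each entry is the node that runs that partition's single task.
    public static String describePartitions(List<InternalNode> partitionToNode) {
        StringBuilder description = new StringBuilder();
        for (int partition = 0; partition < partitionToNode.size(); partition++) {
            description.append("partition ").append(partition)
                    .append(" -> ").append(partitionToNode.get(partition).getNodeIdentifier())
                    .append('\n');
        }
        return description.toString();
    }
}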