Search in sources :

Example 1 with InternalNode

use of com.facebook.presto.metadata.InternalNode in project presto by prestodb.

the class TestRaptorSplitManager method setup.

@BeforeMethod
public void setup() throws Exception {
    FunctionAndTypeManager functionAndTypeManager = createTestFunctionAndTypeManager();
    DBI dbi = new DBI("jdbc:h2:mem:test" + System.nanoTime() + "_" + ThreadLocalRandom.current().nextInt());
    dbi.registerMapper(new TableColumn.Mapper(functionAndTypeManager));
    dummyHandle = dbi.open();
    createTablesWithRetry(dbi);
    temporary = createTempDir();
    AssignmentLimiter assignmentLimiter = new AssignmentLimiter(ImmutableSet::of, systemTicker(), new MetadataConfig());
    shardManager = new DatabaseShardManager(dbi, new DaoSupplier<>(dbi, ShardDao.class), ImmutableSet::of, assignmentLimiter, systemTicker(), new Duration(0, MINUTES));
    TestingNodeManager nodeManager = new TestingNodeManager();
    NodeSupplier nodeSupplier = nodeManager::getWorkerNodes;
    String nodeName = UUID.randomUUID().toString();
    nodeManager.addNode(new InternalNode(nodeName, new URI("http://127.0.0.1/"), NodeVersion.UNKNOWN, false));
    RaptorConnectorId connectorId = new RaptorConnectorId("raptor");
    metadata = new RaptorMetadata(connectorId.toString(), dbi, shardManager, createTestFunctionAndTypeManager());
    metadata.createTable(SESSION, TEST_TABLE, false);
    tableHandle = metadata.getTableHandle(SESSION, TEST_TABLE.getTable());
    List<ShardInfo> shards = ImmutableList.<ShardInfo>builder().add(shardInfo(UUID.randomUUID(), nodeName)).add(shardInfo(UUID.randomUUID(), nodeName)).add(shardInfo(UUID.randomUUID(), nodeName)).add(shardInfo(UUID.randomUUID(), nodeName)).build();
    tableId = ((RaptorTableHandle) tableHandle).getTableId();
    List<ColumnInfo> columns = metadata.getColumnHandles(SESSION, tableHandle).values().stream().map(RaptorColumnHandle.class::cast).map(ColumnInfo::fromHandle).collect(toList());
    long transactionId = shardManager.beginTransaction();
    shardManager.commitShards(transactionId, tableId, columns, shards, Optional.empty(), 0);
    raptorSplitManager = new RaptorSplitManager(connectorId, nodeSupplier, shardManager, false);
}
Also used : RaptorColumnHandle(com.facebook.presto.raptor.RaptorColumnHandle) RaptorMetadata(com.facebook.presto.raptor.RaptorMetadata) DBI(org.skife.jdbi.v2.DBI) Duration(io.airlift.units.Duration) DaoSupplier(com.facebook.presto.raptor.util.DaoSupplier) URI(java.net.URI) FunctionAndTypeManager(com.facebook.presto.metadata.FunctionAndTypeManager) FunctionAndTypeManager.createTestFunctionAndTypeManager(com.facebook.presto.metadata.FunctionAndTypeManager.createTestFunctionAndTypeManager) ImmutableSet(com.google.common.collect.ImmutableSet) TestingNodeManager(com.facebook.presto.testing.TestingNodeManager) InternalNode(com.facebook.presto.metadata.InternalNode) RaptorConnectorId(com.facebook.presto.raptor.RaptorConnectorId) NodeSupplier(com.facebook.presto.raptor.NodeSupplier) RaptorSplitManager(com.facebook.presto.raptor.RaptorSplitManager) BeforeMethod(org.testng.annotations.BeforeMethod)

Example 2 with InternalNode

use of com.facebook.presto.metadata.InternalNode in project presto by prestodb.

the class TestPartialResultQueryTaskTracker method testPartialResultQueryTaskTracker.

@Test
public void testPartialResultQueryTaskTracker() throws Exception {
    PartialResultQueryTaskTracker tracker = new PartialResultQueryTaskTracker(partialResultQueryManager, 0.50, 2.0, warningCollector);
    InternalNode node1 = new InternalNode(UUID.randomUUID().toString(), URI.create("https://192.0.2.8"), new NodeVersion("1"), false, false);
    InternalNode node2 = new InternalNode(UUID.randomUUID().toString(), URI.create("https://192.0.2.9"), new NodeVersion("1"), false, false);
    TaskId taskId1 = new TaskId("test1", 1, 0, 1);
    TaskId taskId2 = new TaskId("test2", 2, 0, 1);
    RemoteTask task1 = taskFactory.createTableScanTask(taskId1, node1, ImmutableList.of(), new NodeTaskMap.NodeStatsTracker(delta -> {
    }, delta -> {
    }, (age, delta) -> {
    }));
    RemoteTask task2 = taskFactory.createTableScanTask(taskId2, node2, ImmutableList.of(), new NodeTaskMap.NodeStatsTracker(delta -> {
    }, delta -> {
    }, (age, delta) -> {
    }));
    tracker.trackTask(task1);
    tracker.trackTask(task2);
    // Assert that completion ratio is 0.0 since the tasks did not complete yet
    assertEquals(0.0, tracker.getTaskCompletionRatio());
    tracker.completeTaskScheduling();
    tracker.recordTaskFinish(task1.getTaskInfo());
    // Assert that completion ratio is 0.5 since we have set that task1 finished in above line
    assertEquals(0.5, tracker.getTaskCompletionRatio());
    // Assert that the query is added to query manager, queue size = 1 since the query reached minCompletion ratio of 0.5 and is eligible for partial results
    assertEquals(1, partialResultQueryManager.getQueueSize());
    // Sleep for 2 seconds so that we give enough time for query manager to cancel tasks and complete the query with partial results
    Thread.sleep(2000);
    assertEquals(0, partialResultQueryManager.getQueueSize());
    // Assert that partial result warning is set correctly
    assertEquals(1, warningCollector.getWarnings().size());
    PrestoWarning prestoWarning = warningCollector.getWarnings().get(0);
    // Assert that warning code is set to PARTIAL_RESULT_WARNING
    assertEquals(PARTIAL_RESULT_WARNING.toWarningCode(), prestoWarning.getWarningCode());
    // Assert that completion percent of 50.00 is specified correctly in the warning message
    assertEquals("Partial results are returned. Only 50.00 percent of the data is read.", prestoWarning.getMessage());
}
Also used : NodeVersion(com.facebook.presto.client.NodeVersion) WarningCollector(com.facebook.presto.spi.WarningCollector) AfterClass(org.testng.annotations.AfterClass) NodeTaskMap(com.facebook.presto.execution.NodeTaskMap) PARTIAL_RESULT_WARNING(com.facebook.presto.spi.StandardWarningCode.PARTIAL_RESULT_WARNING) Assert.assertEquals(org.testng.Assert.assertEquals) Test(org.testng.annotations.Test) NodeVersion(com.facebook.presto.client.NodeVersion) UUID(java.util.UUID) NORMAL(com.facebook.presto.execution.warnings.WarningHandlingLevel.NORMAL) MockRemoteTaskFactory(com.facebook.presto.execution.MockRemoteTaskFactory) InternalNode(com.facebook.presto.metadata.InternalNode) Threads.daemonThreadsNamed(com.facebook.airlift.concurrent.Threads.daemonThreadsNamed) RemoteTask(com.facebook.presto.execution.RemoteTask) WarningCollectorConfig(com.facebook.presto.execution.warnings.WarningCollectorConfig) ImmutableList(com.google.common.collect.ImmutableList) Executors.newCachedThreadPool(java.util.concurrent.Executors.newCachedThreadPool) Executors.newScheduledThreadPool(java.util.concurrent.Executors.newScheduledThreadPool) TaskId(com.facebook.presto.execution.TaskId) ScheduledExecutorService(java.util.concurrent.ScheduledExecutorService) PrestoWarning(com.facebook.presto.spi.PrestoWarning) DefaultWarningCollector(com.facebook.presto.execution.warnings.DefaultWarningCollector) URI(java.net.URI) ExecutorService(java.util.concurrent.ExecutorService) PartialResultQueryManager(com.facebook.presto.execution.PartialResultQueryManager) TaskId(com.facebook.presto.execution.TaskId) NodeTaskMap(com.facebook.presto.execution.NodeTaskMap) PrestoWarning(com.facebook.presto.spi.PrestoWarning) RemoteTask(com.facebook.presto.execution.RemoteTask) InternalNode(com.facebook.presto.metadata.InternalNode) Test(org.testng.annotations.Test)

Example 3 with InternalNode

use of com.facebook.presto.metadata.InternalNode in project presto by prestodb.

the class TestSourcePartitionedScheduler method testBalancedSplitAssignment.

@Test
public void testBalancedSplitAssignment() {
    // use private node manager so we can add a node later
    InMemoryNodeManager nodeManager = new InMemoryNodeManager();
    nodeManager.addNode(CONNECTOR_ID, new InternalNode("other1", URI.create("http://127.0.0.1:11"), NodeVersion.UNKNOWN, false), new InternalNode("other2", URI.create("http://127.0.0.1:12"), NodeVersion.UNKNOWN, false), new InternalNode("other3", URI.create("http://127.0.0.1:13"), NodeVersion.UNKNOWN, false));
    NodeTaskMap nodeTaskMap = new NodeTaskMap(finalizerService);
    // Schedule 15 splits - there are 3 nodes, each node should get 5 splits
    SubPlan firstPlan = createPlan();
    SqlStageExecution firstStage = createSqlStageExecution(firstPlan, nodeTaskMap);
    StageScheduler firstScheduler = getSourcePartitionedScheduler(createFixedSplitSource(15, TestingSplit::createRemoteSplit), firstStage, nodeManager, nodeTaskMap, 200);
    ScheduleResult scheduleResult = firstScheduler.schedule();
    assertEffectivelyFinished(scheduleResult, firstScheduler);
    assertTrue(scheduleResult.getBlocked().isDone());
    assertEquals(scheduleResult.getNewTasks().size(), 3);
    assertEquals(firstStage.getAllTasks().size(), 3);
    for (RemoteTask remoteTask : firstStage.getAllTasks()) {
        PartitionedSplitsInfo splitsInfo = remoteTask.getPartitionedSplitsInfo();
        assertEquals(splitsInfo.getCount(), 5);
    }
    // Add new node
    InternalNode additionalNode = new InternalNode("other4", URI.create("http://127.0.0.1:14"), NodeVersion.UNKNOWN, false);
    nodeManager.addNode(CONNECTOR_ID, additionalNode);
    // Schedule 5 splits in another query. Since the new node does not have any splits, all 5 splits are assigned to the new node
    SubPlan secondPlan = createPlan();
    SqlStageExecution secondStage = createSqlStageExecution(secondPlan, nodeTaskMap);
    StageScheduler secondScheduler = getSourcePartitionedScheduler(createFixedSplitSource(5, TestingSplit::createRemoteSplit), secondStage, nodeManager, nodeTaskMap, 200);
    scheduleResult = secondScheduler.schedule();
    assertEffectivelyFinished(scheduleResult, secondScheduler);
    assertTrue(scheduleResult.getBlocked().isDone());
    assertEquals(scheduleResult.getNewTasks().size(), 1);
    assertEquals(secondStage.getAllTasks().size(), 1);
    RemoteTask task = secondStage.getAllTasks().get(0);
    assertEquals(task.getPartitionedSplitsInfo().getCount(), 5);
    firstStage.abort();
    secondStage.abort();
}
Also used : NodeTaskMap(com.facebook.presto.execution.NodeTaskMap) PartitionedSplitsInfo(com.facebook.presto.execution.PartitionedSplitsInfo) MockRemoteTask(com.facebook.presto.execution.MockRemoteTaskFactory.MockRemoteTask) RemoteTask(com.facebook.presto.execution.RemoteTask) InternalNode(com.facebook.presto.metadata.InternalNode) SubPlan(com.facebook.presto.sql.planner.SubPlan) SqlStageExecution(com.facebook.presto.execution.SqlStageExecution) InMemoryNodeManager(com.facebook.presto.metadata.InMemoryNodeManager) SourcePartitionedScheduler.newSourcePartitionedSchedulerAsStageScheduler(com.facebook.presto.execution.scheduler.SourcePartitionedScheduler.newSourcePartitionedSchedulerAsStageScheduler) Test(org.testng.annotations.Test)

Example 4 with InternalNode

use of com.facebook.presto.metadata.InternalNode in project presto by prestodb.

the class SourcePartitionedScheduler method schedule.

@Override
public synchronized ScheduleResult schedule() {
    dropListenersFromWhenFinishedOrNewLifespansAdded();
    int overallSplitAssignmentCount = 0;
    ImmutableSet.Builder<RemoteTask> overallNewTasks = ImmutableSet.builder();
    List<ListenableFuture<?>> overallBlockedFutures = new ArrayList<>();
    boolean anyBlockedOnPlacements = false;
    boolean anyBlockedOnNextSplitBatch = false;
    boolean anyNotBlocked = false;
    for (Entry<Lifespan, ScheduleGroup> entry : scheduleGroups.entrySet()) {
        Lifespan lifespan = entry.getKey();
        ScheduleGroup scheduleGroup = entry.getValue();
        if (scheduleGroup.state == ScheduleGroupState.NO_MORE_SPLITS || scheduleGroup.state == ScheduleGroupState.DONE) {
            verify(scheduleGroup.nextSplitBatchFuture == null);
        } else if (scheduleGroup.pendingSplits.isEmpty()) {
            // try to get the next batch
            if (scheduleGroup.nextSplitBatchFuture == null) {
                scheduleGroup.nextSplitBatchFuture = splitSource.getNextBatch(scheduleGroup.partitionHandle, lifespan, splitBatchSize);
                long start = System.nanoTime();
                addSuccessCallback(scheduleGroup.nextSplitBatchFuture, () -> stage.recordGetSplitTime(start));
            }
            if (scheduleGroup.nextSplitBatchFuture.isDone()) {
                SplitBatch nextSplits = getFutureValue(scheduleGroup.nextSplitBatchFuture);
                scheduleGroup.nextSplitBatchFuture = null;
                scheduleGroup.pendingSplits = new HashSet<>(nextSplits.getSplits());
                if (nextSplits.isLastBatch()) {
                    if (scheduleGroup.state == ScheduleGroupState.INITIALIZED && scheduleGroup.pendingSplits.isEmpty()) {
                        // Add an empty split in case no splits have been produced for the source.
                        // For source operators, they never take input, but they may produce output.
                        // This is well handled by Presto execution engine.
                        // However, there are certain non-source operators that may produce output without any input,
                        // for example, 1) an AggregationOperator, 2) a HashAggregationOperator where one of the grouping sets is ().
                        // Scheduling an empty split kicks off necessary driver instantiation to make this work.
                        scheduleGroup.pendingSplits.add(new Split(splitSource.getConnectorId(), splitSource.getTransactionHandle(), new EmptySplit(splitSource.getConnectorId()), lifespan, NON_CACHEABLE));
                    }
                    scheduleGroup.state = ScheduleGroupState.NO_MORE_SPLITS;
                }
            } else {
                overallBlockedFutures.add(scheduleGroup.nextSplitBatchFuture);
                anyBlockedOnNextSplitBatch = true;
                continue;
            }
        }
        Multimap<InternalNode, Split> splitAssignment = ImmutableMultimap.of();
        if (!scheduleGroup.pendingSplits.isEmpty()) {
            if (!scheduleGroup.placementFuture.isDone()) {
                anyBlockedOnPlacements = true;
                continue;
            }
            if (scheduleGroup.state == ScheduleGroupState.INITIALIZED) {
                scheduleGroup.state = ScheduleGroupState.SPLITS_ADDED;
            }
            if (state == State.INITIALIZED) {
                state = State.SPLITS_ADDED;
            }
            // calculate placements for splits
            SplitPlacementResult splitPlacementResult = splitPlacementPolicy.computeAssignments(scheduleGroup.pendingSplits);
            splitAssignment = splitPlacementResult.getAssignments();
            // remove splits with successful placements
            // AbstractSet.removeAll performs terribly here.
            splitAssignment.values().forEach(scheduleGroup.pendingSplits::remove);
            overallSplitAssignmentCount += splitAssignment.size();
            // if not completed placed, mark scheduleGroup as blocked on placement
            if (!scheduleGroup.pendingSplits.isEmpty()) {
                scheduleGroup.placementFuture = splitPlacementResult.getBlocked();
                overallBlockedFutures.add(scheduleGroup.placementFuture);
                anyBlockedOnPlacements = true;
            }
        }
        // if no new splits will be assigned, update state and attach completion event
        Multimap<InternalNode, Lifespan> noMoreSplitsNotification = ImmutableMultimap.of();
        if (scheduleGroup.pendingSplits.isEmpty() && scheduleGroup.state == ScheduleGroupState.NO_MORE_SPLITS) {
            scheduleGroup.state = ScheduleGroupState.DONE;
            if (!lifespan.isTaskWide()) {
                InternalNode node = ((BucketedSplitPlacementPolicy) splitPlacementPolicy).getNodeForBucket(lifespan.getId());
                noMoreSplitsNotification = ImmutableMultimap.of(node, lifespan);
            }
        }
        // assign the splits with successful placements
        overallNewTasks.addAll(assignSplits(splitAssignment, noMoreSplitsNotification));
        // As a result, to avoid busy loops caused by 1, we check pendingSplits.isEmpty() instead of placementFuture.isDone() here.
        if (scheduleGroup.nextSplitBatchFuture == null && scheduleGroup.pendingSplits.isEmpty() && scheduleGroup.state != ScheduleGroupState.DONE) {
            anyNotBlocked = true;
        }
    }
    // (by calling `notifyAllLifespansFinishedExecution`)
    if ((state == State.NO_MORE_SPLITS || state == State.FINISHED) || (!groupedExecution && lifespanAdded && scheduleGroups.isEmpty() && splitSource.isFinished())) {
        switch(state) {
            case INITIALIZED:
                // But this shouldn't be possible. See usage of EmptySplit in this method.
                throw new IllegalStateException("At least 1 split should have been scheduled for this plan node");
            case SPLITS_ADDED:
                state = State.NO_MORE_SPLITS;
                splitSource.close();
            // fall through
            case NO_MORE_SPLITS:
                state = State.FINISHED;
                whenFinishedOrNewLifespanAdded.set(null);
            // fall through
            case FINISHED:
                return ScheduleResult.nonBlocked(true, overallNewTasks.build(), overallSplitAssignmentCount);
            default:
                throw new IllegalStateException("Unknown state");
        }
    }
    if (anyNotBlocked) {
        return ScheduleResult.nonBlocked(false, overallNewTasks.build(), overallSplitAssignmentCount);
    }
    if (anyBlockedOnPlacements) {
        // In a broadcast join, output buffers of the tasks in build source stage have to
        // hold onto all data produced before probe side task scheduling finishes,
        // even if the data is acknowledged by all known consumers. This is because
        // new consumers may be added until the probe side task scheduling finishes.
        // 
        // As a result, the following line is necessary to prevent deadlock
        // due to neither build nor probe can make any progress.
        // The build side blocks due to a full output buffer.
        // In the meantime the probe side split cannot be consumed since
        // builder side hash table construction has not finished.
        // 
        // TODO: When SourcePartitionedScheduler is used as a SourceScheduler, it shouldn't need to worry about
        // task scheduling and creation -- these are done by the StageScheduler.
        overallNewTasks.addAll(finalizeTaskCreationIfNecessary());
    }
    ScheduleResult.BlockedReason blockedReason;
    if (anyBlockedOnNextSplitBatch) {
        blockedReason = anyBlockedOnPlacements ? MIXED_SPLIT_QUEUES_FULL_AND_WAITING_FOR_SOURCE : WAITING_FOR_SOURCE;
    } else {
        blockedReason = anyBlockedOnPlacements ? SPLIT_QUEUES_FULL : NO_ACTIVE_DRIVER_GROUP;
    }
    overallBlockedFutures.add(whenFinishedOrNewLifespanAdded);
    return ScheduleResult.blocked(false, overallNewTasks.build(), nonCancellationPropagating(whenAnyComplete(overallBlockedFutures)), blockedReason, overallSplitAssignmentCount);
}
Also used : ArrayList(java.util.ArrayList) EmptySplit(com.facebook.presto.split.EmptySplit) RemoteTask(com.facebook.presto.execution.RemoteTask) SplitBatch(com.facebook.presto.split.SplitSource.SplitBatch) ImmutableSet.toImmutableSet(com.google.common.collect.ImmutableSet.toImmutableSet) ImmutableSet(com.google.common.collect.ImmutableSet) ListenableFuture(com.google.common.util.concurrent.ListenableFuture) InternalNode(com.facebook.presto.metadata.InternalNode) EmptySplit(com.facebook.presto.split.EmptySplit) Split(com.facebook.presto.metadata.Split) Lifespan(com.facebook.presto.execution.Lifespan) HashSet(java.util.HashSet) BucketedSplitPlacementPolicy(com.facebook.presto.execution.scheduler.FixedSourcePartitionedScheduler.BucketedSplitPlacementPolicy)

Example 5 with InternalNode

use of com.facebook.presto.metadata.InternalNode in project presto by prestodb.

the class SectionExecutionFactory method createStageScheduler.

private StageScheduler createStageScheduler(SplitSourceFactory splitSourceFactory, Session session, StreamingSubPlan plan, Function<PartitioningHandle, NodePartitionMap> partitioningCache, Optional<SqlStageExecution> parentStageExecution, StageId stageId, SqlStageExecution stageExecution, PartitioningHandle partitioningHandle, TableWriteInfo tableWriteInfo, Set<SqlStageExecution> childStageExecutions) {
    Map<PlanNodeId, SplitSource> splitSources = splitSourceFactory.createSplitSources(plan.getFragment(), session, tableWriteInfo);
    int maxTasksPerStage = getMaxTasksPerStage(session);
    if (partitioningHandle.equals(SOURCE_DISTRIBUTION)) {
        // nodes are selected dynamically based on the constraints of the splits and the system load
        Map.Entry<PlanNodeId, SplitSource> entry = getOnlyElement(splitSources.entrySet());
        PlanNodeId planNodeId = entry.getKey();
        SplitSource splitSource = entry.getValue();
        ConnectorId connectorId = splitSource.getConnectorId();
        if (isInternalSystemConnector(connectorId)) {
            connectorId = null;
        }
        NodeSelector nodeSelector = nodeScheduler.createNodeSelector(session, connectorId, maxTasksPerStage);
        SplitPlacementPolicy placementPolicy = new DynamicSplitPlacementPolicy(nodeSelector, stageExecution::getAllTasks);
        checkArgument(!plan.getFragment().getStageExecutionDescriptor().isStageGroupedExecution());
        return newSourcePartitionedSchedulerAsStageScheduler(stageExecution, planNodeId, splitSource, placementPolicy, splitBatchSize);
    } else if (partitioningHandle.equals(SCALED_WRITER_DISTRIBUTION)) {
        Supplier<Collection<TaskStatus>> sourceTasksProvider = () -> childStageExecutions.stream().map(SqlStageExecution::getAllTasks).flatMap(Collection::stream).map(RemoteTask::getTaskStatus).collect(toList());
        Supplier<Collection<TaskStatus>> writerTasksProvider = () -> stageExecution.getAllTasks().stream().map(RemoteTask::getTaskStatus).collect(toList());
        ScaledWriterScheduler scheduler = new ScaledWriterScheduler(stageExecution, sourceTasksProvider, writerTasksProvider, nodeScheduler.createNodeSelector(session, null), scheduledExecutor, getWriterMinSize(session), isOptimizedScaleWriterProducerBuffer(session));
        whenAllStages(childStageExecutions, StageExecutionState::isDone).addListener(scheduler::finish, directExecutor());
        return scheduler;
    } else {
        if (!splitSources.isEmpty()) {
            // contains local source
            List<PlanNodeId> schedulingOrder = plan.getFragment().getTableScanSchedulingOrder();
            ConnectorId connectorId = partitioningHandle.getConnectorId().orElseThrow(IllegalStateException::new);
            List<ConnectorPartitionHandle> connectorPartitionHandles;
            boolean groupedExecutionForStage = plan.getFragment().getStageExecutionDescriptor().isStageGroupedExecution();
            if (groupedExecutionForStage) {
                connectorPartitionHandles = nodePartitioningManager.listPartitionHandles(session, partitioningHandle);
                checkState(!ImmutableList.of(NOT_PARTITIONED).equals(connectorPartitionHandles));
            } else {
                connectorPartitionHandles = ImmutableList.of(NOT_PARTITIONED);
            }
            BucketNodeMap bucketNodeMap;
            List<InternalNode> stageNodeList;
            if (plan.getFragment().getRemoteSourceNodes().stream().allMatch(node -> node.getExchangeType() == REPLICATE)) {
                // no non-replicated remote source
                boolean dynamicLifespanSchedule = plan.getFragment().getStageExecutionDescriptor().isDynamicLifespanSchedule();
                bucketNodeMap = nodePartitioningManager.getBucketNodeMap(session, partitioningHandle, dynamicLifespanSchedule);
                // verify execution is consistent with planner's decision on dynamic lifespan schedule
                verify(bucketNodeMap.isDynamic() == dynamicLifespanSchedule);
                if (bucketNodeMap.hasInitialMap()) {
                    stageNodeList = bucketNodeMap.getBucketToNode().get().stream().distinct().collect(toImmutableList());
                } else {
                    stageNodeList = new ArrayList<>(nodeScheduler.createNodeSelector(session, connectorId).selectRandomNodes(maxTasksPerStage));
                }
            } else {
                // cannot use dynamic lifespan schedule
                verify(!plan.getFragment().getStageExecutionDescriptor().isDynamicLifespanSchedule());
                // remote source requires nodePartitionMap
                NodePartitionMap nodePartitionMap = partitioningCache.apply(plan.getFragment().getPartitioning());
                if (groupedExecutionForStage) {
                    checkState(connectorPartitionHandles.size() == nodePartitionMap.getBucketToPartition().length);
                }
                stageNodeList = nodePartitionMap.getPartitionToNode();
                bucketNodeMap = nodePartitionMap.asBucketNodeMap();
            }
            FixedSourcePartitionedScheduler fixedSourcePartitionedScheduler = new FixedSourcePartitionedScheduler(stageExecution, splitSources, plan.getFragment().getStageExecutionDescriptor(), schedulingOrder, stageNodeList, bucketNodeMap, splitBatchSize, getConcurrentLifespansPerNode(session), nodeScheduler.createNodeSelector(session, connectorId), connectorPartitionHandles);
            if (plan.getFragment().getStageExecutionDescriptor().isRecoverableGroupedExecution()) {
                stageExecution.registerStageTaskRecoveryCallback(taskId -> {
                    checkArgument(taskId.getStageExecutionId().getStageId().equals(stageId), "The task did not execute this stage");
                    checkArgument(parentStageExecution.isPresent(), "Parent stage execution must exist");
                    checkArgument(parentStageExecution.get().getAllTasks().size() == 1, "Parent stage should only have one task for recoverable grouped execution");
                    parentStageExecution.get().removeRemoteSourceIfSingleTaskStage(taskId);
                    fixedSourcePartitionedScheduler.recover(taskId);
                });
            }
            return fixedSourcePartitionedScheduler;
        } else {
            // all sources are remote
            NodePartitionMap nodePartitionMap = partitioningCache.apply(plan.getFragment().getPartitioning());
            List<InternalNode> partitionToNode = nodePartitionMap.getPartitionToNode();
            // todo this should asynchronously wait a standard timeout period before failing
            checkCondition(!partitionToNode.isEmpty(), NO_NODES_AVAILABLE, "No worker nodes available");
            return new FixedCountScheduler(stageExecution, partitionToNode);
        }
    }
}
Also used : NodeTaskMap(com.facebook.presto.execution.NodeTaskMap) TaskStatus(com.facebook.presto.execution.TaskStatus) ForScheduler(com.facebook.presto.operator.ForScheduler) RemoteSourceNode(com.facebook.presto.sql.planner.plan.RemoteSourceNode) REPLICATE(com.facebook.presto.sql.planner.plan.ExchangeNode.Type.REPLICATE) SplitSourceFactory(com.facebook.presto.sql.planner.SplitSourceFactory) SettableFuture(com.google.common.util.concurrent.SettableFuture) SqlStageExecution(com.facebook.presto.execution.SqlStageExecution) NOT_PARTITIONED(com.facebook.presto.spi.connector.NotPartitionedPartitionHandle.NOT_PARTITIONED) SqlStageExecution.createSqlStageExecution(com.facebook.presto.execution.SqlStageExecution.createSqlStageExecution) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) Map(java.util.Map) SystemSessionProperties.getConcurrentLifespansPerNode(com.facebook.presto.SystemSessionProperties.getConcurrentLifespansPerNode) SystemSessionProperties.isOptimizedScaleWriterProducerBuffer(com.facebook.presto.SystemSessionProperties.isOptimizedScaleWriterProducerBuffer) QueryManagerConfig(com.facebook.presto.execution.QueryManagerConfig) Collectors.toSet(java.util.stream.Collectors.toSet) SplitSource(com.facebook.presto.split.SplitSource) RemoteTaskFactory(com.facebook.presto.execution.RemoteTaskFactory) ImmutableSet(com.google.common.collect.ImmutableSet) Predicate(java.util.function.Predicate) SystemSessionProperties.getWriterMinSize(com.facebook.presto.SystemSessionProperties.getWriterMinSize) Collection(java.util.Collection) TableWriteInfo.createTableWriteInfo(com.facebook.presto.execution.scheduler.TableWriteInfo.createTableWriteInfo) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) Set(java.util.Set) NO_NODES_AVAILABLE(com.facebook.presto.spi.StandardErrorCode.NO_NODES_AVAILABLE) Iterables.getLast(com.google.common.collect.Iterables.getLast) NodeSelector(com.facebook.presto.execution.scheduler.nodeSelection.NodeSelector) SOURCE_DISTRIBUTION(com.facebook.presto.sql.planner.SystemPartitioningHandle.SOURCE_DISTRIBUTION) SourcePartitionedScheduler.newSourcePartitionedSchedulerAsStageScheduler(com.facebook.presto.execution.scheduler.SourcePartitionedScheduler.newSourcePartitionedSchedulerAsStageScheduler) Preconditions.checkState(com.google.common.base.Preconditions.checkState) MoreExecutors.directExecutor(com.google.common.util.concurrent.MoreExecutors.directExecutor) List(java.util.List) Optional(java.util.Optional) StageExecutionId(com.facebook.presto.execution.StageExecutionId) ConnectorId(com.facebook.presto.spi.ConnectorId) ConnectorId.isInternalSystemConnector(com.facebook.presto.spi.ConnectorId.isInternalSystemConnector) PlanNodeId(com.facebook.presto.spi.plan.PlanNodeId) StageId(com.facebook.presto.execution.StageId) OutputBuffers(com.facebook.presto.execution.buffer.OutputBuffers) ConnectorPartitionHandle(com.facebook.presto.spi.connector.ConnectorPartitionHandle) NodePartitionMap(com.facebook.presto.sql.planner.NodePartitionMap) ListenableFuture(com.google.common.util.concurrent.ListenableFuture) HashMap(java.util.HashMap) Function(java.util.function.Function) Supplier(java.util.function.Supplier) ArrayList(java.util.ArrayList) Inject(javax.inject.Inject) ImmutableList(com.google.common.collect.ImmutableList) Verify.verify(com.google.common.base.Verify.verify) Objects.requireNonNull(java.util.Objects.requireNonNull) ScheduledExecutorService(java.util.concurrent.ScheduledExecutorService) PlanFragmentId(com.facebook.presto.sql.planner.plan.PlanFragmentId) SystemSessionProperties.getMaxTasksPerStage(com.facebook.presto.SystemSessionProperties.getMaxTasksPerStage) StageExecutionState(com.facebook.presto.execution.StageExecutionState) ExecutorService(java.util.concurrent.ExecutorService) Failures.checkCondition(com.facebook.presto.util.Failures.checkCondition) NodePartitioningManager(com.facebook.presto.sql.planner.NodePartitioningManager) Session(com.facebook.presto.Session) Sets.newConcurrentHashSet(com.google.common.collect.Sets.newConcurrentHashSet) SCALED_WRITER_DISTRIBUTION(com.facebook.presto.sql.planner.SystemPartitioningHandle.SCALED_WRITER_DISTRIBUTION) Iterables.getOnlyElement(com.google.common.collect.Iterables.getOnlyElement) PlanNodeSearcher(com.facebook.presto.sql.planner.optimizations.PlanNodeSearcher) InternalNode(com.facebook.presto.metadata.InternalNode) PlanNode(com.facebook.presto.spi.plan.PlanNode) Collectors.toList(java.util.stream.Collectors.toList) RemoteTask(com.facebook.presto.execution.RemoteTask) FailureDetector(com.facebook.presto.failureDetector.FailureDetector) TableScanNode(com.facebook.presto.spi.plan.TableScanNode) PartitioningHandle(com.facebook.presto.sql.planner.PartitioningHandle) ForQueryExecution(com.facebook.presto.execution.ForQueryExecution) Metadata(com.facebook.presto.metadata.Metadata) NodePartitionMap(com.facebook.presto.sql.planner.NodePartitionMap) ArrayList(java.util.ArrayList) RemoteTask(com.facebook.presto.execution.RemoteTask) TaskStatus(com.facebook.presto.execution.TaskStatus) SqlStageExecution(com.facebook.presto.execution.SqlStageExecution) SqlStageExecution.createSqlStageExecution(com.facebook.presto.execution.SqlStageExecution.createSqlStageExecution) PlanNodeId(com.facebook.presto.spi.plan.PlanNodeId) StageExecutionState(com.facebook.presto.execution.StageExecutionState) Supplier(java.util.function.Supplier) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) List(java.util.List) ArrayList(java.util.ArrayList) ImmutableList(com.google.common.collect.ImmutableList) Collectors.toList(java.util.stream.Collectors.toList) NodeSelector(com.facebook.presto.execution.scheduler.nodeSelection.NodeSelector) SplitSource(com.facebook.presto.split.SplitSource) NodeTaskMap(com.facebook.presto.execution.NodeTaskMap) Map(java.util.Map) NodePartitionMap(com.facebook.presto.sql.planner.NodePartitionMap) HashMap(java.util.HashMap) ConnectorId(com.facebook.presto.spi.ConnectorId)

Aggregations

InternalNode (com.facebook.presto.metadata.InternalNode)74 Split (com.facebook.presto.metadata.Split)34 Test (org.testng.annotations.Test)34 ConnectorSplit (com.facebook.presto.spi.ConnectorSplit)25 HashSet (java.util.HashSet)24 ImmutableList (com.google.common.collect.ImmutableList)17 InMemoryNodeManager (com.facebook.presto.metadata.InMemoryNodeManager)14 SplitPlacementResult (com.facebook.presto.execution.scheduler.SplitPlacementResult)13 NodeSelectionStats (com.facebook.presto.execution.scheduler.nodeSelection.NodeSelectionStats)12 NodeSelector (com.facebook.presto.execution.scheduler.nodeSelection.NodeSelector)12 ImmutableSet (com.google.common.collect.ImmutableSet)12 SimpleTtlNodeSelectorConfig (com.facebook.presto.execution.scheduler.nodeSelection.SimpleTtlNodeSelectorConfig)11 ConnectorId (com.facebook.presto.spi.ConnectorId)11 TestingTransactionHandle (com.facebook.presto.testing.TestingTransactionHandle)11 Duration (io.airlift.units.Duration)11 URI (java.net.URI)11 Map (java.util.Map)11 RemoteTask (com.facebook.presto.execution.RemoteTask)10 NodeScheduler (com.facebook.presto.execution.scheduler.NodeScheduler)10 NodeSchedulerConfig (com.facebook.presto.execution.scheduler.NodeSchedulerConfig)9