Search in sources :

Example 1 with MarkerSplit

use of io.prestosql.snapshot.MarkerSplit in project hetu-core by openlookeng.

the class SqlTaskExecution method processMarkerSource.

/**
 * Takes the first pending source off {@code sources} and handles the subset of its splits
 * that {@code getReadySplits} reports as ready.
 *
 * <p>Ready marker splits with a non-zero snapshot id are broadcast through the driver runner
 * factory registered for the source's plan node; ready non-marker splits are collected and returned.
 * Splits that are not ready yet are pushed back to the front of {@code sources} as a new source.
 *
 * @param sources pending sources; the head is removed and may be replaced by a reduced source
 * @return the removed source itself when it carries no splits; {@code null} when none of its
 *         splits are ready (the source is put back unchanged); otherwise a new source holding
 *         the ready data splits
 */
private TaskSource processMarkerSource(LinkedList<TaskSource> sources) {
    TaskSource head = sources.removeFirst();

    // A source without splits does not depend on any marker/pending-split state.
    if (head.getSplits().isEmpty()) {
        return head;
    }

    List<ScheduledSplit> ready = getReadySplits(head);
    if (ready.isEmpty()) {
        // Nothing can be processed yet: restore the queue to how it was.
        sources.addFirst(head);
        return null;
    }

    boolean partiallyReady = ready.size() < head.getSplits().size();
    if (partiallyReady) {
        // Keep the not-yet-ready splits at the head of the queue, preserving their original order.
        Set<ScheduledSplit> notReady = Sets.newLinkedHashSet(head.getSplits());
        notReady.removeAll(ready);
        TaskSource leftover = new TaskSource(head.getPlanNodeId(), notReady, head.getNoMoreSplitsForLifespan(), head.isNoMoreSplits());
        sources.addFirst(leftover);
    }

    // Markers are broadcast right here; everything else is handed back to be scheduled.
    Set<ScheduledSplit> dataSplits = new HashSet<>();
    for (ScheduledSplit candidate : ready) {
        Object payload = candidate.getSplit().getConnectorSplit();
        if (!(payload instanceof MarkerSplit)) {
            dataSplits.add(candidate);
            continue;
        }
        MarkerSplit marker = (MarkerSplit) payload;
        if (marker.getSnapshotId() == 0) {
            // Markers with snapshot id 0 are dropped without being broadcast.
            continue;
        }
        driverRunnerFactoriesWithSplitLifeCycle.get(head.getPlanNodeId())
                .broadcastMarker(candidate.getSplit().getLifespan(), marker.toMarkerPage());
    }

    // The no-more-splits information is only forwarded once every split of the source was ready;
    // otherwise it travels with the leftover source that was re-queued above.
    boolean fullyProcessed = !partiallyReady;
    return new TaskSource(
            head.getPlanNodeId(),
            dataSplits,
            fullyProcessed ? head.getNoMoreSplitsForLifespan() : ImmutableSet.of(),
            fullyProcessed && head.isNoMoreSplits());
}
Also used : MarkerSplit(io.prestosql.snapshot.MarkerSplit) HashSet(java.util.HashSet)

Example 2 with MarkerSplit

use of io.prestosql.snapshot.MarkerSplit in project hetu-core by openlookeng.

the class SourcePartitionedScheduler method schedule.

/**
 * Runs one scheduling pass over every schedule group (one per lifespan).
 *
 * <p>For each group this fetches the next split batch when nothing is pending, optionally
 * filters and groups the splits, computes node placements, assigns the placed splits to tasks
 * and advances the group's state. When snapshotting is enabled, a batch that consists of a
 * single {@link MarkerSplit} is never filtered and is assigned to every node returned by
 * {@code splitPlacementPolicy.allNodes()}.
 *
 * @param maxSplitGroup passed through to {@code splitSource.groupSmallSplits}
 * @return a finished result once the scheduler reaches {@code FINISHED}; a non-blocked result
 *         when at least one group can make progress immediately; otherwise a blocked result
 *         carrying the future to wait on and the reason
 * @throws IllegalStateException if the scheduler would finish while still in {@code INITIALIZED}
 */
@Override
public synchronized ScheduleResult schedule(int maxSplitGroup) {
    dropListenersFromWhenFinishedOrNewLifespansAdded();
    int overallSplitAssignmentCount = 0;
    ImmutableSet.Builder<RemoteTask> overallNewTasks = ImmutableSet.builder();
    List<ListenableFuture<?>> overallBlockedFutures = new ArrayList<>();
    boolean anyBlockedOnPlacements = false;
    boolean anyBlockedOnNextSplitBatch = false;
    boolean anyNotBlocked = false;
    // Whether split filtering is applicable at all for this pass; individual batches may still opt out below.
    boolean applyFilter = isHeuristicIndexFilterEnabled(session) && SplitFiltering.isSplitFilterApplicable(stage);
    boolean initialMarker = false;
    for (Entry<Lifespan, ScheduleGroup> entry : scheduleGroups.entrySet()) {
        Lifespan lifespan = entry.getKey();
        ScheduleGroup scheduleGroup = entry.getValue();
        Set<Split> pendingSplits = scheduleGroup.pendingSplits;
        if (scheduleGroup.state == ScheduleGroupState.NO_MORE_SPLITS || scheduleGroup.state == ScheduleGroupState.DONE) {
            verify(scheduleGroup.nextSplitBatchFuture == null);
        } else if (pendingSplits.isEmpty()) {
            // try to get the next batch
            if (scheduleGroup.nextSplitBatchFuture == null) {
                scheduleGroup.nextSplitBatchFuture = splitSource.getNextBatch(scheduleGroup.partitionHandle, lifespan, splitBatchSize - pendingSplits.size());
                long start = System.nanoTime();
                addSuccessCallback(scheduleGroup.nextSplitBatchFuture, () -> stage.recordGetSplitTime(start));
            }
            if (scheduleGroup.nextSplitBatchFuture.isDone()) {
                SplitBatch nextSplits = getFutureValue(scheduleGroup.nextSplitBatchFuture);
                scheduleGroup.nextSplitBatchFuture = null;
                // add split filter to filter out split has no valid rows
                Pair<Optional<RowExpression>, Map<Symbol, ColumnHandle>> pair = SplitFiltering.getExpression(stage);
                // Decide per batch whether to filter. A batch holding only a MarkerSplit must not be filtered,
                // but that must not disable filtering for the batches of the remaining schedule groups in this
                // pass, so the decision is kept in a loop-local flag instead of overwriting applyFilter.
                boolean applyFilterToBatch = applyFilter;
                if (SystemSessionProperties.isSnapshotEnabled(session)) {
                    List<Split> batchSplits = nextSplits.getSplits();
                    // Don't apply filter to MarkerSplit
                    if (batchSplits.size() == 1 && batchSplits.get(0).getConnectorSplit() instanceof MarkerSplit) {
                        applyFilterToBatch = false;
                    }
                }
                List<Split> filteredSplit = applyFilterToBatch ? SplitFiltering.getFilteredSplit(pair.getFirst(), SplitFiltering.getFullyQualifiedName(stage), pair.getSecond(), nextSplits, heuristicIndexerManager) : nextSplits.getSplits();
                // In case of ORC small size files/splits are grouped
                List<Split> groupedSmallFilesList = splitSource.groupSmallSplits(filteredSplit, lifespan, maxSplitGroup);
                filteredSplit = groupedSmallFilesList;
                pendingSplits.addAll(filteredSplit);
                if (nextSplits.isLastBatch()) {
                    if (scheduleGroup.state == ScheduleGroupState.INITIALIZED && pendingSplits.isEmpty()) {
                        // Add an empty split in case no splits have been produced for the source.
                        // For source operators, they never take input, but they may produce output.
                        // This is well handled by Presto execution engine.
                        // However, there are certain non-source operators that may produce output without any input,
                        // for example, 1) an AggregationOperator, 2) a HashAggregationOperator where one of the grouping sets is ().
                        // Scheduling an empty split kicks off necessary driver instantiation to make this work.
                        pendingSplits.add(new Split(splitSource.getCatalogName(), new EmptySplit(splitSource.getCatalogName()), lifespan));
                    }
                    scheduleGroup.state = ScheduleGroupState.NO_MORE_SPLITS;
                }
            } else {
                // The batch is still being fetched: remember the future and move on to the next group.
                overallBlockedFutures.add(scheduleGroup.nextSplitBatchFuture);
                anyBlockedOnNextSplitBatch = true;
                continue;
            }
        }
        Multimap<InternalNode, Split> splitAssignment = ImmutableMultimap.of();
        if (!pendingSplits.isEmpty()) {
            if (!scheduleGroup.placementFuture.isDone()) {
                // A previous placement attempt for this group is still blocked.
                anyBlockedOnPlacements = true;
                continue;
            }
            if (scheduleGroup.state == ScheduleGroupState.INITIALIZED) {
                scheduleGroup.state = ScheduleGroupState.SPLITS_ADDED;
            }
            if (state == State.INITIALIZED) {
                state = State.SPLITS_ADDED;
            }
            // calculate placements for splits
            SplitPlacementResult splitPlacementResult;
            if (stage.isThrottledSchedule()) {
                // If asked for partial schedule incase of lesser resource, then schedule only 10% of splits.
                // 10% is calculated on initial number of splits and same is being used on subsequent schedule also.
                // But if later 10% of current pending splits more than earlier 10%, then it will schedule max of
                // these.
                // if throttledSplitsCount is more than number of pendingSplits, then it will schedule all.
                throttledSplitsCount = Math.max((int) Math.ceil(pendingSplits.size() * ALLOWED_PERCENT_LIMIT), throttledSplitsCount);
                splitPlacementResult = splitPlacementPolicy.computeAssignments(ImmutableSet.copyOf(Iterables.limit(pendingSplits, throttledSplitsCount)), this.stage);
            } else {
                splitPlacementResult = splitPlacementPolicy.computeAssignments(new HashSet<>(pendingSplits), this.stage);
            }
            splitAssignment = splitPlacementResult.getAssignments();
            if (SystemSessionProperties.isSnapshotEnabled(session)) {
                Split firstSplit = pendingSplits.iterator().next();
                if (pendingSplits.size() == 1 && firstSplit.getConnectorSplit() instanceof MarkerSplit) {
                    // We'll create a new assignment, but still need to call computeAssignments above, and cannot modify the returned assignment map directly
                    splitAssignment = HashMultimap.create(splitAssignment);
                    splitAssignment.values().remove(firstSplit);
                    // Getting all internalNodes and assigning marker splits to all of them.
                    List<InternalNode> allNodes = splitPlacementPolicy.allNodes();
                    for (InternalNode node : allNodes) {
                        splitAssignment.put(node, firstSplit);
                    }
                    MarkerSplit markerSplit = (MarkerSplit) firstSplit.getConnectorSplit();
                    // A resuming marker, or a marker with snapshot id 0, is remembered as the "initial marker";
                    // the flag makes the stage transition to scheduling-splits further down when this pass is not blocked.
                    if (markerSplit.isResuming() || markerSplit.getSnapshotId() == 0) {
                        initialMarker = true;
                    }
                } else {
                    // MarkerSplit should be in its own batch.
                    verify(pendingSplits.stream().noneMatch(split -> split.getConnectorSplit() instanceof MarkerSplit));
                }
            }
            // remove splits with successful placements
            // AbstractSet.removeAll performs terribly here.
            splitAssignment.values().forEach(pendingSplits::remove);
            overallSplitAssignmentCount += splitAssignment.size();
            // if not completed placed, mark scheduleGroup as blocked on placement
            if (!pendingSplits.isEmpty()) {
                scheduleGroup.placementFuture = splitPlacementResult.getBlocked();
                overallBlockedFutures.add(scheduleGroup.placementFuture);
                anyBlockedOnPlacements = true;
            }
        }
        // if no new splits will be assigned, update state and attach completion event
        Multimap<InternalNode, Lifespan> noMoreSplitsNotification = ImmutableMultimap.of();
        if (pendingSplits.isEmpty() && scheduleGroup.state == ScheduleGroupState.NO_MORE_SPLITS) {
            scheduleGroup.state = ScheduleGroupState.DONE;
            if (!lifespan.isTaskWide()) {
                // A non-task-wide lifespan is resolved to its node through the bucketed placement policy.
                InternalNode node = ((BucketedSplitPlacementPolicy) splitPlacementPolicy).getNodeForBucket(lifespan.getId());
                noMoreSplitsNotification = ImmutableMultimap.of(node, lifespan);
            }
        }
        // assign the splits with successful placements
        overallNewTasks.addAll(assignSplits(splitAssignment, noMoreSplitsNotification));
        // The group can make progress right away when no batch fetch is in flight, nothing is pending and it is
        // not DONE. To avoid busy loops, pendingSplits.isEmpty() is checked here instead of placementFuture.isDone().
        if (scheduleGroup.nextSplitBatchFuture == null && scheduleGroup.pendingSplits.isEmpty() && scheduleGroup.state != ScheduleGroupState.DONE) {
            anyNotBlocked = true;
        }
    }
    // The state test comes first, so splitSource.isFinished() is not consulted once the state is already
    // NO_MORE_SPLITS or FINISHED. Otherwise the split source is only torn down when no further schedule groups
    // will be added, none are left, and the source reports that it is finished; tearing it down earlier would
    // make a later getNextBatch invocation fail.
    if ((state == State.NO_MORE_SPLITS || state == State.FINISHED) || (noMoreScheduleGroups && scheduleGroups.isEmpty() && splitSource.isFinished())) {
        switch(state) {
            case INITIALIZED:
                // But this shouldn't be possible. See usage of EmptySplit in this method.
                throw new IllegalStateException("At least 1 split should have been scheduled for this plan node");
            case SPLITS_ADDED:
                state = State.NO_MORE_SPLITS;
                splitSource.close();
            // fall through
            case NO_MORE_SPLITS:
                state = State.FINISHED;
                whenFinishedOrNewLifespanAdded.set(null);
            // fall through
            case FINISHED:
                return new ScheduleResult(true, overallNewTasks.build(), overallSplitAssignmentCount);
            default:
                throw new IllegalStateException("Unknown state");
        }
    }
    if (anyNotBlocked) {
        if (initialMarker) {
            stage.transitionToSchedulingSplits();
        }
        return new ScheduleResult(false, overallNewTasks.build(), overallSplitAssignmentCount);
    }
    if (anyBlockedOnPlacements || groupedExecution) {
        // In a broadcast join, output buffers of the tasks in build source stage have to
        // hold onto all data produced before probe side task scheduling finishes,
        // even if the data is acknowledged by all known consumers. This is because
        // new consumers may be added until the probe side task scheduling finishes.
        //
        // As a result, the following line is necessary to prevent deadlock
        // due to neither build nor probe can make any progress.
        // The build side blocks due to a full output buffer.
        // In the meantime the probe side split cannot be consumed since
        // builder side hash table construction has not finished.
        overallNewTasks.addAll(finalizeTaskCreationIfNecessary());
    }
    ScheduleResult.BlockedReason blockedReason;
    if (anyBlockedOnNextSplitBatch) {
        blockedReason = anyBlockedOnPlacements ? MIXED_SPLIT_QUEUES_FULL_AND_WAITING_FOR_SOURCE : WAITING_FOR_SOURCE;
    } else {
        blockedReason = anyBlockedOnPlacements ? SPLIT_QUEUES_FULL : NO_ACTIVE_DRIVER_GROUP;
    }
    // Also wake up when the scheduler finishes or a new lifespan is added.
    overallBlockedFutures.add(whenFinishedOrNewLifespanAdded);
    return new ScheduleResult(false, overallNewTasks.build(), nonCancellationPropagating(whenAnyComplete(overallBlockedFutures)), blockedReason, overallSplitAssignmentCount);
}
Also used : SystemSessionProperties(io.prestosql.SystemSessionProperties) SettableFuture(com.google.common.util.concurrent.SettableFuture) BucketedSplitPlacementPolicy(io.prestosql.execution.scheduler.FixedSourcePartitionedScheduler.BucketedSplitPlacementPolicy) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) MIXED_SPLIT_QUEUES_FULL_AND_WAITING_FOR_SOURCE(io.prestosql.execution.scheduler.ScheduleResult.BlockedReason.MIXED_SPLIT_QUEUES_FULL_AND_WAITING_FOR_SOURCE) HashMultimap(com.google.common.collect.HashMultimap) Map(java.util.Map) EmptySplit(io.prestosql.split.EmptySplit) HeuristicIndexerManager(io.prestosql.heuristicindex.HeuristicIndexerManager) PlanNodeId(io.prestosql.spi.plan.PlanNodeId) ImmutableSet(com.google.common.collect.ImmutableSet) Set(java.util.Set) SystemSessionProperties.isHeuristicIndexFilterEnabled(io.prestosql.SystemSessionProperties.isHeuristicIndexFilterEnabled) Pair(io.prestosql.spi.heuristicindex.Pair) Preconditions.checkState(com.google.common.base.Preconditions.checkState) List(java.util.List) WAITING_FOR_SOURCE(io.prestosql.execution.scheduler.ScheduleResult.BlockedReason.WAITING_FOR_SOURCE) ConnectorPartitionHandle(io.prestosql.spi.connector.ConnectorPartitionHandle) Entry(java.util.Map.Entry) Optional(java.util.Optional) MoreFutures.whenAnyComplete(io.airlift.concurrent.MoreFutures.whenAnyComplete) NOT_PARTITIONED(io.prestosql.spi.connector.NotPartitionedPartitionHandle.NOT_PARTITIONED) Iterables(com.google.common.collect.Iterables) ListenableFuture(com.google.common.util.concurrent.ListenableFuture) HashMap(java.util.HashMap) Split(io.prestosql.metadata.Split) Multimap(com.google.common.collect.Multimap) NO_ACTIVE_DRIVER_GROUP(io.prestosql.execution.scheduler.ScheduleResult.BlockedReason.NO_ACTIVE_DRIVER_GROUP) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) ImmutableList(com.google.common.collect.ImmutableList) Verify.verify(com.google.common.base.Verify.verify) 
MarkerSplit(io.prestosql.snapshot.MarkerSplit) Objects.requireNonNull(java.util.Objects.requireNonNull) Session(io.prestosql.Session) ImmutableSet.toImmutableSet(com.google.common.collect.ImmutableSet.toImmutableSet) SplitSource(io.prestosql.split.SplitSource) ImmutableMultimap(com.google.common.collect.ImmutableMultimap) SPLIT_QUEUES_FULL(io.prestosql.execution.scheduler.ScheduleResult.BlockedReason.SPLIT_QUEUES_FULL) Lifespan(io.prestosql.execution.Lifespan) Symbol(io.prestosql.spi.plan.Symbol) SplitBatch(io.prestosql.split.SplitSource.SplitBatch) Iterator(java.util.Iterator) SplitFiltering(io.prestosql.heuristicindex.SplitFiltering) InternalNode(io.prestosql.metadata.InternalNode) MoreFutures.getFutureValue(io.airlift.concurrent.MoreFutures.getFutureValue) Futures(com.google.common.util.concurrent.Futures) Futures.nonCancellationPropagating(com.google.common.util.concurrent.Futures.nonCancellationPropagating) ColumnHandle(io.prestosql.spi.connector.ColumnHandle) RowExpression(io.prestosql.spi.relation.RowExpression) MoreFutures.addSuccessCallback(io.airlift.concurrent.MoreFutures.addSuccessCallback) SqlStageExecution(io.prestosql.execution.SqlStageExecution) RemoteTask(io.prestosql.execution.RemoteTask) Symbol(io.prestosql.spi.plan.Symbol) ArrayList(java.util.ArrayList) SplitBatch(io.prestosql.split.SplitSource.SplitBatch) ImmutableSet(com.google.common.collect.ImmutableSet) ImmutableSet.toImmutableSet(com.google.common.collect.ImmutableSet.toImmutableSet) List(java.util.List) ArrayList(java.util.ArrayList) ImmutableList(com.google.common.collect.ImmutableList) Pair(io.prestosql.spi.heuristicindex.Pair) HashSet(java.util.HashSet) ColumnHandle(io.prestosql.spi.connector.ColumnHandle) EmptySplit(io.prestosql.split.EmptySplit) RemoteTask(io.prestosql.execution.RemoteTask) RowExpression(io.prestosql.spi.relation.RowExpression) MarkerSplit(io.prestosql.snapshot.MarkerSplit) ListenableFuture(com.google.common.util.concurrent.ListenableFuture) 
InternalNode(io.prestosql.metadata.InternalNode) EmptySplit(io.prestosql.split.EmptySplit) Split(io.prestosql.metadata.Split) MarkerSplit(io.prestosql.snapshot.MarkerSplit) Lifespan(io.prestosql.execution.Lifespan) BucketedSplitPlacementPolicy(io.prestosql.execution.scheduler.FixedSourcePartitionedScheduler.BucketedSplitPlacementPolicy)

Example 3 with MarkerSplit

use of io.prestosql.snapshot.MarkerSplit in project hetu-core by openlookeng.

the class IndexSourceOperator method addSplit.

/**
 * Binds this operator to the given index split and creates the page source operator that
 * serves the lookup result.
 *
 * @param split the split to read; its connector split is cast to {@code IndexSplit}
 * @return a supplier that always yields {@link Optional#empty()}
 * @throws NullPointerException if {@code split} is null
 * @throws UnsupportedOperationException if the split carries a {@link MarkerSplit}
 * @throws IllegalStateException if a split has already been set on this operator
 */
@Override
public Supplier<Optional<UpdatablePageSource>> addSplit(Split split) {
    requireNonNull(split, "split is null");

    // TODO-cp-I38S9O: Operator currently not supported for Snapshot
    boolean isMarker = split.getConnectorSplit() instanceof MarkerSplit;
    if (isMarker) {
        throw new UnsupportedOperationException("Operator doesn't support snapshotting.");
    }
    checkState(source == null, "Index source split already set");

    // Bring the probe keys into the form the index expects, then wrap the lookup result in an operator.
    IndexSplit keySplit = (IndexSplit) split.getConnectorSplit();
    RecordSet probeKeys = probeKeyNormalizer.apply(keySplit.getKeyRecordSet());
    source = new PageSourceOperator(index.lookup(probeKeys), operatorContext);

    // Expose split details through the operator context only when the split provides any.
    Object info = split.getInfo();
    if (info != null) {
        operatorContext.setInfoSupplier(() -> new SplitOperatorInfo(info));
    }

    return () -> Optional.empty();
}
Also used : MarkerSplit(io.prestosql.snapshot.MarkerSplit) SplitOperatorInfo(io.prestosql.operator.SplitOperatorInfo) RecordSet(io.prestosql.spi.connector.RecordSet) ConnectorPageSource(io.prestosql.spi.connector.ConnectorPageSource) PageSourceOperator(io.prestosql.operator.PageSourceOperator)

Aggregations

MarkerSplit (io.prestosql.snapshot.MarkerSplit)3 HashSet (java.util.HashSet)2 Preconditions.checkArgument (com.google.common.base.Preconditions.checkArgument)1 Preconditions.checkState (com.google.common.base.Preconditions.checkState)1 Verify.verify (com.google.common.base.Verify.verify)1 HashMultimap (com.google.common.collect.HashMultimap)1 ImmutableList (com.google.common.collect.ImmutableList)1 ImmutableMultimap (com.google.common.collect.ImmutableMultimap)1 ImmutableSet (com.google.common.collect.ImmutableSet)1 ImmutableSet.toImmutableSet (com.google.common.collect.ImmutableSet.toImmutableSet)1 Iterables (com.google.common.collect.Iterables)1 Multimap (com.google.common.collect.Multimap)1 Futures (com.google.common.util.concurrent.Futures)1 Futures.nonCancellationPropagating (com.google.common.util.concurrent.Futures.nonCancellationPropagating)1 ListenableFuture (com.google.common.util.concurrent.ListenableFuture)1 SettableFuture (com.google.common.util.concurrent.SettableFuture)1 MoreFutures.addSuccessCallback (io.airlift.concurrent.MoreFutures.addSuccessCallback)1 MoreFutures.getFutureValue (io.airlift.concurrent.MoreFutures.getFutureValue)1 MoreFutures.whenAnyComplete (io.airlift.concurrent.MoreFutures.whenAnyComplete)1 Session (io.prestosql.Session)1