
Example 1 with ExecutionVertexID

Use of org.apache.flink.runtime.scheduler.strategy.ExecutionVertexID in project flink by apache.

From the class EdgeManagerBuildUtil, method connectAllToAll.

private static void connectAllToAll(
        ExecutionVertex[] taskVertices, IntermediateResult intermediateResult) {
    // All-to-all: every consumer reads every partition, so one shared
    // ConsumedPartitionGroup and one shared ConsumerVertexGroup suffice.
    List<IntermediateResultPartitionID> consumedPartitions =
            Arrays.stream(intermediateResult.getPartitions())
                    .map(IntermediateResultPartition::getPartitionId)
                    .collect(Collectors.toList());
    ConsumedPartitionGroup consumedPartitionGroup =
            createAndRegisterConsumedPartitionGroupToEdgeManager(
                    consumedPartitions, intermediateResult);
    for (ExecutionVertex ev : taskVertices) {
        ev.addConsumedPartitionGroup(consumedPartitionGroup);
    }
    List<ExecutionVertexID> consumerVertices =
            Arrays.stream(taskVertices)
                    .map(ExecutionVertex::getID)
                    .collect(Collectors.toList());
    ConsumerVertexGroup consumerVertexGroup =
            ConsumerVertexGroup.fromMultipleVertices(consumerVertices);
    for (IntermediateResultPartition partition : intermediateResult.getPartitions()) {
        partition.addConsumers(consumerVertexGroup);
    }
}
Also used : ConsumedPartitionGroup(org.apache.flink.runtime.scheduler.strategy.ConsumedPartitionGroup) ExecutionVertexID(org.apache.flink.runtime.scheduler.strategy.ExecutionVertexID) ConsumerVertexGroup(org.apache.flink.runtime.scheduler.strategy.ConsumerVertexGroup) IntermediateResultPartitionID(org.apache.flink.runtime.jobgraph.IntermediateResultPartitionID)
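
For orientation, an ExecutionVertexID is simply the pair of a JobVertexID and a subtask index. The sketch below is a minimal illustration, assuming the internal flink-runtime classes used above are on the classpath (they are internal APIs, not stable public ones) and using the one-argument fromMultipleVertices signature seen in the code; it shows how one shared ConsumerVertexGroup stands for all consumer subtasks in the all-to-all pattern.

import java.util.Arrays;
import java.util.List;

import org.apache.flink.runtime.jobgraph.JobVertexID;
import org.apache.flink.runtime.scheduler.strategy.ConsumerVertexGroup;
import org.apache.flink.runtime.scheduler.strategy.ExecutionVertexID;

public class AllToAllGroupSketch {
    public static void main(String[] args) {
        // An ExecutionVertexID identifies one parallel subtask of a job
        // vertex: the JobVertexID plus the subtask index.
        JobVertexID consumerJobVertex = new JobVertexID();
        List<ExecutionVertexID> consumers = Arrays.asList(
                new ExecutionVertexID(consumerJobVertex, 0),
                new ExecutionVertexID(consumerJobVertex, 1),
                new ExecutionVertexID(consumerJobVertex, 2));

        // All-to-all: every produced partition is read by every subtask,
        // so the same group instance is attached to each partition.
        ConsumerVertexGroup group = ConsumerVertexGroup.fromMultipleVertices(consumers);
        for (ExecutionVertexID id : group) {
            System.out.println("consumer: " + id);
        }
    }
}

Sharing one group instance instead of storing per-partition consumer lists is the point of the grouping: the all-to-all edges take O(n + m) space rather than O(n * m).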

Example 2 with ExecutionVertexID

Use of org.apache.flink.runtime.scheduler.strategy.ExecutionVertexID in project flink by apache.

From the class Execution, method updatePartitionConsumers.

private void updatePartitionConsumers(final IntermediateResultPartition partition) {
    final Optional<ConsumerVertexGroup> consumerVertexGroup = partition.getConsumerVertexGroupOptional();
    if (!consumerVertexGroup.isPresent()) {
        return;
    }
    for (ExecutionVertexID consumerVertexId : consumerVertexGroup.get()) {
        final ExecutionVertex consumerVertex = vertex.getExecutionGraphAccessor().getExecutionVertexOrThrow(consumerVertexId);
        final Execution consumer = consumerVertex.getCurrentExecutionAttempt();
        final ExecutionState consumerState = consumer.getState();
        // Consumer already running/initializing => send the update RPC now.
        // Consumer still deploying => cache the partition info; it will be
        // delivered once the consumer switches to running.
        if (consumerState == DEPLOYING || consumerState == RUNNING || consumerState == INITIALIZING) {
            final PartitionInfo partitionInfo = createPartitionInfo(partition);
            if (consumerState == DEPLOYING) {
                consumerVertex.cachePartitionInfo(partitionInfo);
            } else {
                consumer.sendUpdatePartitionInfoRpcCall(Collections.singleton(partitionInfo));
            }
        }
    }
}
Also used : ExecutionState(org.apache.flink.runtime.execution.ExecutionState) ExecutionVertexID(org.apache.flink.runtime.scheduler.strategy.ExecutionVertexID) ConsumerVertexGroup(org.apache.flink.runtime.scheduler.strategy.ConsumerVertexGroup)
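
To make the branch explicit: a consumer that is still deploying may not be able to process an update yet, so the partition info is cached on the vertex and delivered once the task switches to running, while INITIALIZING and RUNNING consumers are updated immediately. Below is a minimal, self-contained model of that decision; the State enum and dispatch helper are illustrative stand-ins, not Flink API.

// Simplified stand-in for org.apache.flink.runtime.execution.ExecutionState.
enum State { CREATED, SCHEDULED, DEPLOYING, INITIALIZING, RUNNING, FINISHED }

final class PartitionUpdateDispatch {

    // Mirrors the branch in updatePartitionConsumers above.
    static String dispatch(State consumerState) {
        switch (consumerState) {
            case DEPLOYING:
                return "cache";    // not ready for updates yet; deliver after startup
            case INITIALIZING:
            case RUNNING:
                return "send-rpc"; // task is live; update its input channels now
            default:
                return "skip";     // not deployed yet, or already terminal
        }
    }

    public static void main(String[] args) {
        for (State s : State.values()) {
            System.out.println(s + " -> " + dispatch(s));
        }
    }
}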

Example 3 with ExecutionVertexID

Use of org.apache.flink.runtime.scheduler.strategy.ExecutionVertexID in project flink by apache.

From the class DefaultExecutionGraph, method maybeReleasePartitionGroupsFor.

private void maybeReleasePartitionGroupsFor(final Execution attempt) {
    final ExecutionVertexID finishedExecutionVertex = attempt.getVertex().getID();
    if (attempt.getState() == ExecutionState.FINISHED) {
        final List<ConsumedPartitionGroup> releasablePartitionGroups = partitionGroupReleaseStrategy.vertexFinished(finishedExecutionVertex);
        releasePartitionGroups(releasablePartitionGroups);
    } else {
        partitionGroupReleaseStrategy.vertexUnfinished(finishedExecutionVertex);
    }
}
Also used : ConsumedPartitionGroup(org.apache.flink.runtime.scheduler.strategy.ConsumedPartitionGroup) ExecutionVertexID(org.apache.flink.runtime.scheduler.strategy.ExecutionVertexID)
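
The strategy contract exercised here is small: vertexFinished may return partition groups that have become safe to release, and vertexUnfinished retracts an earlier finish (for example, when the vertex is restarted). The toy, plain-Java tracker below illustrates one plausible semantics for that contract, treating a group as releasable once all of its consumer vertices have finished; all names are illustrative, and this is not Flink's actual implementation.

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

final class ToyReleaseTracker {

    // Consumers that still have to finish before a group can be released.
    private final Map<String, Set<String>> pendingConsumersByGroup = new HashMap<>();
    private final Map<String, List<String>> groupsByVertex = new HashMap<>();

    void register(String group, List<String> consumerVertices) {
        pendingConsumersByGroup.put(group, new HashSet<>(consumerVertices));
        for (String vertex : consumerVertices) {
            groupsByVertex.computeIfAbsent(vertex, k -> new ArrayList<>()).add(group);
        }
    }

    // Analogue of vertexFinished: returns the groups now safe to release.
    List<String> vertexFinished(String vertex) {
        List<String> releasable = new ArrayList<>();
        for (String group : groupsByVertex.getOrDefault(vertex, Collections.emptyList())) {
            Set<String> pending = pendingConsumersByGroup.get(group);
            pending.remove(vertex);
            if (pending.isEmpty()) {
                releasable.add(group);
            }
        }
        return releasable;
    }

    // Analogue of vertexUnfinished: the vertex must be waited for again.
    void vertexUnfinished(String vertex) {
        for (String group : groupsByVertex.getOrDefault(vertex, Collections.emptyList())) {
            pendingConsumersByGroup.get(group).add(vertex);
        }
    }
}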

Example 4 with ExecutionVertexID

Use of org.apache.flink.runtime.scheduler.strategy.ExecutionVertexID in project flink by apache.

From the class RestartPipelinedRegionFailoverStrategy, method getTasksNeedingRestart.

// ------------------------------------------------------------------------
// task failure handling
// ------------------------------------------------------------------------
/**
 * Returns a set of IDs corresponding to the set of vertices that should be restarted. In this
 * strategy, all task vertices in 'involved' regions are proposed to be restarted. The
 * 'involved' regions are calculated with the rules below:
 *
 * <ol>
 *   <li>The region containing the failed task is always involved.
 *   <li>If an input result partition of an involved region is not available, i.e. missing or
 *       corrupted, the region containing the partition producer task is involved.
 *   <li>If a region is involved, all of its consumer regions are involved.
 * </ol>
 *
 * @param executionVertexId ID of the failed task
 * @param cause cause of the failure
 * @return set of IDs of vertices to restart
 */
@Override
public Set<ExecutionVertexID> getTasksNeedingRestart(ExecutionVertexID executionVertexId, Throwable cause) {
    LOG.info("Calculating tasks to restart to recover the failed task {}.", executionVertexId);
    final SchedulingPipelinedRegion failedRegion = topology.getPipelinedRegionOfVertex(executionVertexId);
    if (failedRegion == null) {
        // TODO: show the task name in the log
        throw new IllegalStateException("Can not find the failover region for task " + executionVertexId, cause);
    }
    // If the failure cause is a data consumption error, mark the corresponding
    // data partition as failed so that the failover process will try to recover it.
    Optional<PartitionException> dataConsumptionException = ExceptionUtils.findThrowable(cause, PartitionException.class);
    if (dataConsumptionException.isPresent()) {
        resultPartitionAvailabilityChecker.markResultPartitionFailed(dataConsumptionException.get().getPartitionId().getPartitionId());
    }
    // calculate the tasks to restart based on the result of regions to restart
    Set<ExecutionVertexID> tasksToRestart = new HashSet<>();
    for (SchedulingPipelinedRegion region : getRegionsToRestart(failedRegion)) {
        for (SchedulingExecutionVertex vertex : region.getVertices()) {
            // we do not need to restart tasks which are already in the initial state
            if (vertex.getState() != ExecutionState.CREATED) {
                tasksToRestart.add(vertex.getId());
            }
        }
    }
    // the previous failed partition will be recovered. remove its failed state from the checker
    if (dataConsumptionException.isPresent()) {
        resultPartitionAvailabilityChecker.removeResultPartitionFromFailedState(dataConsumptionException.get().getPartitionId().getPartitionId());
    }
    LOG.info("{} tasks should be restarted to recover the failed task {}. ", tasksToRestart.size(), executionVertexId);
    return tasksToRestart;
}
Also used : SchedulingExecutionVertex(org.apache.flink.runtime.scheduler.strategy.SchedulingExecutionVertex) ExecutionVertexID(org.apache.flink.runtime.scheduler.strategy.ExecutionVertexID) PartitionException(org.apache.flink.runtime.io.network.partition.PartitionException) SchedulingPipelinedRegion(org.apache.flink.runtime.scheduler.strategy.SchedulingPipelinedRegion) HashSet(java.util.HashSet)
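
A detail worth noting: ExceptionUtils.findThrowable searches the whole cause chain, so a PartitionException buried under wrapper exceptions is still recognized as a data consumption error. A minimal sketch of that behavior, using IllegalStateException in place of a real PartitionException:

import java.util.Optional;

import org.apache.flink.util.ExceptionUtils;

public class FindThrowableSketch {
    public static void main(String[] args) {
        // The interesting failure is wrapped twice, as often happens in practice.
        Throwable cause = new RuntimeException("task failure",
                new RuntimeException("wrapper", new IllegalStateException("root cause")));

        // findThrowable walks the cause chain and returns the first match.
        Optional<IllegalStateException> found =
                ExceptionUtils.findThrowable(cause, IllegalStateException.class);
        System.out.println(found.isPresent()); // prints: true
    }
}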

Example 5 with ExecutionVertexID

Use of org.apache.flink.runtime.scheduler.strategy.ExecutionVertexID in project flink by apache.

From the class RestartPipelinedRegionFailoverStrategy, method getRegionsToRestart.

/**
 * All 'involved' regions are proposed to be restarted. The 'involved' regions are calculated
 * with the rules below:
 *
 * <ol>
 *   <li>The region containing the failed task is always involved.
 *   <li>If an input result partition of an involved region is not available, i.e. missing or
 *       corrupted, the region containing the partition producer task is involved.
 *   <li>If a region is involved, all of its consumer regions are involved.
 * </ol>
 */
private Set<SchedulingPipelinedRegion> getRegionsToRestart(SchedulingPipelinedRegion failedRegion) {
    Set<SchedulingPipelinedRegion> regionsToRestart = Collections.newSetFromMap(new IdentityHashMap<>());
    Set<SchedulingPipelinedRegion> visitedRegions = Collections.newSetFromMap(new IdentityHashMap<>());
    Set<ConsumedPartitionGroup> visitedConsumedResultGroups = Collections.newSetFromMap(new IdentityHashMap<>());
    Set<ConsumerVertexGroup> visitedConsumerVertexGroups = Collections.newSetFromMap(new IdentityHashMap<>());
    // start from the failed region to visit all involved regions
    Queue<SchedulingPipelinedRegion> regionsToVisit = new ArrayDeque<>();
    visitedRegions.add(failedRegion);
    regionsToVisit.add(failedRegion);
    while (!regionsToVisit.isEmpty()) {
        SchedulingPipelinedRegion regionToRestart = regionsToVisit.poll();
        // an involved region should be restarted
        regionsToRestart.add(regionToRestart);
        // if a needed input result partition is not available, its producer region is involved
        for (IntermediateResultPartitionID consumedPartitionId : getConsumedPartitionsToVisit(regionToRestart, visitedConsumedResultGroups)) {
            if (!resultPartitionAvailabilityChecker.isAvailable(consumedPartitionId)) {
                SchedulingResultPartition consumedPartition = topology.getResultPartition(consumedPartitionId);
                SchedulingPipelinedRegion producerRegion = topology.getPipelinedRegionOfVertex(consumedPartition.getProducer().getId());
                if (!visitedRegions.contains(producerRegion)) {
                    visitedRegions.add(producerRegion);
                    regionsToVisit.add(producerRegion);
                }
            }
        }
        // all consumer regions of an involved region should be involved
        for (ExecutionVertexID consumerVertexId : getConsumerVerticesToVisit(regionToRestart, visitedConsumerVertexGroups)) {
            SchedulingPipelinedRegion consumerRegion = topology.getPipelinedRegionOfVertex(consumerVertexId);
            if (!visitedRegions.contains(consumerRegion)) {
                visitedRegions.add(consumerRegion);
                regionsToVisit.add(consumerRegion);
            }
        }
    }
    return regionsToRestart;
}
Also used : ConsumedPartitionGroup(org.apache.flink.runtime.scheduler.strategy.ConsumedPartitionGroup) SchedulingResultPartition(org.apache.flink.runtime.scheduler.strategy.SchedulingResultPartition) ExecutionVertexID(org.apache.flink.runtime.scheduler.strategy.ExecutionVertexID) SchedulingPipelinedRegion(org.apache.flink.runtime.scheduler.strategy.SchedulingPipelinedRegion) ConsumerVertexGroup(org.apache.flink.runtime.scheduler.strategy.ConsumerVertexGroup) ArrayDeque(java.util.ArrayDeque) IntermediateResultPartitionID(org.apache.flink.runtime.jobgraph.IntermediateResultPartitionID)
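
Structurally, this is a plain breadth-first search with identity-based visited sets: Collections.newSetFromMap(new IdentityHashMap<>()) compares elements by reference, which is safe as long as each pipelined region is represented by a single object, as it is in the scheduling topology. Below is a self-contained sketch of the same traversal pattern over a generic adjacency map; the names are hypothetical.

import java.util.ArrayDeque;
import java.util.Collections;
import java.util.IdentityHashMap;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.Set;

final class RegionTraversalSketch {

    // Returns every node reachable from start, including start itself.
    static <T> Set<T> reachable(T start, Map<T, List<T>> successors) {
        Set<T> visited = Collections.newSetFromMap(new IdentityHashMap<>());
        Queue<T> toVisit = new ArrayDeque<>();
        visited.add(start);
        toVisit.add(start);
        while (!toVisit.isEmpty()) {
            T current = toVisit.poll();
            for (T next : successors.getOrDefault(current, Collections.emptyList())) {
                // Set.add returns false for an already-visited node,
                // which is how revisits are pruned.
                if (visited.add(next)) {
                    toVisit.add(next);
                }
            }
        }
        return visited;
    }
}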

Aggregations

ExecutionVertexID (org.apache.flink.runtime.scheduler.strategy.ExecutionVertexID): 77 usages
Test (org.junit.Test): 55 usages
JobVertexID (org.apache.flink.runtime.jobgraph.JobVertexID): 21 usages
JobGraph (org.apache.flink.runtime.jobgraph.JobGraph): 19 usages
JobVertex (org.apache.flink.runtime.jobgraph.JobVertex): 18 usages
SchedulingExecutionVertex (org.apache.flink.runtime.scheduler.strategy.SchedulingExecutionVertex): 17 usages
Set (java.util.Set): 16 usages
IntermediateResultPartitionID (org.apache.flink.runtime.jobgraph.IntermediateResultPartitionID): 15 usages
AdaptiveSchedulerTest (org.apache.flink.runtime.scheduler.adaptive.AdaptiveSchedulerTest): 15 usages
TestingSchedulingExecutionVertex (org.apache.flink.runtime.scheduler.strategy.TestingSchedulingExecutionVertex): 15 usages
Collection (java.util.Collection): 11 usages
TestingSchedulingTopology (org.apache.flink.runtime.scheduler.strategy.TestingSchedulingTopology): 11 usages
HashSet (java.util.HashSet): 10 usages
ExecutionVertex (org.apache.flink.runtime.executiongraph.ExecutionVertex): 10 usages
ArrayList (java.util.ArrayList): 9 usages
Map (java.util.Map): 9 usages
HashMap (java.util.HashMap): 8 usages
List (java.util.List): 8 usages
CompletableFuture (java.util.concurrent.CompletableFuture): 8 usages
ArchivedExecutionVertex (org.apache.flink.runtime.executiongraph.ArchivedExecutionVertex): 7 usages