Use of org.apache.flink.runtime.scheduler.strategy.SchedulingPipelinedRegion in project flink by apache: the class RestartPipelinedRegionFailoverStrategy, method getTasksNeedingRestart.
// ------------------------------------------------------------------------
//  task failure handling
// ------------------------------------------------------------------------

/**
 * Returns a set of IDs corresponding to the set of vertices that should be restarted. In this
 * strategy, all task vertices in 'involved' regions are proposed to be restarted. The
 * 'involved' regions are calculated with rules below:
 * 1. The region containing the failed task is always involved
 * 2. If an input result partition of an involved region is not available, i.e. Missing or
 *    Corrupted, the region containing the partition producer task is involved
 * 3. If a region is involved, all of its consumer regions are involved
 *
 * @param executionVertexId ID of the failed task
 * @param cause cause of the failure
 * @return set of IDs of vertices to restart
 */
@Override
public Set<ExecutionVertexID> getTasksNeedingRestart(
        ExecutionVertexID executionVertexId, Throwable cause) {
    LOG.info("Calculating tasks to restart to recover the failed task {}.", executionVertexId);

    final SchedulingPipelinedRegion failedRegion =
            topology.getPipelinedRegionOfVertex(executionVertexId);
    if (failedRegion == null) {
        // TODO: show the task name in the log
        throw new IllegalStateException(
                "Can not find the failover region for task " + executionVertexId, cause);
    }

    // if the failure cause is a data consumption error, mark the corresponding data partition
    // as failed, so that the failover process will try to recover it
    Optional<PartitionException> dataConsumptionException =
            ExceptionUtils.findThrowable(cause, PartitionException.class);
    if (dataConsumptionException.isPresent()) {
        resultPartitionAvailabilityChecker.markResultPartitionFailed(
                dataConsumptionException.get().getPartitionId().getPartitionId());
    }

    // calculate the tasks to restart based on the result of regions to restart
    Set<ExecutionVertexID> tasksToRestart = new HashSet<>();
    for (SchedulingPipelinedRegion region : getRegionsToRestart(failedRegion)) {
        for (SchedulingExecutionVertex vertex : region.getVertices()) {
            // we do not need to restart tasks which are already in the initial state
            if (vertex.getState() != ExecutionState.CREATED) {
                tasksToRestart.add(vertex.getId());
            }
        }
    }

    // the previously failed partition will be recovered, so remove its failed state from the checker
    if (dataConsumptionException.isPresent()) {
        resultPartitionAvailabilityChecker.removeResultPartitionFromFailedState(
                dataConsumptionException.get().getPartitionId().getPartitionId());
    }

    LOG.info(
            "{} tasks should be restarted to recover the failed task {}.",
            tasksToRestart.size(),
            executionVertexId);
    return tasksToRestart;
}
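For context, the snippet below is a minimal sketch of how a caller might use getTasksNeedingRestart: it asks the strategy to expand a single failed vertex into the full set of vertices in the involved regions and then hands that set to a restart hook. The surrounding class, the restartVertices hook, and the import path of the strategy are assumptions for illustration; this is not Flink's actual failure-handling code.

import java.util.Set;

import org.apache.flink.runtime.scheduler.strategy.ExecutionVertexID;
// the strategy's package is version-dependent; adjust the import to the Flink version in use
import org.apache.flink.runtime.executiongraph.failover.flip1.RestartPipelinedRegionFailoverStrategy;

// Hypothetical caller of the strategy shown above.
final class FailureHandlerSketch {

    private final RestartPipelinedRegionFailoverStrategy failoverStrategy;

    FailureHandlerSketch(RestartPipelinedRegionFailoverStrategy failoverStrategy) {
        this.failoverStrategy = failoverStrategy;
    }

    void onTaskFailure(ExecutionVertexID failedVertex, Throwable cause) {
        // the strategy expands one failed vertex into every vertex of the involved regions
        Set<ExecutionVertexID> verticesToRestart =
                failoverStrategy.getTasksNeedingRestart(failedVertex, cause);
        restartVertices(verticesToRestart);
    }

    private void restartVertices(Set<ExecutionVertexID> vertices) {
        // assumed hook: hand the vertex IDs to whatever restart mechanism is in place
        vertices.forEach(id -> System.out.println("restarting " + id));
    }
}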
Use of org.apache.flink.runtime.scheduler.strategy.SchedulingPipelinedRegion in project flink by apache: the class RestartPipelinedRegionFailoverStrategy, method getRegionsToRestart.
/**
 * All 'involved' regions are proposed to be restarted. The 'involved' regions are calculated
 * with rules below:
 * 1. The region containing the failed task is always involved
 * 2. If an input result partition of an involved region is not available, i.e. Missing or
 *    Corrupted, the region containing the partition producer task is involved
 * 3. If a region is involved, all of its consumer regions are involved
 */
private Set<SchedulingPipelinedRegion> getRegionsToRestart(
        SchedulingPipelinedRegion failedRegion) {
    Set<SchedulingPipelinedRegion> regionsToRestart =
            Collections.newSetFromMap(new IdentityHashMap<>());
    Set<SchedulingPipelinedRegion> visitedRegions =
            Collections.newSetFromMap(new IdentityHashMap<>());
    Set<ConsumedPartitionGroup> visitedConsumedResultGroups =
            Collections.newSetFromMap(new IdentityHashMap<>());
    Set<ConsumerVertexGroup> visitedConsumerVertexGroups =
            Collections.newSetFromMap(new IdentityHashMap<>());

    // start from the failed region to visit all involved regions
    Queue<SchedulingPipelinedRegion> regionsToVisit = new ArrayDeque<>();
    visitedRegions.add(failedRegion);
    regionsToVisit.add(failedRegion);
    while (!regionsToVisit.isEmpty()) {
        SchedulingPipelinedRegion regionToRestart = regionsToVisit.poll();

        // an involved region should be restarted
        regionsToRestart.add(regionToRestart);

        // if a needed input result partition is not available, its producer region is involved
        for (IntermediateResultPartitionID consumedPartitionId :
                getConsumedPartitionsToVisit(regionToRestart, visitedConsumedResultGroups)) {
            if (!resultPartitionAvailabilityChecker.isAvailable(consumedPartitionId)) {
                SchedulingResultPartition consumedPartition =
                        topology.getResultPartition(consumedPartitionId);
                SchedulingPipelinedRegion producerRegion =
                        topology.getPipelinedRegionOfVertex(
                                consumedPartition.getProducer().getId());
                if (!visitedRegions.contains(producerRegion)) {
                    visitedRegions.add(producerRegion);
                    regionsToVisit.add(producerRegion);
                }
            }
        }

        // all consumer regions of an involved region should be involved
        for (ExecutionVertexID consumerVertexId :
                getConsumerVerticesToVisit(regionToRestart, visitedConsumerVertexGroups)) {
            SchedulingPipelinedRegion consumerRegion =
                    topology.getPipelinedRegionOfVertex(consumerVertexId);
            if (!visitedRegions.contains(consumerRegion)) {
                visitedRegions.add(consumerRegion);
                regionsToVisit.add(consumerRegion);
            }
        }
    }

    return regionsToRestart;
}
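The traversal above is a plain breadth-first search over regions, using identity-based visited sets (regions are compared by object identity rather than equals). The following JDK-only sketch reproduces the same idiom on a toy graph; the class name, the stand-in string "regions", and the involvement edges are made up purely to illustrate how rules 2 and 3 pull further regions into the restart set.

import java.util.ArrayDeque;
import java.util.Collections;
import java.util.IdentityHashMap;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.Set;

public final class RegionTraversalSketch {

    public static void main(String[] args) {
        // toy "involvement" edges: restarting A pulls in B (unavailable producer) and C
        // (downstream consumer), and C in turn pulls in D
        Map<String, List<String>> involves = Map.of(
                "A", List.of("B", "C"),
                "B", List.of(),
                "C", List.of("D"),
                "D", List.of());

        // prints all four regions; iteration order of an identity-based set is unspecified
        System.out.println(collectInvolvedRegions("A", involves));
    }

    static Set<String> collectInvolvedRegions(String failedRegion, Map<String, List<String>> involves) {
        // same idiom as the Flink method: identity-based visited set plus a FIFO work queue;
        // identity comparison is safe here because the toy keys are interned string literals
        Set<String> regionsToRestart = Collections.newSetFromMap(new IdentityHashMap<>());
        Set<String> visitedRegions = Collections.newSetFromMap(new IdentityHashMap<>());
        Queue<String> regionsToVisit = new ArrayDeque<>();

        visitedRegions.add(failedRegion);
        regionsToVisit.add(failedRegion);
        while (!regionsToVisit.isEmpty()) {
            String region = regionsToVisit.poll();
            // an involved region should be restarted
            regionsToRestart.add(region);
            for (String involved : involves.get(region)) {
                if (!visitedRegions.contains(involved)) {
                    visitedRegions.add(involved);
                    regionsToVisit.add(involved);
                }
            }
        }
        return regionsToRestart;
    }
}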
Use of org.apache.flink.runtime.scheduler.strategy.SchedulingPipelinedRegion in project flink by apache: the class DefaultExecutionTopologyTest, method testExistingRegionsAreNotAffectedDuringTopologyUpdate.
@Test
public void testExistingRegionsAreNotAffectedDuringTopologyUpdate() throws Exception {
    final JobVertex[] jobVertices = createJobVertices(BLOCKING);
    executionGraph = createDynamicGraph(jobVertices);
    adapter = DefaultExecutionTopology.fromExecutionGraph(executionGraph);

    final ExecutionJobVertex ejv1 = executionGraph.getJobVertex(jobVertices[0].getID());
    final ExecutionJobVertex ejv2 = executionGraph.getJobVertex(jobVertices[1].getID());

    executionGraph.initializeJobVertex(ejv1, 0L);
    adapter.notifyExecutionGraphUpdated(executionGraph, Collections.singletonList(ejv1));
    SchedulingPipelinedRegion regionOld =
            adapter.getPipelinedRegionOfVertex(new ExecutionVertexID(ejv1.getJobVertexId(), 0));

    executionGraph.initializeJobVertex(ejv2, 0L);
    adapter.notifyExecutionGraphUpdated(executionGraph, Collections.singletonList(ejv2));
    SchedulingPipelinedRegion regionNew =
            adapter.getPipelinedRegionOfVertex(new ExecutionVertexID(ejv1.getJobVertexId(), 0));

    assertSame(regionOld, regionNew);
}
Use of org.apache.flink.runtime.scheduler.strategy.SchedulingPipelinedRegion in project flink by apache: the class RegionPartitionGroupReleaseStrategy, method vertexUnfinished.
@Override
public void vertexUnfinished(final ExecutionVertexID executionVertexId) {
    final PipelinedRegionExecutionView regionExecutionView =
            getPipelinedRegionExecutionViewForVertex(executionVertexId);
    regionExecutionView.vertexUnfinished(executionVertexId);

    final SchedulingPipelinedRegion pipelinedRegion =
            schedulingTopology.getPipelinedRegionOfVertex(executionVertexId);
    consumerRegionGroupExecutionViewMaintainer.regionUnfinished(pipelinedRegion);
}
Use of org.apache.flink.runtime.scheduler.strategy.SchedulingPipelinedRegion in project flink by apache: the class RegionPartitionGroupReleaseStrategy, method vertexFinished.
@Override
public List<ConsumedPartitionGroup> vertexFinished(final ExecutionVertexID finishedVertex) {
    final PipelinedRegionExecutionView regionExecutionView =
            getPipelinedRegionExecutionViewForVertex(finishedVertex);
    regionExecutionView.vertexFinished(finishedVertex);

    if (regionExecutionView.isFinished()) {
        final SchedulingPipelinedRegion pipelinedRegion =
                schedulingTopology.getPipelinedRegionOfVertex(finishedVertex);
        consumerRegionGroupExecutionViewMaintainer.regionFinished(pipelinedRegion);
        return filterReleasablePartitionGroups(
                pipelinedRegion.getAllBlockingConsumedPartitionGroups());
    }
    return Collections.emptyList();
}
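Taken together, the two callbacks are meant to be driven by a scheduler as vertices change state: vertexFinished may return blocking ConsumedPartitionGroups that are no longer needed, and vertexUnfinished retracts that once a vertex is restarted. The sketch below only illustrates that calling pattern; the surrounding class, the releasePartitionGroup hook, and the import path of RegionPartitionGroupReleaseStrategy are assumptions, not Flink's scheduler code.

import java.util.List;

import org.apache.flink.runtime.scheduler.strategy.ConsumedPartitionGroup;
import org.apache.flink.runtime.scheduler.strategy.ExecutionVertexID;
// package of the release strategy is version-dependent; adjust the import to the Flink version in use
import org.apache.flink.runtime.executiongraph.failover.flip1.partitionrelease.RegionPartitionGroupReleaseStrategy;

// Hypothetical driver of the release strategy shown above.
final class PartitionReleaseSketch {

    private final RegionPartitionGroupReleaseStrategy releaseStrategy;

    PartitionReleaseSketch(RegionPartitionGroupReleaseStrategy releaseStrategy) {
        this.releaseStrategy = releaseStrategy;
    }

    void onVertexFinished(ExecutionVertexID vertexId) {
        // when the vertex completes its whole pipelined region, the strategy returns the
        // blocking partition groups it consumed that are safe to release
        List<ConsumedPartitionGroup> releasable = releaseStrategy.vertexFinished(vertexId);
        releasable.forEach(PartitionReleaseSketch::releasePartitionGroup);
    }

    void onVertexRestarted(ExecutionVertexID vertexId) {
        // a vertex leaving the FINISHED state makes its region unfinished again,
        // so its consumed partitions must be kept around
        releaseStrategy.vertexUnfinished(vertexId);
    }

    private static void releasePartitionGroup(ConsumedPartitionGroup group) {
        // assumed hook: ask the partition tracker / shuffle service to free these partitions
        System.out.println("releasing " + group);
    }
}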