Search in sources :

Example 46 with ExecutionVertexID

use of org.apache.flink.runtime.scheduler.strategy.ExecutionVertexID in project flink-mirror by flink-ci.

the class SchedulingPipelinedRegionComputeUtil method buildOutEdgesDesc.

/**
 * Describes the outgoing edges between regions as lists of region indices.
 *
 * <p>The i-th entry of the returned list belongs to the i-th region of {@code regionList} and
 * holds the indices (positions in {@code regionList}) of the regions that contain consumers of
 * the reconnectable results produced by vertices of that region. Consumers located in the
 * producing region itself do not contribute an entry. The same index may appear more than once.
 *
 * @param vertexToRegion mapping from a vertex to the region it currently belongs to
 * @param regionList all regions; the position in this list is the region's index
 * @param executionVertexRetriever resolves an {@link ExecutionVertexID} to its vertex
 * @return for each region, in {@code regionList} order, the indices of its downstream regions
 */
private static List<List<Integer>> buildOutEdgesDesc(final Map<SchedulingExecutionVertex, Set<SchedulingExecutionVertex>> vertexToRegion, final List<Set<SchedulingExecutionVertex>> regionList, final Function<ExecutionVertexID, ? extends SchedulingExecutionVertex> executionVertexRetriever) {
    // Regions are looked up by object identity rather than by set equality.
    final Map<Set<SchedulingExecutionVertex>, Integer> indexByRegion = new IdentityHashMap<>();
    int position = 0;
    for (Set<SchedulingExecutionVertex> region : regionList) {
        indexByRegion.put(region, position);
        position++;
    }

    final List<List<Integer>> edgesPerRegion = new ArrayList<>(regionList.size());
    for (Set<SchedulingExecutionVertex> sourceRegion : regionList) {
        final List<Integer> targetIndices = new ArrayList<>();
        for (SchedulingExecutionVertex producer : sourceRegion) {
            for (SchedulingResultPartition result : producer.getProducedResults()) {
                // Only reconnectable results that actually have consumers are of interest.
                if (result.getResultType().isReconnectable()) {
                    final Optional<ConsumerVertexGroup> consumers = result.getConsumerVertexGroup();
                    if (consumers.isPresent()) {
                        for (ExecutionVertexID consumerId : consumers.get()) {
                            final SchedulingExecutionVertex consumer = executionVertexRetriever.apply(consumerId);
                            // Stop processing this consumer group as soon as one of its
                            // vertices is outside the current regions and cannot be merged.
                            if (!vertexToRegion.containsKey(consumer)) {
                                break;
                            }
                            final boolean leavesSourceRegion = !sourceRegion.contains(consumer);
                            if (leavesSourceRegion) {
                                final Set<SchedulingExecutionVertex> targetRegion = vertexToRegion.get(consumer);
                                targetIndices.add(indexByRegion.get(targetRegion));
                            }
                        }
                    }
                }
            }
        }
        edgesPerRegion.add(targetIndices);
    }
    return edgesPerRegion;
}
Also used : Set(java.util.Set) HashSet(java.util.HashSet) IdentityHashMap(java.util.IdentityHashMap) ArrayList(java.util.ArrayList) ConsumerVertexGroup(org.apache.flink.runtime.scheduler.strategy.ConsumerVertexGroup) SchedulingExecutionVertex(org.apache.flink.runtime.scheduler.strategy.SchedulingExecutionVertex) SchedulingResultPartition(org.apache.flink.runtime.scheduler.strategy.SchedulingResultPartition) ExecutionVertexID(org.apache.flink.runtime.scheduler.strategy.ExecutionVertexID) ArrayList(java.util.ArrayList) List(java.util.List)

Example 47 with ExecutionVertexID

use of org.apache.flink.runtime.scheduler.strategy.ExecutionVertexID in project flink-mirror by flink-ci.

the class Execution method updatePartitionConsumers.

/**
 * Makes the given partition known to the current execution attempts of its consumers.
 *
 * <p>Nothing happens when the partition has no consumer vertex group. Otherwise each consumer is
 * handled according to the state of its current execution attempt:
 *
 * <ul>
 *   <li>{@code DEPLOYING}: the partition info is cached on the consumer vertex;
 *   <li>{@code RUNNING} or {@code INITIALIZING}: the partition info is sent to the consumer
 *       right away via an update RPC call;
 *   <li>any other state: the consumer is skipped.
 * </ul>
 *
 * @param partition the partition whose consumers should be updated
 */
private void updatePartitionConsumers(final IntermediateResultPartition partition) {
    final Optional<ConsumerVertexGroup> consumerGroup = partition.getConsumerVertexGroupOptional();
    if (consumerGroup.isPresent()) {
        for (ExecutionVertexID consumerId : consumerGroup.get()) {
            final ExecutionVertex consumerVertex = vertex.getExecutionGraphAccessor().getExecutionVertexOrThrow(consumerId);
            final Execution consumerAttempt = consumerVertex.getCurrentExecutionAttempt();
            final ExecutionState stateOfConsumer = consumerAttempt.getState();
            if (stateOfConsumer == DEPLOYING) {
                // Consumer is still deploying => cache the partition info on its vertex.
                consumerVertex.cachePartitionInfo(createPartitionInfo(partition));
            } else if (stateOfConsumer == RUNNING || stateOfConsumer == INITIALIZING) {
                // Consumer is already up => send the update message now.
                consumerAttempt.sendUpdatePartitionInfoRpcCall(Collections.singleton(createPartitionInfo(partition)));
            }
        }
    }
}
Also used : ExecutionState(org.apache.flink.runtime.execution.ExecutionState) ExecutionVertexID(org.apache.flink.runtime.scheduler.strategy.ExecutionVertexID) ConsumerVertexGroup(org.apache.flink.runtime.scheduler.strategy.ConsumerVertexGroup)

Example 48 with ExecutionVertexID

use of org.apache.flink.runtime.scheduler.strategy.ExecutionVertexID in project flink-mirror by flink-ci.

the class RestartPipelinedRegionFailoverStrategy method getTasksNeedingRestart.

// ------------------------------------------------------------------------
// task failure handling
// ------------------------------------------------------------------------
/**
 * Computes the IDs of the vertices that should be restarted after a task failure.
 *
 * <p>In this strategy every task vertex of an 'involved' region is proposed for restart, except
 * vertices that are still in state {@code CREATED}. The 'involved' regions are determined by the
 * following rules:
 *
 * <ol>
 *   <li>The region containing the failed task is always involved.
 *   <li>If an input result partition of an involved region is not available, i.e. Missing or
 *       Corrupted, the region containing the partition producer task is involved.
 *   <li>If a region is involved, all of its consumer regions are involved.
 * </ol>
 *
 * @param executionVertexId ID of the failed task
 * @param cause cause of the failure
 * @return set of IDs of vertices to restart
 * @throws IllegalStateException if no pipelined region is known for the failed task
 */
@Override
public Set<ExecutionVertexID> getTasksNeedingRestart(ExecutionVertexID executionVertexId, Throwable cause) {
    LOG.info("Calculating tasks to restart to recover the failed task {}.", executionVertexId);

    final SchedulingPipelinedRegion failedRegion = topology.getPipelinedRegionOfVertex(executionVertexId);
    if (failedRegion == null) {
        // TODO: show the task name in the log
        throw new IllegalStateException("Can not find the failover region for task " + executionVertexId, cause);
    }

    // A data consumption error means the consumed partition is broken: flag it as failed
    // while the regions are computed, so that the failover process will try to recover it.
    final Optional<PartitionException> partitionFailure = ExceptionUtils.findThrowable(cause, PartitionException.class);
    final boolean causedByPartitionFailure = partitionFailure.isPresent();
    if (causedByPartitionFailure) {
        resultPartitionAvailabilityChecker.markResultPartitionFailed(partitionFailure.get().getPartitionId().getPartitionId());
    }

    // Collect the tasks of all regions that have to be restarted.
    final Set<ExecutionVertexID> restartCandidates = new HashSet<>();
    for (SchedulingPipelinedRegion involvedRegion : getRegionsToRestart(failedRegion)) {
        for (SchedulingExecutionVertex task : involvedRegion.getVertices()) {
            // Tasks which are already in the initial state need no restart.
            if (task.getState() == ExecutionState.CREATED) {
                continue;
            }
            restartCandidates.add(task.getId());
        }
    }

    // The previously failed partition will be recovered by the restart, so its failed state
    // is removed from the checker again.
    if (causedByPartitionFailure) {
        resultPartitionAvailabilityChecker.removeResultPartitionFromFailedState(partitionFailure.get().getPartitionId().getPartitionId());
    }

    LOG.info("{} tasks should be restarted to recover the failed task {}. ", restartCandidates.size(), executionVertexId);
    return restartCandidates;
}
Also used : SchedulingExecutionVertex(org.apache.flink.runtime.scheduler.strategy.SchedulingExecutionVertex) ExecutionVertexID(org.apache.flink.runtime.scheduler.strategy.ExecutionVertexID) PartitionException(org.apache.flink.runtime.io.network.partition.PartitionException) SchedulingPipelinedRegion(org.apache.flink.runtime.scheduler.strategy.SchedulingPipelinedRegion) HashSet(java.util.HashSet)

Example 49 with ExecutionVertexID

use of org.apache.flink.runtime.scheduler.strategy.ExecutionVertexID in project flink-mirror by flink-ci.

the class DefaultPreferredLocationsRetriever method getInputLocationFutures.

/**
 * Collects the location futures of the given producers.
 *
 * <p>Producers contained in {@code producersToIgnore} are not queried, and producers for which
 * the retriever yields no future contribute nothing. As soon as more than {@code
 * MAX_DISTINCT_LOCATIONS_TO_CONSIDER} futures have been collected, the method gives up and
 * returns an empty collection.
 *
 * @param producersToIgnore producers whose locations must not be looked up
 * @param producers producers whose location futures are requested
 * @return the collected location futures, or an empty collection if there are too many
 */
private Collection<CompletableFuture<TaskManagerLocation>> getInputLocationFutures(final Set<ExecutionVertexID> producersToIgnore, final Collection<ExecutionVertexID> producers) {
    final Collection<CompletableFuture<TaskManagerLocation>> collected = new ArrayList<>();
    for (ExecutionVertexID producerId : producers) {
        final Optional<CompletableFuture<TaskManagerLocation>> candidate;
        if (producersToIgnore.contains(producerId)) {
            candidate = Optional.empty();
        } else {
            candidate = inputsLocationsRetriever.getTaskManagerLocation(producerId);
        }
        if (candidate.isPresent()) {
            collected.add(candidate.get());
        }
        // Too many location futures are not considered at all, because it could
        // be a long time to wait for all the location futures to complete.
        final boolean tooManyLocations = collected.size() > MAX_DISTINCT_LOCATIONS_TO_CONSIDER;
        if (tooManyLocations) {
            return Collections.emptyList();
        }
    }
    return collected;
}
Also used : CompletableFuture(java.util.concurrent.CompletableFuture) ExecutionVertexID(org.apache.flink.runtime.scheduler.strategy.ExecutionVertexID) ArrayList(java.util.ArrayList)

Example 50 with ExecutionVertexID

use of org.apache.flink.runtime.scheduler.strategy.ExecutionVertexID in project flink-mirror by flink-ci.

the class DefaultScheduler method assignResource.

/**
 * Creates the callback that assigns an allocated slot to the execution vertex of the given
 * deployment.
 *
 * <p>The returned function behaves as follows when invoked with the slot and a possible failure:
 *
 * <ul>
 *   <li>If the required vertex version has been modified in the meantime, the slot (when the
 *       request did not fail) is released and {@code null} is returned; a failure is not
 *       propagated in this case.
 *   <li>Otherwise a failure is rethrown wrapped in a {@link CompletionException}.
 *   <li>Otherwise the slot is assigned to the execution vertex, the allocation reservation is
 *       started and the slot is returned.
 * </ul>
 *
 * @param deploymentHandle handle describing the deployment the slot was requested for
 * @return the slot assignment callback
 */
private BiFunction<LogicalSlot, Throwable, LogicalSlot> assignResource(final DeploymentHandle deploymentHandle) {
    final ExecutionVertexVersion expectedVersion = deploymentHandle.getRequiredVertexVersion();
    final ExecutionVertexID vertexId = deploymentHandle.getExecutionVertexId();
    return (slot, failure) -> {
        final boolean slotRequestFailed = failure != null;
        final boolean deploymentSuperseded = executionVertexVersioner.isModified(expectedVersion);
        if (deploymentSuperseded) {
            if (!slotRequestFailed) {
                log.debug("Refusing to assign slot to execution vertex {} because this deployment was " + "superseded by another deployment", vertexId);
                releaseSlotIfPresent(slot);
            }
            return null;
        }
        // The failure is only thrown for a deployment that has not been superseded; an
        // outdated one returned above, so a failed slot request does not fail
        // a task which is about to cancel in #restartTasksWithDelay(...)
        if (slotRequestFailed) {
            throw new CompletionException(maybeWrapWithNoResourceAvailableException(failure));
        }
        getExecutionVertex(vertexId).tryAssignResource(slot);
        startReserveAllocation(vertexId, slot.getAllocationId());
        return slot;
    };
}
Also used : ShuffleMaster(org.apache.flink.runtime.shuffle.ShuffleMaster) TaskManagerLocation(org.apache.flink.runtime.taskmanager.TaskManagerLocation) BiFunction(java.util.function.BiFunction) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) TimeoutException(java.util.concurrent.TimeoutException) ExceptionUtils(org.apache.flink.util.ExceptionUtils) Vertex(org.apache.flink.runtime.topology.Vertex) Map(java.util.Map) SchedulingTopology(org.apache.flink.runtime.scheduler.strategy.SchedulingTopology) Preconditions.checkNotNull(org.apache.flink.util.Preconditions.checkNotNull) CoLocationGroup(org.apache.flink.runtime.jobmanager.scheduler.CoLocationGroup) SchedulingStrategyFactory(org.apache.flink.runtime.scheduler.strategy.SchedulingStrategyFactory) ScheduledExecutor(org.apache.flink.util.concurrent.ScheduledExecutor) JobManagerJobMetricGroup(org.apache.flink.runtime.metrics.groups.JobManagerJobMetricGroup) Collection(java.util.Collection) Set(java.util.Set) CompletionException(java.util.concurrent.CompletionException) ExecutionVertexID(org.apache.flink.runtime.scheduler.strategy.ExecutionVertexID) Collectors(java.util.stream.Collectors) ResourceProfile(org.apache.flink.runtime.clusterframework.types.ResourceProfile) List(java.util.List) FailoverStrategy(org.apache.flink.runtime.executiongraph.failover.flip1.FailoverStrategy) Optional(java.util.Optional) ExecutionFailureHandler(org.apache.flink.runtime.executiongraph.failover.flip1.ExecutionFailureHandler) Time(org.apache.flink.api.common.time.Time) AllocationID(org.apache.flink.runtime.clusterframework.types.AllocationID) NoResourceAvailableException(org.apache.flink.runtime.jobmanager.scheduler.NoResourceAvailableException) IntermediateResultPartitionID(org.apache.flink.runtime.jobgraph.IntermediateResultPartitionID) ComponentMainThreadExecutor(org.apache.flink.runtime.concurrent.ComponentMainThreadExecutor) SlotSharingGroup(org.apache.flink.runtime.jobmanager.scheduler.SlotSharingGroup) 
HashMap(java.util.HashMap) CompletableFuture(java.util.concurrent.CompletableFuture) JobStatus(org.apache.flink.api.common.JobStatus) Function(java.util.function.Function) ArrayList(java.util.ArrayList) FailureHandlingResult(org.apache.flink.runtime.executiongraph.failover.flip1.FailureHandlingResult) HashSet(java.util.HashSet) OperatorCoordinatorHolder(org.apache.flink.runtime.operators.coordination.OperatorCoordinatorHolder) FutureUtils(org.apache.flink.util.concurrent.FutureUtils) SchedulingStrategy(org.apache.flink.runtime.scheduler.strategy.SchedulingStrategy) Nullable(javax.annotation.Nullable) Preconditions.checkState(org.apache.flink.util.Preconditions.checkState) ExecutionJobVertex(org.apache.flink.runtime.executiongraph.ExecutionJobVertex) Logger(org.slf4j.Logger) Executor(java.util.concurrent.Executor) Configuration(org.apache.flink.configuration.Configuration) ExecutionState(org.apache.flink.runtime.execution.ExecutionState) CheckpointsCleaner(org.apache.flink.runtime.checkpoint.CheckpointsCleaner) LogicalSlot(org.apache.flink.runtime.jobmaster.LogicalSlot) IterableUtils(org.apache.flink.util.IterableUtils) JobStatusListener(org.apache.flink.runtime.executiongraph.JobStatusListener) CheckpointRecoveryFactory(org.apache.flink.runtime.checkpoint.CheckpointRecoveryFactory) RestartBackoffTimeStrategy(org.apache.flink.runtime.executiongraph.failover.flip1.RestartBackoffTimeStrategy) TimeUnit(java.util.concurrent.TimeUnit) Consumer(java.util.function.Consumer) FailureHandlingResultSnapshot(org.apache.flink.runtime.scheduler.exceptionhistory.FailureHandlingResultSnapshot) TaskExecutionStateTransition(org.apache.flink.runtime.executiongraph.TaskExecutionStateTransition) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) ExecutionVertexID(org.apache.flink.runtime.scheduler.strategy.ExecutionVertexID) CompletionException(java.util.concurrent.CompletionException) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex)

Aggregations

ExecutionVertexID (org.apache.flink.runtime.scheduler.strategy.ExecutionVertexID)231 Test (org.junit.Test)165 JobVertexID (org.apache.flink.runtime.jobgraph.JobVertexID)63 JobGraph (org.apache.flink.runtime.jobgraph.JobGraph)57 JobVertex (org.apache.flink.runtime.jobgraph.JobVertex)54 SchedulingExecutionVertex (org.apache.flink.runtime.scheduler.strategy.SchedulingExecutionVertex)51 Set (java.util.Set)48 IntermediateResultPartitionID (org.apache.flink.runtime.jobgraph.IntermediateResultPartitionID)45 AdaptiveSchedulerTest (org.apache.flink.runtime.scheduler.adaptive.AdaptiveSchedulerTest)45 TestingSchedulingExecutionVertex (org.apache.flink.runtime.scheduler.strategy.TestingSchedulingExecutionVertex)45 Collection (java.util.Collection)33 TestingSchedulingTopology (org.apache.flink.runtime.scheduler.strategy.TestingSchedulingTopology)33 HashSet (java.util.HashSet)30 ExecutionVertex (org.apache.flink.runtime.executiongraph.ExecutionVertex)30 ArrayList (java.util.ArrayList)27 Map (java.util.Map)27 HashMap (java.util.HashMap)24 List (java.util.List)24 CompletableFuture (java.util.concurrent.CompletableFuture)24 TaskManagerLocation (org.apache.flink.runtime.taskmanager.TaskManagerLocation)24