Search in sources :

Example 11 with Execution

use of org.apache.flink.runtime.executiongraph.Execution in project flink by apache.

the class CheckpointCoordinator method sendAbortedMessages.

/**
 * Notifies the given tasks and every operator coordinator in {@code coordinatorsToCheckpoint}
 * that a checkpoint has been aborted. Asserts that the caller holds {@code lock}.
 *
 * <p>The task notifications are submitted to {@code executor}; the coordinator notifications
 * are issued directly by this method.
 *
 * @param tasksToAbort vertices whose current execution attempt is notified of the abort
 * @param checkpointId id of the aborted checkpoint
 * @param timeStamp timestamp forwarded with each task notification
 */
private void sendAbortedMessages(List<ExecutionVertex> tasksToAbort, long checkpointId, long timeStamp) {
    assert (Thread.holdsLock(lock));
    // Read the latest completed checkpoint id now, so the submitted work uses the value
    // that was current when this method was called.
    final long lastCompletedId = completedCheckpointStore.getLatestCheckpointId();
    executor.execute(() -> {
        for (ExecutionVertex vertex : tasksToAbort) {
            final Execution attempt = vertex.getCurrentExecutionAttempt();
            if (attempt == null) {
                // No current attempt for this vertex, so there is nothing to notify.
                continue;
            }
            attempt.notifyCheckpointAborted(checkpointId, lastCompletedId, timeStamp);
        }
    });
    // Tell the operator coordinators about the abort as well.
    for (OperatorCoordinatorCheckpointContext context : coordinatorsToCheckpoint) {
        context.notifyCheckpointAborted(checkpointId);
    }
}
Also used : Execution(org.apache.flink.runtime.executiongraph.Execution) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex)

Example 12 with Execution

use of org.apache.flink.runtime.executiongraph.Execution in project flink by apache.

the class DefaultOperatorCoordinatorHandler method deliverOperatorEventToCoordinator.

@Override
public void deliverOperatorEventToCoordinator(final ExecutionAttemptID taskExecutionId, final OperatorID operatorId, final OperatorEvent evt) throws FlinkException {
    // Failure contract of this method:
    //  - an event from a task that is unknown or neither RUNNING nor INITIALIZING, or for an
    //    operator without a registered coordinator, is answered with an exception;
    //  - once both task and coordinator exist, the call is treated as valid and any exception
    //    escaping the coordinator is escalated as a global (job-level) failure.
    final Execution exec = executionGraph.getRegisteredExecutions().get(taskExecutionId);
    final boolean taskMaySendEvents =
            exec != null
                    && (exec.getState() == ExecutionState.RUNNING
                            || exec.getState() == ExecutionState.INITIALIZING);
    if (!taskMaySendEvents) {
        // To be on the safe side, tell the TaskManager that the event could not be delivered.
        throw new TaskNotRunningException("Task is not known or in state running on the JobManager.");
    }

    final OperatorCoordinatorHolder holder = coordinatorMap.get(operatorId);
    if (holder == null) {
        throw new FlinkException("No coordinator registered for operator " + operatorId);
    }

    try {
        holder.handleEventFromOperator(exec.getParallelSubtaskIndex(), evt);
    } catch (Throwable failure) {
        // Fatal errors and OOMs are rethrown by the helper; everything else fails the job.
        ExceptionUtils.rethrowIfFatalErrorOrOOM(failure);
        globalFailureHandler.handleGlobalFailure(failure);
    }
}
Also used : Execution(org.apache.flink.runtime.executiongraph.Execution) TaskNotRunningException(org.apache.flink.runtime.operators.coordination.TaskNotRunningException) OperatorCoordinatorHolder(org.apache.flink.runtime.operators.coordination.OperatorCoordinatorHolder) FlinkException(org.apache.flink.util.FlinkException)

Example 13 with Execution

use of org.apache.flink.runtime.executiongraph.Execution in project flink by apache.

the class DefaultCheckpointPlanCalculator method calculateAfterTasksFinished.

/**
 * Calculates the checkpoint plan for a job in which some tasks have already finished. The job
 * vertices are visited in topological order to find the tasks that are still running but have
 * no running precedent tasks; those are the tasks to trigger.
 *
 * @return The plan of this checkpoint.
 */
private CheckpointPlan calculateAfterTasksFinished() {
    // Capture the running status of all tasks in one BitSet per job vertex up front. This
    // allows decisions at the JobVertex level and avoids repeatedly reading the volatile
    // finished flag of each Execution.
    final Map<JobVertexID, BitSet> runningByVertex = collectTaskRunningStatus();

    final List<Execution> tasksToTrigger = new ArrayList<>();
    final List<Execution> tasksToWaitFor = new ArrayList<>();
    final List<ExecutionVertex> tasksToCommitTo = new ArrayList<>();
    final List<Execution> finishedTasks = new ArrayList<>();
    final List<ExecutionJobVertex> fullyFinishedJobVertex = new ArrayList<>();

    for (ExecutionJobVertex jobVertex : jobVerticesInTopologyOrder) {
        final BitSet running = runningByVertex.get(jobVertex.getJobVertexId());

        // No running subtask at all: the whole job vertex counts as finished.
        if (running.isEmpty()) {
            fullyFinishedJobVertex.add(jobVertex);
            for (ExecutionVertex finished : jobVertex.getTaskVertices()) {
                finishedTasks.add(finished.getCurrentExecutionAttempt());
            }
            continue;
        }

        final List<JobEdge> inputEdges = jobVertex.getJobVertex().getInputs();
        // Optimization: decide once per JobVertex whether any of its tasks can be eligible
        // for the trigger set, before looking at individual tasks.
        final boolean mayNeedTrigger = someTasksMustBeTriggered(runningByVertex, inputEdges);

        for (ExecutionVertex task : jobVertex.getTaskVertices()) {
            if (!running.get(task.getParallelSubtaskIndex())) {
                finishedTasks.add(task.getCurrentExecutionAttempt());
                continue;
            }

            // A running task is always waited for and committed to.
            tasksToWaitFor.add(task.getCurrentExecutionAttempt());
            tasksToCommitTo.add(task);

            // It is triggered only when none of its precedent tasks is still running.
            if (mayNeedTrigger && !hasRunningPrecedentTasks(task, inputEdges, runningByVertex)) {
                tasksToTrigger.add(task.getCurrentExecutionAttempt());
            }
        }
    }

    return new DefaultCheckpointPlan(
            Collections.unmodifiableList(tasksToTrigger),
            Collections.unmodifiableList(tasksToWaitFor),
            Collections.unmodifiableList(tasksToCommitTo),
            Collections.unmodifiableList(finishedTasks),
            Collections.unmodifiableList(fullyFinishedJobVertex),
            allowCheckpointsAfterTasksFinished);
}
Also used : JobEdge(org.apache.flink.runtime.jobgraph.JobEdge) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) BitSet(java.util.BitSet) ArrayList(java.util.ArrayList) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) Execution(org.apache.flink.runtime.executiongraph.Execution) ExecutionJobVertex(org.apache.flink.runtime.executiongraph.ExecutionJobVertex)

Example 14 with Execution

use of org.apache.flink.runtime.executiongraph.Execution in project flink by apache.

the class RootExceptionHistoryEntry method fromFailureHandlingResultSnapshot.

/**
 * Creates a {@code RootExceptionHistoryEntry} based on the passed {@link
 * FailureHandlingResultSnapshot}.
 *
 * <p>The failing task name and the task manager location are taken from the snapshot's
 * root-cause execution when one is present; otherwise both are passed on as {@code null}.
 *
 * @param snapshot The snapshot describing the failure (root cause, timestamp, root-cause
 *     execution if any, and concurrently failed executions).
 * @return The {@code RootExceptionHistoryEntry} instance.
 * @throws NullPointerException if the snapshot's root cause is {@code null} (presumably
 *     enforced by the delegated factory method; confirm there).
 * @throws IllegalArgumentException if the snapshot's timestamp is not bigger than {@code 0}
 *     (presumably enforced by the delegated factory method; confirm there).
 */
public static RootExceptionHistoryEntry fromFailureHandlingResultSnapshot(FailureHandlingResultSnapshot snapshot) {
    final Execution rootCauseExecution = snapshot.getRootCauseExecution().orElse(null);

    final String failingTaskName;
    final TaskManagerLocation taskManagerLocation;
    if (rootCauseExecution == null) {
        // No single execution is recorded as the root cause.
        failingTaskName = null;
        taskManagerLocation = null;
    } else {
        failingTaskName = rootCauseExecution.getVertexWithAttempt();
        taskManagerLocation = rootCauseExecution.getAssignedResourceLocation();
    }

    return createRootExceptionHistoryEntry(
            snapshot.getRootCause(),
            snapshot.getTimestamp(),
            failingTaskName,
            taskManagerLocation,
            snapshot.getConcurrentlyFailedExecution());
}
Also used : Execution(org.apache.flink.runtime.executiongraph.Execution) TaskManagerLocation(org.apache.flink.runtime.taskmanager.TaskManagerLocation)

Example 15 with Execution

use of org.apache.flink.runtime.executiongraph.Execution in project flink by apache.

the class FailureHandlingResultSnapshot method create.

/**
 * Creates a {@code FailureHandlingResultSnapshot} based on the passed {@link
 * FailureHandlingResult} and {@link ExecutionVertex ExecutionVertices}.
 *
 * <p>The root-cause execution (if the result names a failed task) must carry failure info.
 * Of the vertices to restart, only those other than the root cause whose latest execution
 * carries failure info are recorded as concurrently failed.
 *
 * @param failureHandlingResult The {@code FailureHandlingResult} that is used for extracting
 *     the failure information.
 * @param latestExecutionLookup The look-up function for retrieving the latest {@link Execution}
 *     instance for a given {@link ExecutionVertexID}.
 * @return The {@code FailureHandlingResultSnapshot}.
 */
public static FailureHandlingResultSnapshot create(FailureHandlingResult failureHandlingResult, Function<ExecutionVertexID, Execution> latestExecutionLookup) {
    final Execution rootCauseExecution =
            failureHandlingResult
                    .getExecutionVertexIdOfFailedTask()
                    .map(latestExecutionLookup)
                    .orElse(null);

    // A missing root-cause execution is acceptable; a present one must report failure info.
    final boolean rootCauseHasFailureInfo =
            rootCauseExecution == null || rootCauseExecution.getFailureInfo().isPresent();
    // The explicit null branch is needed for the first format argument to compile.
    final String missingFailureInfoMessage =
            String.format(
                    "The execution %s didn't provide a failure info even though the corresponding ExecutionVertex %s is marked as having handled the root cause of this failure.",
                    rootCauseExecution != null ? rootCauseExecution.getAttemptId() : "(null)",
                    failureHandlingResult
                            .getExecutionVertexIdOfFailedTask()
                            .map(Objects::toString)
                            .orElse("(null)"));
    Preconditions.checkArgument(rootCauseHasFailureInfo, missingFailureInfoMessage);

    final ExecutionVertexID rootCauseVertexId =
            failureHandlingResult.getExecutionVertexIdOfFailedTask().orElse(null);

    // Every other restarted vertex whose latest execution also failed.
    final Set<Execution> concurrentlyFailedExecutions =
            failureHandlingResult.getVerticesToRestart().stream()
                    .filter(vertexId -> !vertexId.equals(rootCauseVertexId))
                    .map(latestExecutionLookup)
                    .filter(candidate -> candidate.getFailureInfo().isPresent())
                    .collect(Collectors.toSet());

    return new FailureHandlingResultSnapshot(
            rootCauseExecution,
            ErrorInfo.handleMissingThrowable(failureHandlingResult.getError()),
            failureHandlingResult.getTimestamp(),
            concurrentlyFailedExecutions);
}
Also used : ErrorInfo(org.apache.flink.runtime.executiongraph.ErrorInfo) Set(java.util.Set) ExecutionVertexID(org.apache.flink.runtime.scheduler.strategy.ExecutionVertexID) Preconditions(org.apache.flink.util.Preconditions) Function(java.util.function.Function) Collectors(java.util.stream.Collectors) VisibleForTesting(org.apache.flink.annotation.VisibleForTesting) Execution(org.apache.flink.runtime.executiongraph.Execution) FailureHandlingResult(org.apache.flink.runtime.executiongraph.failover.flip1.FailureHandlingResult) Objects(java.util.Objects) Optional(java.util.Optional) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) Collections(java.util.Collections) Nullable(javax.annotation.Nullable) Execution(org.apache.flink.runtime.executiongraph.Execution) ExecutionVertexID(org.apache.flink.runtime.scheduler.strategy.ExecutionVertexID)

Aggregations

Execution (org.apache.flink.runtime.executiongraph.Execution): 45; ExecutionVertex (org.apache.flink.runtime.executiongraph.ExecutionVertex): 26; ExecutionJobVertex (org.apache.flink.runtime.executiongraph.ExecutionJobVertex): 11; ExecutionAttemptID (org.apache.flink.runtime.executiongraph.ExecutionAttemptID): 8; JobID (org.apache.flink.api.common.JobID): 7; JobVertexID (org.apache.flink.runtime.jobgraph.JobVertexID): 7; Test (org.junit.Test): 7; AcknowledgeCheckpoint (org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint): 6; ArrayList (java.util.ArrayList): 5; IOException (java.io.IOException): 4; ExecutionGraph (org.apache.flink.runtime.executiongraph.ExecutionGraph): 4; DeclineCheckpoint (org.apache.flink.runtime.messages.checkpoint.DeclineCheckpoint): 4; HashMap (java.util.HashMap): 3; CompletableFuture (java.util.concurrent.CompletableFuture): 3; TimeoutException (java.util.concurrent.TimeoutException): 3; Time (org.apache.flink.api.common.time.Time): 3; PartitionProducerDisposedException (org.apache.flink.runtime.jobmanager.PartitionProducerDisposedException): 3; LogicalSlot (org.apache.flink.runtime.jobmaster.LogicalSlot): 3; StackTraceSampleResponse (org.apache.flink.runtime.messages.StackTraceSampleResponse): 3; Collection (java.util.Collection): 2