use of org.apache.flink.runtime.executiongraph.Execution in project flink by apache.
the class CheckpointCoordinator method sendAbortedMessages.
/**
 * Notifies the given tasks and the checkpointed operator coordinators that a checkpoint was
 * aborted.
 *
 * <p>The task notifications are handed to {@code executor}; the coordinator notifications are
 * issued directly by the calling thread. The caller is expected to hold {@code lock} (asserted).
 *
 * @param tasksToAbort vertices whose current execution attempt, if any, is notified
 * @param checkpointId ID of the aborted checkpoint
 * @param timeStamp timestamp forwarded with each task notification
 */
private void sendAbortedMessages(List<ExecutionVertex> tasksToAbort, long checkpointId, long timeStamp) {
    assert (Thread.holdsLock(lock));

    // Read on the calling thread, before the work is handed to the executor.
    final long lastCompletedCheckpointId = completedCheckpointStore.getLatestCheckpointId();

    // Task notifications are submitted to the executor rather than sent inline.
    executor.execute(() -> {
        for (ExecutionVertex vertex : tasksToAbort) {
            final Execution attempt = vertex.getCurrentExecutionAttempt();
            if (attempt == null) {
                // No current attempt on this vertex, so there is nothing to notify.
                continue;
            }
            attempt.notifyCheckpointAborted(checkpointId, lastCompletedCheckpointId, timeStamp);
        }
    });

    // Inform the operator coordinators about the abort on the calling thread.
    for (OperatorCoordinatorCheckpointContext context : coordinatorsToCheckpoint) {
        context.notifyCheckpointAborted(checkpointId);
    }
}
use of org.apache.flink.runtime.executiongraph.Execution in project flink by apache.
the class DefaultOperatorCoordinatorHandler method deliverOperatorEventToCoordinator.
@Override
public void deliverOperatorEventToCoordinator(final ExecutionAttemptID taskExecutionId, final OperatorID operatorId, final OperatorEvent evt) throws FlinkException {
    // Failure semantics:
    //  - Unknown task, task that is neither RUNNING nor INITIALIZING, or unknown coordinator:
    //    the call is answered with an exception.
    //  - Task and coordinator both exist: the call is treated as valid, and any exception
    //    raised while handling the event is escalated as a global failure.
    final Execution execution = executionGraph.getRegisteredExecutions().get(taskExecutionId);

    final boolean taskCanDeliverEvents =
            execution != null
                    && (execution.getState() == ExecutionState.RUNNING
                            || execution.getState() == ExecutionState.INITIALIZING);
    if (!taskCanDeliverEvents) {
        // To be safe, tell the sender that the event could not be delivered.
        throw new TaskNotRunningException("Task is not known or in state running on the JobManager.");
    }

    final OperatorCoordinatorHolder holder = coordinatorMap.get(operatorId);
    if (holder == null) {
        throw new FlinkException("No coordinator registered for operator " + operatorId);
    }

    try {
        holder.handleEventFromOperator(execution.getParallelSubtaskIndex(), evt);
    } catch (Throwable failure) {
        // Fatal errors and OOMs are rethrown; everything else becomes a global failure.
        ExceptionUtils.rethrowIfFatalErrorOrOOM(failure);
        globalFailureHandler.handleGlobalFailure(failure);
    }
}
use of org.apache.flink.runtime.executiongraph.Execution in project flink by apache.
the class DefaultCheckpointPlanCalculator method calculateAfterTasksFinished.
/**
 * Computes the checkpoint plan for the case that some tasks have already finished. The job
 * vertices are visited in the order of {@code jobVerticesInTopologyOrder}; among the tasks that
 * are still running, those without any running precedent task are selected for triggering.
 *
 * @return The plan of this checkpoint.
 */
private CheckpointPlan calculateAfterTasksFinished() {
    // Snapshot the running status of all tasks into one BitSet per job vertex up front, so
    // that decisions can be made at the JobVertex level without repeatedly consulting the
    // individual Execution objects.
    Map<JobVertexID, BitSet> runningStatusByVertex = collectTaskRunningStatus();

    List<Execution> toTrigger = new ArrayList<>();
    List<Execution> toWaitFor = new ArrayList<>();
    List<ExecutionVertex> toCommitTo = new ArrayList<>();
    List<Execution> finished = new ArrayList<>();
    List<ExecutionJobVertex> fullyFinishedVertices = new ArrayList<>();

    for (ExecutionJobVertex jobVertex : jobVerticesInTopologyOrder) {
        BitSet runningSubtasks = runningStatusByVertex.get(jobVertex.getJobVertexId());

        // No running subtask at all: the whole job vertex counts as finished.
        if (runningSubtasks.isEmpty()) {
            fullyFinishedVertices.add(jobVertex);
            for (ExecutionVertex finishedTask : jobVertex.getTaskVertices()) {
                finished.add(finishedTask.getCurrentExecutionAttempt());
            }
            continue;
        }

        List<JobEdge> inputEdges = jobVertex.getJobVertex().getInputs();

        // Optimization: decide once per JobVertex whether any of its tasks can qualify for
        // the trigger set, before doing the per-task check.
        boolean triggerCandidatesPossible = someTasksMustBeTriggered(runningStatusByVertex, inputEdges);

        for (ExecutionVertex task : jobVertex.getTaskVertices()) {
            if (!runningSubtasks.get(task.getParallelSubtaskIndex())) {
                finished.add(task.getCurrentExecutionAttempt());
                continue;
            }

            // Running tasks are always waited for and committed to.
            toWaitFor.add(task.getCurrentExecutionAttempt());
            toCommitTo.add(task);

            // They are triggered only when none of their precedent tasks is still running.
            if (triggerCandidatesPossible
                    && !hasRunningPrecedentTasks(task, inputEdges, runningStatusByVertex)) {
                toTrigger.add(task.getCurrentExecutionAttempt());
            }
        }
    }

    return new DefaultCheckpointPlan(
            Collections.unmodifiableList(toTrigger),
            Collections.unmodifiableList(toWaitFor),
            Collections.unmodifiableList(toCommitTo),
            Collections.unmodifiableList(finished),
            Collections.unmodifiableList(fullyFinishedVertices),
            allowCheckpointsAfterTasksFinished);
}
use of org.apache.flink.runtime.executiongraph.Execution in project flink by apache.
the class RootExceptionHistoryEntry method fromFailureHandlingResultSnapshot.
/**
 * Builds a {@code RootExceptionHistoryEntry} from the given {@link
 * FailureHandlingResultSnapshot}.
 *
 * <p>If the snapshot carries a root-cause execution, the failing task name and the task manager
 * location are taken from it; otherwise both are passed on as {@code null}.
 *
 * @param snapshot The snapshot describing the failure.
 * @return The {@code RootExceptionHistoryEntry} instance.
 * @throws NullPointerException presumably if the snapshot's root cause is {@code null}; the
 *     check lives in {@code createRootExceptionHistoryEntry}, which is not shown here.
 * @throws IllegalArgumentException presumably if the snapshot's timestamp is not bigger than
 *     {@code 0}; the check lives in {@code createRootExceptionHistoryEntry}, which is not shown
 *     here.
 */
public static RootExceptionHistoryEntry fromFailureHandlingResultSnapshot(FailureHandlingResultSnapshot snapshot) {
    // Both values stay null when the snapshot has no root-cause execution.
    final String failingTaskName =
            snapshot.getRootCauseExecution()
                    .map(Execution::getVertexWithAttempt)
                    .orElse(null);
    final TaskManagerLocation taskManagerLocation =
            snapshot.getRootCauseExecution()
                    .map(Execution::getAssignedResourceLocation)
                    .orElse(null);

    return createRootExceptionHistoryEntry(
            snapshot.getRootCause(),
            snapshot.getTimestamp(),
            failingTaskName,
            taskManagerLocation,
            snapshot.getConcurrentlyFailedExecution());
}
use of org.apache.flink.runtime.executiongraph.Execution in project flink by apache.
the class FailureHandlingResultSnapshot method create.
/**
 * Builds a {@code FailureHandlingResultSnapshot} from the given {@link FailureHandlingResult},
 * resolving the involved {@link ExecutionVertexID}s to {@link Execution}s.
 *
 * <p>The root-cause execution is resolved from the failed task's vertex ID, if there is one. The
 * concurrently failed executions are those of the vertices to restart, excluding the root-cause
 * vertex, whose resolved execution carries failure info.
 *
 * @param failureHandlingResult The {@code FailureHandlingResult} the failure information is
 *     extracted from.
 * @param latestExecutionLookup The look-up function returning the latest {@link Execution} for
 *     a given {@link ExecutionVertexID}.
 * @return The {@code FailureHandlingResultSnapshot}.
 */
public static FailureHandlingResultSnapshot create(FailureHandlingResult failureHandlingResult, Function<ExecutionVertexID, Execution> latestExecutionLookup) {
    final Execution rootCauseExecution =
            failureHandlingResult
                    .getExecutionVertexIdOfFailedTask()
                    .map(latestExecutionLookup)
                    .orElse(null);

    // A resolved root-cause execution must carry failure info; no root cause at all is fine.
    final boolean rootCauseIsConsistent =
            rootCauseExecution == null || rootCauseExecution.getFailureInfo().isPresent();

    // The message is formatted eagerly, so the attempt ID needs its own null guard.
    final Object attemptIdForMessage =
            rootCauseExecution != null ? rootCauseExecution.getAttemptId() : "(null)";
    final String vertexIdForMessage =
            failureHandlingResult
                    .getExecutionVertexIdOfFailedTask()
                    .map(Objects::toString)
                    .orElse("(null)");
    final String inconsistencyMessage =
            String.format(
                    "The execution %s didn't provide a failure info even though the corresponding ExecutionVertex %s is marked as having handled the root cause of this failure.",
                    attemptIdForMessage,
                    vertexIdForMessage);
    Preconditions.checkArgument(rootCauseIsConsistent, inconsistencyMessage);

    final ExecutionVertexID rootCauseExecutionVertexId =
            failureHandlingResult.getExecutionVertexIdOfFailedTask().orElse(null);

    // Every vertex to restart, other than the root cause, whose execution has failure info.
    final Set<Execution> concurrentlyFailedExecutions =
            failureHandlingResult.getVerticesToRestart().stream()
                    .filter(vertexId -> !vertexId.equals(rootCauseExecutionVertexId))
                    .map(latestExecutionLookup)
                    .filter(candidate -> candidate.getFailureInfo().isPresent())
                    .collect(Collectors.toSet());

    return new FailureHandlingResultSnapshot(
            rootCauseExecution,
            ErrorInfo.handleMissingThrowable(failureHandlingResult.getError()),
            failureHandlingResult.getTimestamp(),
            concurrentlyFailedExecutions);
}
Aggregations