Search in sources :

Example 31 with Execution

use of org.apache.flink.runtime.executiongraph.Execution in project flink by apache.

the class CheckpointCoordinator method startTriggeringCheckpoint.

/**
 * Starts the asynchronous pipeline that triggers one checkpoint for the given request.
 *
 * <p>The stages, in the order they are chained below:
 *
 * <ol>
 *   <li>run {@code preCheckGlobalState} under {@code lock} and mark {@code isTriggering};
 *   <li>calculate the checkpoint plan;
 *   <li>call {@code initializeCheckpoint} on {@code executor};
 *   <li>create the {@link PendingCheckpoint} on {@code timer};
 *   <li>trigger and acknowledge the operator coordinator checkpoints;
 *   <li>snapshot the master state;
 *   <li>either call {@code triggerCheckpointRequest} or report the failure through
 *       {@code onTriggerFailure}.
 * </ol>
 *
 * <p>Any throwable raised synchronously while setting up the pipeline is reported through
 * {@code onTriggerFailure(request, throwable)}.
 *
 * @param request the trigger request; its {@code props}, {@code isPeriodic},
 *     {@code externalSavepointLocation} and on-completion future are read here
 */
private void startTriggeringCheckpoint(CheckpointTriggerRequest request) {
    try {
        // Only the global pre-check is performed while holding the lock.
        synchronized (lock) {
            preCheckGlobalState(request.isPeriodic);
        }
        // we will actually trigger this checkpoint!
        Preconditions.checkState(!isTriggering);
        isTriggering = true;
        // This timestamp is handed to both createPendingCheckpoint and triggerCheckpointRequest.
        final long timestamp = System.currentTimeMillis();
        CompletableFuture<CheckpointPlan> checkpointPlanFuture = checkpointPlanCalculator.calculateCheckpointPlan();
        // True only for the first invocation that gets here: the field is flipped right after
        // being read, so later invocations pass false to initializeCheckpoint.
        boolean initializeBaseLocations = !baseLocationsForCheckpointInitialized;
        baseLocationsForCheckpointInitialized = true;
        // initializeCheckpoint runs on 'executor', whereas the pending checkpoint is created on
        // 'timer'. Throwables from initializeCheckpoint are wrapped in a CompletionException so
        // that they can leave the lambda.
        final CompletableFuture<PendingCheckpoint> pendingCheckpointCompletableFuture = checkpointPlanFuture.thenApplyAsync(plan -> {
            try {
                CheckpointIdAndStorageLocation checkpointIdAndStorageLocation = initializeCheckpoint(request.props, request.externalSavepointLocation, initializeBaseLocations);
                return new Tuple2<>(plan, checkpointIdAndStorageLocation);
            } catch (Throwable e) {
                throw new CompletionException(e);
            }
        }, executor).thenApplyAsync((checkpointInfo) -> createPendingCheckpoint(timestamp, request.props, checkpointInfo.f0, request.isPeriodic, checkpointInfo.f1.checkpointId, checkpointInfo.f1.checkpointStorageLocation, request.getOnCompletionFuture()), timer);
        final CompletableFuture<?> coordinatorCheckpointsComplete = pendingCheckpointCompletableFuture.thenComposeAsync((pendingCheckpoint) -> OperatorCoordinatorCheckpoints.triggerAndAcknowledgeAllCoordinatorCheckpointsWithCompletion(coordinatorsToCheckpoint, pendingCheckpoint, timer), timer);
        // We have to take the snapshot of the master hooks after the coordinator checkpoints
        // have completed.
        // This is to ensure the tasks are checkpointed after the OperatorCoordinators in case
        // ExternallyInducedSource is used.
        final CompletableFuture<?> masterStatesComplete = coordinatorCheckpointsComplete.thenComposeAsync(ignored -> {
            // If the code reaches here, the pending checkpoint is guaranteed to be not null.
            // We use FutureUtils.getWithoutException() to make the compiler happy with the
            // checked exceptions in the signature.
            PendingCheckpoint checkpoint = FutureUtils.getWithoutException(pendingCheckpointCompletableFuture);
            return snapshotMasterState(checkpoint);
        }, timer);
        // Final stage: runs on 'timer' once both futures are done, whether they succeeded or
        // failed.
        FutureUtils.assertNoException(CompletableFuture.allOf(masterStatesComplete, coordinatorCheckpointsComplete).handleAsync((ignored, throwable) -> {
            // The precondition below implies that 'checkpoint' may only be null here when an
            // error occurred.
            final PendingCheckpoint checkpoint = FutureUtils.getWithoutException(pendingCheckpointCompletableFuture);
            Preconditions.checkState(checkpoint != null || throwable != null, "Either the pending checkpoint needs to be created or an error must have occurred.");
            if (throwable != null) {
                // the initialization might not be finished yet
                if (checkpoint == null) {
                    onTriggerFailure(request, throwable);
                } else {
                    onTriggerFailure(checkpoint, throwable);
                }
            } else {
                triggerCheckpointRequest(request, timestamp, checkpoint);
            }
            return null;
        }, timer).exceptionally(error -> {
            // Errors are only tolerated (and merely logged) once the coordinator is shut down;
            // otherwise they are rethrown so that the surrounding assertNoException sees them.
            if (!isShutdown()) {
                throw new CompletionException(error);
            } else if (findThrowable(error, RejectedExecutionException.class).isPresent()) {
                LOG.debug("Execution rejected during shutdown");
            } else {
                LOG.warn("Error encountered during shutdown", error);
            }
            return null;
        }));
    } catch (Throwable throwable) {
        // Covers failures thrown synchronously above, e.g. by the pre-check or the
        // isTriggering precondition.
        onTriggerFailure(request, throwable);
    }
}
Also used : SystemClock(org.apache.flink.util.clock.SystemClock) ScheduledFuture(java.util.concurrent.ScheduledFuture) Tuple2(org.apache.flink.api.java.tuple.Tuple2) PriorityQueue(java.util.PriorityQueue) BiFunction(java.util.function.BiFunction) LoggerFactory(org.slf4j.LoggerFactory) CheckpointCoordinatorConfiguration(org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration) ExceptionUtils(org.apache.flink.util.ExceptionUtils) AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) CheckpointStorage(org.apache.flink.runtime.state.CheckpointStorage) ExceptionUtils.findThrowable(org.apache.flink.util.ExceptionUtils.findThrowable) Collectors.toMap(java.util.stream.Collectors.toMap) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) Map(java.util.Map) Preconditions.checkNotNull(org.apache.flink.util.Preconditions.checkNotNull) ScheduledExecutor(org.apache.flink.util.concurrent.ScheduledExecutor) Predicate(java.util.function.Predicate) CompletedCheckpointStorageLocation(org.apache.flink.runtime.state.CompletedCheckpointStorageLocation) Collection(java.util.Collection) MasterHooks(org.apache.flink.runtime.checkpoint.hooks.MasterHooks) Set(java.util.Set) CompletionException(java.util.concurrent.CompletionException) GuardedBy(javax.annotation.concurrent.GuardedBy) Preconditions(org.apache.flink.util.Preconditions) StringUtils(org.apache.flink.util.StringUtils) Acknowledge(org.apache.flink.runtime.messages.Acknowledge) List(java.util.List) Stream(java.util.stream.Stream) Preconditions.checkArgument(org.apache.flink.util.Preconditions.checkArgument) OperatorID(org.apache.flink.runtime.jobgraph.OperatorID) Optional(java.util.Optional) PossibleInconsistentStateException(org.apache.flink.runtime.persistence.PossibleInconsistentStateException) SavepointFormatType(org.apache.flink.core.execution.SavepointFormatType) CheckpointStorageCoordinatorView(org.apache.flink.runtime.state.CheckpointStorageCoordinatorView) 
OperatorInfo(org.apache.flink.runtime.operators.coordination.OperatorInfo) HashMap(java.util.HashMap) CompletableFuture(java.util.concurrent.CompletableFuture) Clock(org.apache.flink.util.clock.Clock) ArrayList(java.util.ArrayList) Execution(org.apache.flink.runtime.executiongraph.Execution) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) HashSet(java.util.HashSet) LinkedHashMap(java.util.LinkedHashMap) OptionalLong(java.util.OptionalLong) RejectedExecutionException(java.util.concurrent.RejectedExecutionException) FutureUtils(org.apache.flink.util.concurrent.FutureUtils) ThreadLocalRandom(java.util.concurrent.ThreadLocalRandom) Nullable(javax.annotation.Nullable) ExecutionJobVertex(org.apache.flink.runtime.executiongraph.ExecutionJobVertex) Logger(org.slf4j.Logger) CheckpointStorageLocation(org.apache.flink.runtime.state.CheckpointStorageLocation) FlinkRuntimeException(org.apache.flink.util.FlinkRuntimeException) Executor(java.util.concurrent.Executor) IOException(java.io.IOException) JobStatusListener(org.apache.flink.runtime.executiongraph.JobStatusListener) DeclineCheckpoint(org.apache.flink.runtime.messages.checkpoint.DeclineCheckpoint) VisibleForTesting(org.apache.flink.annotation.VisibleForTesting) TimeUnit(java.util.concurrent.TimeUnit) ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) JobID(org.apache.flink.api.common.JobID) OperatorCoordinator(org.apache.flink.runtime.operators.coordination.OperatorCoordinator) ByteStreamStateHandle(org.apache.flink.runtime.state.memory.ByteStreamStateHandle) ArrayDeque(java.util.ArrayDeque) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) SavepointRestoreSettings(org.apache.flink.runtime.jobgraph.SavepointRestoreSettings) Collections(java.util.Collections) RejectedExecutionException(java.util.concurrent.RejectedExecutionException) CompletionException(java.util.concurrent.CompletionException) 
ExceptionUtils.findThrowable(org.apache.flink.util.ExceptionUtils.findThrowable)

Example 32 with Execution

use of org.apache.flink.runtime.executiongraph.Execution in project flink by apache.

the class CheckpointCoordinator method triggerTasks.

/**
 * Sends the checkpoint trigger message to every task listed in the checkpoint plan's
 * tasks-to-trigger.
 *
 * @param request the trigger request whose properties select the snapshot type and decide
 *     between a synchronous savepoint and a regular checkpoint trigger
 * @param timestamp the timestamp passed along with every trigger call
 * @param checkpoint the pending checkpoint providing the id, storage location and plan
 * @return the future produced by {@code FutureUtils.waitForAll} over all trigger
 *     acknowledgements
 */
private CompletableFuture<Void> triggerTasks(CheckpointTriggerRequest request, long timestamp, PendingCheckpoint checkpoint) {
    final long checkpointId = checkpoint.getCheckpointID();

    // The requested type is kept unless a full snapshot is forced for a non-savepoint.
    final boolean keepRequestedType = !this.forceFullSnapshot || request.props.isSavepoint();
    final SnapshotType type = keepRequestedType ? request.props.getCheckpointType() : CheckpointType.FULL_CHECKPOINT;

    final CheckpointOptions checkpointOptions = CheckpointOptions.forConfig(type, checkpoint.getCheckpointStorageLocation().getLocationReference(), isExactlyOnceMode, unalignedCheckpointsEnabled, alignedCheckpointTimeout);

    // Trigger each task and collect the acknowledgement futures.
    final List<CompletableFuture<Acknowledge>> pendingAcks = new ArrayList<>();
    for (Execution task : checkpoint.getCheckpointPlan().getTasksToTrigger()) {
        final CompletableFuture<Acknowledge> ack = request.props.isSynchronous()
                ? task.triggerSynchronousSavepoint(checkpointId, timestamp, checkpointOptions)
                : task.triggerCheckpoint(checkpointId, timestamp, checkpointOptions);
        pendingAcks.add(ack);
    }
    return FutureUtils.waitForAll(pendingAcks);
}
Also used : CompletableFuture(java.util.concurrent.CompletableFuture) Execution(org.apache.flink.runtime.executiongraph.Execution) ArrayList(java.util.ArrayList)

Example 33 with Execution

use of org.apache.flink.runtime.executiongraph.Execution in project flink by apache.

the class DefaultCheckpointPlan method fulfillFinishedTaskStatus.

/**
 * Validates and completes the given operator states with respect to finished tasks.
 *
 * <p>Does nothing when {@code mayHaveFinishedTasks} is false. Otherwise it collects the job
 * vertices of finished tasks that are not contained in
 * {@code fullyFinishedOrFinishedOnRestoreVertices}, runs both union-list-state checks on
 * them, and finally fills in the states for fully and partially finished operators.
 *
 * @param operatorStates the operator states, keyed by operator id, to check and complete
 */
@Override
public void fulfillFinishedTaskStatus(Map<OperatorID, OperatorState> operatorStates) {
    if (mayHaveFinishedTasks) {
        final Map<JobVertexID, ExecutionJobVertex> partiallyFinished = new HashMap<>();
        for (Execution finished : finishedTasks) {
            final JobVertexID vertexId = finished.getVertex().getJobvertexId();
            if (fullyFinishedOrFinishedOnRestoreVertices.containsKey(vertexId)) {
                continue;
            }
            partiallyFinished.put(vertexId, finished.getVertex().getJobVertex());
        }

        // Checks come first, then the two fill-in steps.
        checkNoPartlyFinishedVertexUsedUnionListState(partiallyFinished, operatorStates);
        checkNoPartlyOperatorsFinishedVertexUsedUnionListState(partiallyFinished, operatorStates);
        fulfillFullyFinishedOrFinishedOnRestoreOperatorStates(operatorStates);
        fulfillSubtaskStateForPartiallyFinishedOperators(operatorStates);
    }
}
Also used : Execution(org.apache.flink.runtime.executiongraph.Execution) ExecutionJobVertex(org.apache.flink.runtime.executiongraph.ExecutionJobVertex) IdentityHashMap(java.util.IdentityHashMap) HashMap(java.util.HashMap) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID)

Example 34 with Execution

use of org.apache.flink.runtime.executiongraph.Execution in project flink by apache.

the class DefaultCheckpointPlan method fulfillSubtaskStateForPartiallyFinishedOperators.

/**
 * Records {@code FinishedOperatorSubtaskState.INSTANCE} for the subtask of every finished
 * task, for each operator of the task's job vertex.
 *
 * <p>Operators whose state is already fully finished are skipped. An operator without an
 * entry in {@code operatorStates} gets a new {@code OperatorState} registered first.
 *
 * @param operatorStates the operator states, keyed by generated operator id, to update
 */
private void fulfillSubtaskStateForPartiallyFinishedOperators(Map<OperatorID, OperatorState> operatorStates) {
    for (Execution finishedTask : finishedTasks) {
        final ExecutionJobVertex vertex = finishedTask.getVertex().getJobVertex();
        for (OperatorIDPair idPair : vertex.getOperatorIDs()) {
            final OperatorID operatorId = idPair.getGeneratedOperatorID();
            OperatorState state = operatorStates.get(operatorId);
            if (state == null) {
                // No state registered for this operator yet: create and register one.
                state = new OperatorState(operatorId, vertex.getParallelism(), vertex.getMaxParallelism());
                operatorStates.put(operatorId, state);
            } else if (state.isFullyFinished()) {
                continue;
            }
            state.putState(finishedTask.getParallelSubtaskIndex(), FinishedOperatorSubtaskState.INSTANCE);
        }
    }
}
Also used : Execution(org.apache.flink.runtime.executiongraph.Execution) ExecutionJobVertex(org.apache.flink.runtime.executiongraph.ExecutionJobVertex) OperatorIDPair(org.apache.flink.runtime.OperatorIDPair)

Example 35 with Execution

use of org.apache.flink.runtime.executiongraph.Execution in project flink by apache.

the class SchedulerBase method archiveFromFailureHandlingResult.

/**
 * Archives the failure described by the given snapshot.
 *
 * <p>If the snapshot carries no root-cause execution, the failure is archived through
 * {@code archiveGlobalFailure}. Otherwise a {@link RootExceptionHistoryEntry} built from the
 * snapshot is appended to {@code exceptionHistory} and logged at debug level.
 *
 * @param failureHandlingResult the snapshot of the failure handling result to archive
 */
protected final void archiveFromFailureHandlingResult(FailureHandlingResultSnapshot failureHandlingResult) {
    // Without a root-cause execution this is handled as a global failure.
    if (!failureHandlingResult.getRootCauseExecution().isPresent()) {
        archiveGlobalFailure(failureHandlingResult.getRootCause(), failureHandlingResult.getTimestamp(), failureHandlingResult.getConcurrentlyFailedExecution());
        return;
    }

    final Execution failedExecution = failureHandlingResult.getRootCauseExecution().get();
    final RootExceptionHistoryEntry historyEntry = RootExceptionHistoryEntry.fromFailureHandlingResultSnapshot(failureHandlingResult);
    exceptionHistory.add(historyEntry);
    log.debug("Archive local failure causing attempt {} to fail: {}", failedExecution.getAttemptId(), historyEntry.getExceptionAsString());
}
Also used : Execution(org.apache.flink.runtime.executiongraph.Execution) RootExceptionHistoryEntry(org.apache.flink.runtime.scheduler.exceptionhistory.RootExceptionHistoryEntry)

Aggregations

Execution (org.apache.flink.runtime.executiongraph.Execution)45 ExecutionVertex (org.apache.flink.runtime.executiongraph.ExecutionVertex)26 ExecutionJobVertex (org.apache.flink.runtime.executiongraph.ExecutionJobVertex)11 ExecutionAttemptID (org.apache.flink.runtime.executiongraph.ExecutionAttemptID)8 JobID (org.apache.flink.api.common.JobID)7 JobVertexID (org.apache.flink.runtime.jobgraph.JobVertexID)7 Test (org.junit.Test)7 AcknowledgeCheckpoint (org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint)6 ArrayList (java.util.ArrayList)5 IOException (java.io.IOException)4 ExecutionGraph (org.apache.flink.runtime.executiongraph.ExecutionGraph)4 DeclineCheckpoint (org.apache.flink.runtime.messages.checkpoint.DeclineCheckpoint)4 HashMap (java.util.HashMap)3 CompletableFuture (java.util.concurrent.CompletableFuture)3 TimeoutException (java.util.concurrent.TimeoutException)3 Time (org.apache.flink.api.common.time.Time)3 PartitionProducerDisposedException (org.apache.flink.runtime.jobmanager.PartitionProducerDisposedException)3 LogicalSlot (org.apache.flink.runtime.jobmaster.LogicalSlot)3 StackTraceSampleResponse (org.apache.flink.runtime.messages.StackTraceSampleResponse)3 Collection (java.util.Collection)2