Search in sources :

Example 1 with OperatorSnapshotFutures

use of org.apache.flink.streaming.api.operators.OperatorSnapshotFutures in project flink by apache.

the class SubtaskCheckpointCoordinatorImpl method checkpointState.

/**
 * Performs the synchronous part of a checkpoint for this subtask and hands the resulting
 * snapshot futures over for asynchronous completion.
 *
 * <p>The method returns early without taking a snapshot when the checkpoint id is not newer
 * than the last one seen, or when the checkpoint has already been marked as aborted. Otherwise
 * it runs the pre-barrier hook, broadcasts the {@code CheckpointBarrier}, finishes the output
 * channel state for unaligned checkpoints, and takes the operator snapshots.
 *
 * @param metadata supplies the checkpoint id and timestamp
 * @param options checkpoint options; must not be null. May be replaced locally by
 *     {@code options.withUnalignedSupported()} when the alignment is {@code FORCED_ALIGNED}
 * @param metrics metrics builder forwarded to the snapshot, report and cleanup steps; must not
 *     be null
 * @param operatorChain chain used to broadcast events and to prepare/take the snapshot
 * @param isTaskFinished forwarded unchanged to {@code finishAndReportAsync}
 * @param isRunning forwarded unchanged to {@code takeSnapshotSync} and
 *     {@code finishAndReportAsync}
 * @throws Exception any exception raised inside the snapshot step is rethrown after
 *     {@code cleanup} has been invoked; other steps may also throw
 */
@Override
public void checkpointState(CheckpointMetaData metadata, CheckpointOptions options, CheckpointMetricsBuilder metrics, OperatorChain<?, ?> operatorChain, boolean isTaskFinished, Supplier<Boolean> isRunning) throws Exception {
    checkNotNull(options);
    checkNotNull(metrics);
    // A checkpoint id that is not strictly greater than the last recorded one is treated as
    // out of order: abort its channel state, clear any pending aborted marker, and bail out.
    if (lastCheckpointId >= metadata.getCheckpointId()) {
        LOG.info("Out of order checkpoint barrier (aborted previously?): {} >= {}", lastCheckpointId, metadata.getCheckpointId());
        channelStateWriter.abort(metadata.getCheckpointId(), new CancellationException(), true);
        checkAndClearAbortedStatus(metadata.getCheckpointId());
        return;
    }
    logCheckpointProcessingDelay(metadata);
    // Step (0): Record the last triggered checkpointId and abort the sync phase of checkpoint
    // if necessary.
    lastCheckpointId = metadata.getCheckpointId();
    if (checkAndClearAbortedStatus(metadata.getCheckpointId())) {
        // broadcast cancel checkpoint marker to avoid downstream back-pressure due to
        // checkpoint barrier align.
        operatorChain.broadcastEvent(new CancelCheckpointMarker(metadata.getCheckpointId()));
        LOG.info("Checkpoint {} has been notified as aborted, would not trigger any checkpoint.", metadata.getCheckpointId());
        return;
    }
    // NOTE(review): the start of this comment is cut off in this excerpt. Presumably: if the
    // checkpoint was forced to be aligned (FORCED_ALIGNED, e.g. because of a pointwise
    // connection), revert it here so that it can jump over output data
    if (options.getAlignment() == CheckpointOptions.AlignmentType.FORCED_ALIGNED) {
        options = options.withUnalignedSupported();
        initInputsCheckpoint(metadata.getCheckpointId(), options);
    }
    // Step (1): Prepare the checkpoint, allow operators to do some pre-barrier work.
    // The pre-barrier work should be nothing or minimal in the common case.
    operatorChain.prepareSnapshotPreBarrier(metadata.getCheckpointId());
    // Step (2): Send the checkpoint barrier downstream
    operatorChain.broadcastEvent(new CheckpointBarrier(metadata.getCheckpointId(), metadata.getTimestamp(), options), options.isUnalignedCheckpoint());
    // Step (3): Prepare to spill the in-flight buffers for input and output
    if (options.isUnalignedCheckpoint()) {
        // output data already written while broadcasting event
        channelStateWriter.finishOutput(metadata.getCheckpointId());
    }
    // Step (4): Take the state snapshot. This should be largely asynchronous, to not impact
    // progress of the streaming topology
    Map<OperatorID, OperatorSnapshotFutures> snapshotFutures = new HashMap<>(operatorChain.getNumberOfOperators());
    try {
        if (takeSnapshotSync(snapshotFutures, metadata, metrics, options, operatorChain, isRunning)) {
            // Sync part succeeded: hand the collected futures over for finishing and reporting.
            finishAndReportAsync(snapshotFutures, metadata, metrics, operatorChain.isTaskDeployedAsFinished(), isTaskFinished, isRunning);
        } else {
            // Sync part reported false: release whatever was collected so far.
            cleanup(snapshotFutures, metadata, metrics, new Exception("Checkpoint declined"));
        }
    } catch (Exception ex) {
        // Clean up the collected futures before propagating the failure to the caller.
        cleanup(snapshotFutures, metadata, metrics, ex);
        throw ex;
    }
}
Also used : CheckpointBarrier(org.apache.flink.runtime.io.network.api.CheckpointBarrier) OperatorSnapshotFutures(org.apache.flink.streaming.api.operators.OperatorSnapshotFutures) CancellationException(java.util.concurrent.CancellationException) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) CancelCheckpointMarker(org.apache.flink.runtime.io.network.api.CancelCheckpointMarker) OperatorID(org.apache.flink.runtime.jobgraph.OperatorID) CheckpointException(org.apache.flink.runtime.checkpoint.CheckpointException) FlinkRuntimeException(org.apache.flink.util.FlinkRuntimeException) CancellationException(java.util.concurrent.CancellationException) IOException(java.io.IOException) BiFunctionWithException(org.apache.flink.util.function.BiFunctionWithException)

Example 2 with OperatorSnapshotFutures

use of org.apache.flink.streaming.api.operators.OperatorSnapshotFutures in project flink by apache.

the class AsyncCheckpointRunnable method finalizedFinishedSnapshots.

/**
 * Blocks until the channel-state futures of every in-progress operator snapshot have completed
 * and then returns a result built from {@code TaskStateSnapshot.FINISHED_ON_RESTORE}.
 *
 * @return a finalize result using {@code FINISHED_ON_RESTORE} for both snapshots and a size of
 *     {@code 0L}
 * @throws Exception if waiting on any of the channel-state futures fails
 */
private SnapshotsFinalizeResult finalizedFinishedSnapshots() throws Exception {
    for (OperatorSnapshotFutures pendingSnapshot : operatorSnapshotsInProgress.values()) {
        // The channel states must be complete before we continue; otherwise the
        // alignment of barriers might not have finished yet.
        pendingSnapshot.getInputChannelStateFuture().get();
        pendingSnapshot.getResultSubpartitionStateFuture().get();
    }
    final TaskStateSnapshot finishedOnRestore = TaskStateSnapshot.FINISHED_ON_RESTORE;
    return new SnapshotsFinalizeResult(finishedOnRestore, finishedOnRestore, 0L);
}
Also used : OperatorSnapshotFutures(org.apache.flink.streaming.api.operators.OperatorSnapshotFutures) OperatorID(org.apache.flink.runtime.jobgraph.OperatorID) Map(java.util.Map)

Example 3 with OperatorSnapshotFutures

use of org.apache.flink.streaming.api.operators.OperatorSnapshotFutures in project flink by apache.

the class RegularOperatorChain method buildOperatorSnapshotFutures.

/**
 * Snapshots a single operator and then attaches the channel state write result to the
 * resulting futures.
 *
 * @param checkpointMetaData metadata passed to {@code checkpointStreamOperator}
 * @param checkpointOptions options passed to {@code checkpointStreamOperator}
 * @param op the operator to snapshot
 * @param isRunning supplier passed to {@code checkpointStreamOperator}
 * @param channelStateWriteResult channel state result passed to {@code snapshotChannelStates}
 * @param storage stream factory passed to {@code checkpointStreamOperator}
 * @return the snapshot futures produced for {@code op}
 * @throws Exception if either delegated step fails
 */
private OperatorSnapshotFutures buildOperatorSnapshotFutures(
        CheckpointMetaData checkpointMetaData,
        CheckpointOptions checkpointOptions,
        StreamOperator<?> op,
        Supplier<Boolean> isRunning,
        ChannelStateWriter.ChannelStateWriteResult channelStateWriteResult,
        CheckpointStreamFactory storage)
        throws Exception {
    final OperatorSnapshotFutures operatorFutures =
            checkpointStreamOperator(op, checkpointMetaData, checkpointOptions, storage, isRunning);
    snapshotChannelStates(op, channelStateWriteResult, operatorFutures);
    return operatorFutures;
}
Also used : OperatorSnapshotFutures(org.apache.flink.streaming.api.operators.OperatorSnapshotFutures)

Example 4 with OperatorSnapshotFutures

use of org.apache.flink.streaming.api.operators.OperatorSnapshotFutures in project flink by apache.

the class LocalStateForwardingTest method testReportingFromSnapshotToTaskStateManager.

/**
 * Checks that the jm and tm-local state carried by the futures the backends report is
 * forwarded, via the async checkpointing runnable, to the {@link
 * org.apache.flink.runtime.state.TaskStateManager}.
 */
@Test
public void testReportingFromSnapshotToTaskStateManager() throws Exception {
    // Environment and task wired to a test state manager that records what gets reported.
    TestTaskStateManager stateManager = new TestTaskStateManager();
    StreamMockEnvironment mockEnvironment =
            new StreamMockEnvironment(
                    new Configuration(),
                    new Configuration(),
                    new ExecutionConfig(),
                    1024 * 1024,
                    new MockInputSplitProvider(),
                    0,
                    stateManager);
    StreamTask task = new StreamTaskTest.NoOpStreamTask(mockEnvironment);
    CheckpointMetaData metaData = new CheckpointMetaData(0L, 0L);
    CheckpointMetricsBuilder metricsBuilder = new CheckpointMetricsBuilder();

    // One operator whose snapshot futures are populated for every kind of state.
    Map<OperatorID, OperatorSnapshotFutures> futuresByOperator = new HashMap<>(1);
    OperatorSnapshotFutures futures = new OperatorSnapshotFutures();
    futures.setKeyedStateManagedFuture(createSnapshotResult(KeyedStateHandle.class));
    futures.setKeyedStateRawFuture(createSnapshotResult(KeyedStateHandle.class));
    futures.setOperatorStateManagedFuture(createSnapshotResult(OperatorStateHandle.class));
    futures.setOperatorStateRawFuture(createSnapshotResult(OperatorStateHandle.class));
    futures.setInputChannelStateFuture(
            createSnapshotCollectionResult(InputChannelStateHandle.class));
    futures.setResultSubpartitionStateFuture(
            createSnapshotCollectionResult(ResultSubpartitionStateHandle.class));
    OperatorID opId = new OperatorID();
    futuresByOperator.put(opId, futures);

    AsyncCheckpointRunnable runnable =
            new AsyncCheckpointRunnable(
                    futuresByOperator,
                    metaData,
                    metricsBuilder,
                    0L,
                    task.getName(),
                    ignored -> {
                    },
                    task.getEnvironment(),
                    task,
                    false,
                    false,
                    () -> true);
    metricsBuilder.setAlignmentDurationNanos(0L);
    metricsBuilder.setBytesProcessedDuringAlignment(0L);
    runnable.run();

    // Fetch what was reported for the job manager and the task manager respectively.
    TaskStateSnapshot jmSnapshot = stateManager.getLastJobManagerTaskStateSnapshot();
    TaskStateSnapshot tmSnapshot = stateManager.getLastTaskManagerTaskStateSnapshot();
    OperatorSubtaskState jmState = jmSnapshot.getSubtaskStateByOperatorID(opId);
    OperatorSubtaskState tmState = tmSnapshot.getSubtaskStateByOperatorID(opId);

    // Compare each future against the corresponding jm and tm state.
    performCheck(
            futures.getKeyedStateManagedFuture(),
            jmState.getManagedKeyedState(),
            tmState.getManagedKeyedState());
    performCheck(
            futures.getKeyedStateRawFuture(),
            jmState.getRawKeyedState(),
            tmState.getRawKeyedState());
    performCheck(
            futures.getOperatorStateManagedFuture(),
            jmState.getManagedOperatorState(),
            tmState.getManagedOperatorState());
    performCheck(
            futures.getOperatorStateRawFuture(),
            jmState.getRawOperatorState(),
            tmState.getRawOperatorState());
    performCollectionCheck(
            futures.getInputChannelStateFuture(),
            jmState.getInputChannelState(),
            tmState.getInputChannelState());
    performCollectionCheck(
            futures.getResultSubpartitionStateFuture(),
            jmState.getResultSubpartitionState(),
            tmState.getResultSubpartitionState());
}
Also used : OperatorSnapshotFutures(org.apache.flink.streaming.api.operators.OperatorSnapshotFutures) Configuration(org.apache.flink.configuration.Configuration) CheckpointMetricsBuilder(org.apache.flink.runtime.checkpoint.CheckpointMetricsBuilder) HashMap(java.util.HashMap) ExecutionConfig(org.apache.flink.api.common.ExecutionConfig) OperatorID(org.apache.flink.runtime.jobgraph.OperatorID) CheckpointMetaData(org.apache.flink.runtime.checkpoint.CheckpointMetaData) KeyedStateHandle(org.apache.flink.runtime.state.KeyedStateHandle) OperatorSubtaskState(org.apache.flink.runtime.checkpoint.OperatorSubtaskState) TestTaskStateManager(org.apache.flink.runtime.state.TestTaskStateManager) TaskStateSnapshot(org.apache.flink.runtime.checkpoint.TaskStateSnapshot) ResultSubpartitionStateHandle(org.apache.flink.runtime.state.ResultSubpartitionStateHandle) OperatorStateHandle(org.apache.flink.runtime.state.OperatorStateHandle) MockInputSplitProvider(org.apache.flink.runtime.operators.testutils.MockInputSplitProvider) InputChannelStateHandle(org.apache.flink.runtime.state.InputChannelStateHandle) Test(org.junit.Test)

Example 5 with OperatorSnapshotFutures

use of org.apache.flink.streaming.api.operators.OperatorSnapshotFutures in project flink by apache.

the class AsyncCheckpointRunnableTest method testDeclineAsyncCheckpoint.

/**
 * Runs an async checkpoint runnable whose snapshot futures include one that completed
 * exceptionally with a {@link CheckpointException}, and verifies that the failure reason
 * exposed through the environment's recorded cause is the very same reason instance.
 */
@Test
public void testDeclineAsyncCheckpoint() {
    CheckpointFailureReason originalReason = CheckpointFailureReason.CHECKPOINT_DECLINED_INPUT_END_OF_STREAM;
    final Map<OperatorID, OperatorSnapshotFutures> snapshotsInProgress = new HashMap<>();
    // All futures are already done and empty except the fifth constructor argument, which
    // fails with the reason under test (presumably the input channel state future; verify
    // against the OperatorSnapshotFutures constructor).
    snapshotsInProgress.put(
            new OperatorID(),
            new OperatorSnapshotFutures(
                    DoneFuture.of(SnapshotResult.empty()),
                    DoneFuture.of(SnapshotResult.empty()),
                    DoneFuture.of(SnapshotResult.empty()),
                    DoneFuture.of(SnapshotResult.empty()),
                    ExceptionallyDoneFuture.of(new CheckpointException(originalReason)),
                    DoneFuture.of(SnapshotResult.empty())));
    final TestEnvironment environment = new TestEnvironment();
    final AsyncCheckpointRunnable runnable = createAsyncRunnable(snapshotsInProgress, environment, false, true);
    runnable.run();
    // JUnit's assertSame takes (expected, actual); keeping that order makes a failure
    // message label the two values correctly.
    Assert.assertSame(originalReason, environment.getCause().getCheckpointFailureReason());
}
Also used : OperatorSnapshotFutures(org.apache.flink.streaming.api.operators.OperatorSnapshotFutures) CheckpointFailureReason(org.apache.flink.runtime.checkpoint.CheckpointFailureReason) HashMap(java.util.HashMap) CheckpointException(org.apache.flink.runtime.checkpoint.CheckpointException) OperatorID(org.apache.flink.runtime.jobgraph.OperatorID) Test(org.junit.Test)

Aggregations

- OperatorSnapshotFutures (org.apache.flink.streaming.api.operators.OperatorSnapshotFutures): 21
- Test (org.junit.Test): 12
- CheckpointMetaData (org.apache.flink.runtime.checkpoint.CheckpointMetaData): 9
- OperatorID (org.apache.flink.runtime.jobgraph.OperatorID): 6
- CheckpointException (org.apache.flink.runtime.checkpoint.CheckpointException): 5
- MockEnvironment (org.apache.flink.runtime.operators.testutils.MockEnvironment): 5
- HashMap (java.util.HashMap): 4
- OperatorSubtaskState (org.apache.flink.runtime.checkpoint.OperatorSubtaskState): 4
- TaskStateSnapshot (org.apache.flink.runtime.checkpoint.TaskStateSnapshot): 4
- FlinkRuntimeException (org.apache.flink.util.FlinkRuntimeException): 4
- CoreMatchers.containsString (org.hamcrest.CoreMatchers.containsString): 4
- IOException (java.io.IOException): 3
- TimeoutException (java.util.concurrent.TimeoutException): 3
- CheckpointMetricsBuilder (org.apache.flink.runtime.checkpoint.CheckpointMetricsBuilder): 3
- DummyEnvironment (org.apache.flink.runtime.operators.testutils.DummyEnvironment): 3
- MockEnvironmentBuilder (org.apache.flink.runtime.operators.testutils.MockEnvironmentBuilder): 3
- AsynchronousException (org.apache.flink.runtime.taskmanager.AsynchronousException): 3
- Map (java.util.Map): 2
- ExecutionException (java.util.concurrent.ExecutionException): 2
- JobID (org.apache.flink.api.common.JobID): 2