Search in sources :

Example 31 with CheckpointMetaData

use of org.apache.flink.runtime.checkpoint.CheckpointMetaData in project flink by apache.

the class Task method triggerCheckpointBarrier.

// ------------------------------------------------------------------------
// Notifications on the invokable
// ------------------------------------------------------------------------
/**
 * Asks the invokable to start a checkpoint, declining the checkpoint when that is not possible.
 *
 * <p>If the task is not in state {@code RUNNING}, the checkpoint is declined immediately with
 * {@code CHECKPOINT_DECLINED_TASK_NOT_READY}. Otherwise the trigger is handed to the invokable
 * asynchronously; a failed or unsuccessful trigger result is reported as a decline with
 * {@code TASK_FAILURE}.
 *
 * @param checkpointID The ID identifying the checkpoint.
 * @param checkpointTimestamp The timestamp associated with the checkpoint.
 * @param checkpointOptions Options for performing this checkpoint.
 */
public void triggerCheckpointBarrier(final long checkpointID, final long checkpointTimestamp, final CheckpointOptions checkpointOptions) {
    final TaskInvokable invokable = this.invokable;
    // the third argument records the time at which this trigger request was received
    final CheckpointMetaData checkpointMetaData = new CheckpointMetaData(checkpointID, checkpointTimestamp, System.currentTimeMillis());

    if (executionState != ExecutionState.RUNNING) {
        LOG.debug("Declining checkpoint request for non-running task {} ({}).", taskNameWithSubtask, executionId);
        // tell the requester that this task did not perform the checkpoint
        declineCheckpoint(checkpointID, CheckpointFailureReason.CHECKPOINT_DECLINED_TASK_NOT_READY);
        return;
    }

    checkState(invokable instanceof CheckpointableTask, "invokable is not checkpointable");
    final CheckpointableTask checkpointableTask = (CheckpointableTask) invokable;
    try {
        checkpointableTask
                .triggerCheckpointAsync(checkpointMetaData, checkpointOptions)
                .handle((result, failure) -> {
                    // success requires both a normal completion and a positive result
                    final boolean triggered = failure == null && result;
                    if (!triggered) {
                        declineCheckpoint(checkpointID, CheckpointFailureReason.TASK_FAILURE, failure);
                    }
                    return triggered;
                });
    } catch (RejectedExecutionException ex) {
        // This may happen if the mailbox is closed. It means that the task is shutting
        // down, so the checkpoint is declined instead of failing the task.
        LOG.debug("Triggering checkpoint {} for {} ({}) was rejected by the mailbox", checkpointID, taskNameWithSubtask, executionId);
        declineCheckpoint(checkpointID, CheckpointFailureReason.CHECKPOINT_DECLINED_TASK_CLOSING);
    } catch (Throwable t) {
        if (getExecutionState() != ExecutionState.RUNNING) {
            // the task left RUNNING in the meantime; the error is only logged
            LOG.debug("Encountered error while triggering checkpoint {} for " + "{} ({}) while being not in state running.", checkpointID, taskNameWithSubtask, executionId, t);
        } else {
            failExternally(new Exception("Error while triggering checkpoint " + checkpointID + " for " + taskNameWithSubtask, t));
        }
    }
}
Also used : TaskInvokable(org.apache.flink.runtime.jobgraph.tasks.TaskInvokable) CheckpointMetaData(org.apache.flink.runtime.checkpoint.CheckpointMetaData) CheckpointableTask(org.apache.flink.runtime.jobgraph.tasks.CheckpointableTask) RejectedExecutionException(java.util.concurrent.RejectedExecutionException) TaskNotRunningException(org.apache.flink.runtime.operators.coordination.TaskNotRunningException) WrappingRuntimeException(org.apache.flink.util.WrappingRuntimeException) CheckpointException(org.apache.flink.runtime.checkpoint.CheckpointException) CancelTaskException(org.apache.flink.runtime.execution.CancelTaskException) InvocationTargetException(java.lang.reflect.InvocationTargetException) FlinkException(org.apache.flink.util.FlinkException) RunnableWithException(org.apache.flink.util.function.RunnableWithException) RejectedExecutionException(java.util.concurrent.RejectedExecutionException) FlinkRuntimeException(org.apache.flink.util.FlinkRuntimeException) IOException(java.io.IOException)

Example 32 with CheckpointMetaData

use of org.apache.flink.runtime.checkpoint.CheckpointMetaData in project flink by apache.

the class MultipleInputStreamTask method triggerStopWithSavepointAsync.

/**
 * Stops all chained source inputs from within the mailbox and, once every one of them has
 * stopped, triggers the checkpoint on the sources.
 *
 * @param checkpointMetaData meta data of the savepoint; its checkpoint id is registered as the
 *     synchronous savepoint before the sources are stopped
 * @param checkpointOptions options of the savepoint; the checkpoint type is cast to
 *     {@code SavepointType} to decide between {@code DRAIN} and {@code NO_DRAIN}
 * @return the future produced by {@code triggerSourcesCheckpointAsync}, chained after the
 *     completion of all source stop futures
 */
private CompletableFuture<Boolean> triggerStopWithSavepointAsync(CheckpointMetaData checkpointMetaData, CheckpointOptions checkpointOptions) {
    final CompletableFuture<Void> allSourcesStopped = new CompletableFuture<>();

    final SavepointType savepointType = (SavepointType) checkpointOptions.getCheckpointType();
    final StopMode stopMode;
    if (savepointType.shouldDrain()) {
        stopMode = StopMode.DRAIN;
    } else {
        stopMode = StopMode.NO_DRAIN;
    }

    mainMailboxExecutor.execute(
            () -> {
                setSynchronousSavepoint(checkpointMetaData.getCheckpointId());
                // complete allSourcesStopped as soon as every chained source finished stopping
                FutureUtils.forward(
                        FutureUtils.waitForAll(
                                operatorChain.getSourceTaskInputs().stream()
                                        .map(sourceInput -> sourceInput.getOperator().stop(stopMode))
                                        .collect(Collectors.toList())),
                        allSourcesStopped);
            },
            "stop chained Flip-27 source for stop-with-savepoint --drain");

    return allSourcesStopped.thenCompose(unused -> triggerSourcesCheckpointAsync(checkpointMetaData, checkpointOptions));
}
Also used : StreamConfig(org.apache.flink.streaming.api.graph.StreamConfig) CheckpointMetricsBuilder(org.apache.flink.runtime.checkpoint.CheckpointMetricsBuilder) CheckpointMetaData(org.apache.flink.runtime.checkpoint.CheckpointMetaData) CheckpointedInputGate(org.apache.flink.streaming.runtime.io.checkpointing.CheckpointedInputGate) SavepointType(org.apache.flink.runtime.checkpoint.SavepointType) Watermark(org.apache.flink.streaming.api.watermark.Watermark) HashMap(java.util.HashMap) CompletableFuture(java.util.concurrent.CompletableFuture) Function(java.util.function.Function) ArrayList(java.util.ArrayList) CheckpointException(org.apache.flink.runtime.checkpoint.CheckpointException) StreamPartitioner(org.apache.flink.streaming.runtime.partitioner.StreamPartitioner) StreamRecord(org.apache.flink.streaming.runtime.streamrecord.StreamRecord) FutureUtils(org.apache.flink.util.concurrent.FutureUtils) Output(org.apache.flink.streaming.api.operators.Output) StreamTaskSourceInput(org.apache.flink.streaming.runtime.io.StreamTaskSourceInput) InputConfig(org.apache.flink.streaming.api.graph.StreamConfig.InputConfig) InputChannelInfo(org.apache.flink.runtime.checkpoint.channel.InputChannelInfo) Nullable(javax.annotation.Nullable) StreamEdge(org.apache.flink.streaming.api.graph.StreamEdge) StreamMultipleInputProcessorFactory(org.apache.flink.streaming.runtime.io.StreamMultipleInputProcessorFactory) IOException(java.io.IOException) CheckpointOptions(org.apache.flink.runtime.checkpoint.CheckpointOptions) CheckpointBarrierHandler(org.apache.flink.streaming.runtime.io.checkpointing.CheckpointBarrierHandler) InputProcessorUtil(org.apache.flink.streaming.runtime.io.checkpointing.InputProcessorUtil) Collectors(java.util.stream.Collectors) StopMode(org.apache.flink.runtime.io.network.api.StopMode) MetricNames(org.apache.flink.runtime.metrics.MetricNames) MultipleInputStreamOperator(org.apache.flink.streaming.api.operators.MultipleInputStreamOperator) List(java.util.List) 
SnapshotType(org.apache.flink.runtime.checkpoint.SnapshotType) CheckpointBarrier(org.apache.flink.runtime.io.network.api.CheckpointBarrier) MinWatermarkGauge(org.apache.flink.streaming.runtime.metrics.MinWatermarkGauge) Optional(java.util.Optional) Internal(org.apache.flink.annotation.Internal) IndexedInputGate(org.apache.flink.runtime.io.network.partition.consumer.IndexedInputGate) Environment(org.apache.flink.runtime.execution.Environment) WatermarkGauge(org.apache.flink.streaming.runtime.metrics.WatermarkGauge) CompletableFuture(java.util.concurrent.CompletableFuture) StopMode(org.apache.flink.runtime.io.network.api.StopMode)

Example 33 with CheckpointMetaData

use of org.apache.flink.runtime.checkpoint.CheckpointMetaData in project flink by apache.

the class SourceStreamTask method init.

/**
 * Initializes the task: installs a checkpoint trigger hook on the user function when it is an
 * {@link ExternallyInducedSource}, and registers the checkpoint start delay gauge.
 *
 * <p>The installed hook blocks until the triggered checkpoint future completes. Checked failures
 * are surfaced as {@link FlinkException}; if the waiting thread is interrupted, its interrupt
 * flag is restored before the exception is thrown.
 */
@Override
protected void init() {
    // we check if the source is actually inducing the checkpoints, rather
    // than the trigger
    SourceFunction<?> source = mainOperator.getUserFunction();
    if (source instanceof ExternallyInducedSource) {
        externallyInducedCheckpoints = true;
        ExternallyInducedSource.CheckpointTrigger triggerHook = new ExternallyInducedSource.CheckpointTrigger() {

            @Override
            public void triggerCheckpoint(long checkpointId) throws FlinkException {
                // TODO - we need to see how to derive those. We should probably not encode
                // TODO -   this in the source's trigger message, but do a handshake in this
                // TODO -   task between the trigger message from the master, and the source's
                // TODO -   trigger notification
                final CheckpointOptions checkpointOptions = CheckpointOptions.forConfig(CheckpointType.CHECKPOINT, CheckpointStorageLocationReference.getDefault(), configuration.isExactlyOnceCheckpointMode(), configuration.isUnalignedCheckpointsEnabled(), configuration.getAlignedCheckpointTimeout().toMillis());
                // the same instant is used for both timestamps of the meta data
                final long timestamp = System.currentTimeMillis();
                final CheckpointMetaData checkpointMetaData = new CheckpointMetaData(checkpointId, timestamp, timestamp);
                try {
                    // block the caller until the checkpoint trigger has completed
                    SourceStreamTask.super.triggerCheckpointAsync(checkpointMetaData, checkpointOptions).get();
                } catch (RuntimeException e) {
                    throw e;
                } catch (InterruptedException e) {
                    // keep the interruption visible to the calling thread instead of
                    // silently consuming it while converting to FlinkException
                    Thread.currentThread().interrupt();
                    throw new FlinkException(e.getMessage(), e);
                } catch (Exception e) {
                    throw new FlinkException(e.getMessage(), e);
                }
            }
        };
        ((ExternallyInducedSource<?, ?>) source).setCheckpointTrigger(triggerHook);
    }
    getEnvironment().getMetricGroup().getIOMetricGroup().gauge(MetricNames.CHECKPOINT_START_DELAY_TIME, this::getAsyncCheckpointStartDelayNanos);
}
Also used : ExternallyInducedSource(org.apache.flink.streaming.api.checkpoint.ExternallyInducedSource) CheckpointOptions(org.apache.flink.runtime.checkpoint.CheckpointOptions) CheckpointMetaData(org.apache.flink.runtime.checkpoint.CheckpointMetaData) FlinkException(org.apache.flink.util.FlinkException) CancelTaskException(org.apache.flink.runtime.execution.CancelTaskException) ExecutionException(java.util.concurrent.ExecutionException) FlinkException(org.apache.flink.util.FlinkException)

Example 34 with CheckpointMetaData

use of org.apache.flink.runtime.checkpoint.CheckpointMetaData in project flink by apache.

the class StreamTaskTest method testAsyncCheckpointingConcurrentCloseAfterAcknowledge.

/**
 * FLINK-5667
 *
 * <p>Tests that a concurrent cancel operation does not discard the state handles of an
 * acknowledged checkpoint. The situation can only happen if the cancel call is executed after
 * Environment.acknowledgeCheckpoint() and before the CloseableRegistry.unregisterClosable()
 * call.
 *
 * <p>The test stubs the checkpoint responder so that the acknowledge call blocks on a latch,
 * cancels the task while the acknowledge call is still blocked, and then releases the latch.
 */
@Test
public void testAsyncCheckpointingConcurrentCloseAfterAcknowledge() throws Exception {
    // triggered by the stubbed responder once the acknowledge call has been entered
    final OneShotLatch acknowledgeCheckpointLatch = new OneShotLatch();
    // triggered by the test thread to let the blocked acknowledge call return
    final OneShotLatch completeAcknowledge = new OneShotLatch();
    CheckpointResponder checkpointResponder = mock(CheckpointResponder.class);
    doAnswer(new Answer() {

        @Override
        public Object answer(InvocationOnMock invocation) {
            acknowledgeCheckpointLatch.trigger();
            // block here so that we can issue the concurrent cancel call
            while (true) {
                try {
                    // wait until we successfully await (no pun intended)
                    completeAcknowledge.await();
                    // when await() returns normally, we break out of the loop
                    break;
                } catch (InterruptedException e) {
                    // Deliberately ignored: survive interruptions that arise from thread pool
                    // shutdown and keep waiting. Production code cannot actually throw
                    // InterruptedException from checkpoint acknowledgement.
                }
            }
            return null;
        }
    }).when(checkpointResponder).acknowledgeCheckpoint(any(JobID.class), any(ExecutionAttemptID.class), anyLong(), any(CheckpointMetrics.class), any(TaskStateSnapshot.class));
    TaskStateManager taskStateManager = new TaskStateManagerImpl(new JobID(1L, 2L), new ExecutionAttemptID(), mock(TaskLocalStateStoreImpl.class), new InMemoryStateChangelogStorage(), null, checkpointResponder);
    // mocked state handles, so that discardState() invocations can be verified below
    KeyedStateHandle managedKeyedStateHandle = mock(KeyedStateHandle.class);
    KeyedStateHandle rawKeyedStateHandle = mock(KeyedStateHandle.class);
    OperatorStateHandle managedOperatorStateHandle = mock(OperatorStreamStateHandle.class);
    OperatorStateHandle rawOperatorStateHandle = mock(OperatorStreamStateHandle.class);
    // snapshot futures that are already completed with the mocked handles
    OperatorSnapshotFutures operatorSnapshotResult = new OperatorSnapshotFutures(DoneFuture.of(SnapshotResult.of(managedKeyedStateHandle)), DoneFuture.of(SnapshotResult.of(rawKeyedStateHandle)), DoneFuture.of(SnapshotResult.of(managedOperatorStateHandle)), DoneFuture.of(SnapshotResult.of(rawOperatorStateHandle)), DoneFuture.of(SnapshotResult.empty()), DoneFuture.of(SnapshotResult.empty()));
    try (MockEnvironment mockEnvironment = new MockEnvironmentBuilder().setTaskName("mock-task").setTaskStateManager(taskStateManager).build()) {
        RunningTask<MockStreamTask> task = runTask(() -> createMockStreamTask(mockEnvironment, operatorChain(streamOperatorWithSnapshot(operatorSnapshotResult))));
        MockStreamTask streamTask = task.streamTask;
        waitTaskIsRunning(streamTask, task.invocationFuture);
        final long checkpointId = 42L;
        streamTask.triggerCheckpointAsync(new CheckpointMetaData(checkpointId, 1L), CheckpointOptions.forCheckpointWithDefaultLocation());
        // wait until the stubbed acknowledge call has been entered (and is now blocked)
        acknowledgeCheckpointLatch.await();
        ArgumentCaptor<TaskStateSnapshot> subtaskStateCaptor = ArgumentCaptor.forClass(TaskStateSnapshot.class);
        // check that the checkpoint has been completed
        verify(checkpointResponder).acknowledgeCheckpoint(any(JobID.class), any(ExecutionAttemptID.class), eq(checkpointId), any(CheckpointMetrics.class), subtaskStateCaptor.capture());
        TaskStateSnapshot subtaskStates = subtaskStateCaptor.getValue();
        // only the first subtask state mapping of the captured snapshot is inspected
        OperatorSubtaskState subtaskState = subtaskStates.getSubtaskStateMappings().iterator().next().getValue();
        // check that the subtask state contains the expected state handles
        assertEquals(singleton(managedKeyedStateHandle), subtaskState.getManagedKeyedState());
        assertEquals(singleton(rawKeyedStateHandle), subtaskState.getRawKeyedState());
        assertEquals(singleton(managedOperatorStateHandle), subtaskState.getManagedOperatorState());
        assertEquals(singleton(rawOperatorStateHandle), subtaskState.getRawOperatorState());
        // check that the state handles have not been discarded
        verify(managedKeyedStateHandle, never()).discardState();
        verify(rawKeyedStateHandle, never()).discardState();
        verify(managedOperatorStateHandle, never()).discardState();
        verify(rawOperatorStateHandle, never()).discardState();
        // cancel while the acknowledge call is still blocked, then let it return
        streamTask.cancel();
        completeAcknowledge.trigger();
        // canceling the stream task after it has acknowledged the checkpoint should not discard
        // the state handles
        verify(managedKeyedStateHandle, never()).discardState();
        verify(rawKeyedStateHandle, never()).discardState();
        verify(managedOperatorStateHandle, never()).discardState();
        verify(rawOperatorStateHandle, never()).discardState();
        task.waitForTaskCompletion(true);
    }
}
Also used : OperatorSnapshotFutures(org.apache.flink.streaming.api.operators.OperatorSnapshotFutures) TaskStateManagerImpl(org.apache.flink.runtime.state.TaskStateManagerImpl) MockEnvironmentBuilder(org.apache.flink.runtime.operators.testutils.MockEnvironmentBuilder) CheckpointMetrics(org.apache.flink.runtime.checkpoint.CheckpointMetrics) KeyedStateHandle(org.apache.flink.runtime.state.KeyedStateHandle) OperatorSubtaskState(org.apache.flink.runtime.checkpoint.OperatorSubtaskState) TaskStateSnapshot(org.apache.flink.runtime.checkpoint.TaskStateSnapshot) InMemoryStateChangelogStorage(org.apache.flink.runtime.state.changelog.inmemory.InMemoryStateChangelogStorage) MockEnvironment(org.apache.flink.runtime.operators.testutils.MockEnvironment) TaskLocalStateStoreImpl(org.apache.flink.runtime.state.TaskLocalStateStoreImpl) OneShotLatch(org.apache.flink.core.testutils.OneShotLatch) ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) CheckpointResponder(org.apache.flink.runtime.taskmanager.CheckpointResponder) TaskStateManager(org.apache.flink.runtime.state.TaskStateManager) CheckpointMetaData(org.apache.flink.runtime.checkpoint.CheckpointMetaData) Mockito.doAnswer(org.mockito.Mockito.doAnswer) Answer(org.mockito.stubbing.Answer) InvocationOnMock(org.mockito.invocation.InvocationOnMock) OperatorStateHandle(org.apache.flink.runtime.state.OperatorStateHandle) JobID(org.apache.flink.api.common.JobID) Test(org.junit.Test)

Example 35 with CheckpointMetaData

use of org.apache.flink.runtime.checkpoint.CheckpointMetaData in project flink by apache.

the class StreamTaskTest method testUncaughtExceptionInAsynchronousCheckpointingOperation.

/**
 * Verifies that an exception escaping the asynchronous part of a checkpoint reaches the
 * uncaught exception handler of the task. See <a
 * href="https://issues.apache.org/jira/browse/FLINK-12889">FLINK-12889</a>.
 */
@Test
public void testUncaughtExceptionInAsynchronousCheckpointingOperation() throws Exception {
    final RuntimeException expectedFailure = new RuntimeException("Test exception");
    final FailingDummyEnvironment environment = new FailingDummyEnvironment(expectedFailure);
    final TestingUncaughtExceptionHandler exceptionHandler = new TestingUncaughtExceptionHandler();

    // snapshot futures whose first future is already completed exceptionally
    final OperatorSnapshotFutures snapshotFutures = new OperatorSnapshotFutures(ExceptionallyDoneFuture.of(expectedFailure), DoneFuture.of(SnapshotResult.empty()), DoneFuture.of(SnapshotResult.empty()), DoneFuture.of(SnapshotResult.empty()), DoneFuture.of(SnapshotResult.empty()), DoneFuture.of(SnapshotResult.empty()));

    final RunningTask<MockStreamTask> runningTask = runTask(() -> new MockStreamTask(environment, operatorChain(streamOperatorWithSnapshot(snapshotFutures)), exceptionHandler));
    final MockStreamTask mockTask = runningTask.streamTask;
    waitTaskIsRunning(mockTask, runningTask.invocationFuture);

    final CheckpointMetaData metaData = new CheckpointMetaData(42L, 1L);
    mockTask.triggerCheckpointAsync(metaData, CheckpointOptions.forCheckpointWithDefaultLocation());

    // the handler must receive exactly the exception the snapshot future failed with
    final Throwable reported = exceptionHandler.waitForUncaughtException();
    assertThat(reported, is(expectedFailure));

    mockTask.finishInput();
    runningTask.waitForTaskCompletion(false);
}
Also used : OperatorSnapshotFutures(org.apache.flink.streaming.api.operators.OperatorSnapshotFutures) FlinkRuntimeException(org.apache.flink.util.FlinkRuntimeException) TestingUncaughtExceptionHandler(org.apache.flink.util.concurrent.TestingUncaughtExceptionHandler) CheckpointMetaData(org.apache.flink.runtime.checkpoint.CheckpointMetaData) Test(org.junit.Test)

Aggregations

CheckpointMetaData (org.apache.flink.runtime.checkpoint.CheckpointMetaData)47 Test (org.junit.Test)33 CheckpointMetricsBuilder (org.apache.flink.runtime.checkpoint.CheckpointMetricsBuilder)16 CheckpointOptions (org.apache.flink.runtime.checkpoint.CheckpointOptions)15 TaskStateSnapshot (org.apache.flink.runtime.checkpoint.TaskStateSnapshot)13 IOException (java.io.IOException)12 CheckpointMetrics (org.apache.flink.runtime.checkpoint.CheckpointMetrics)12 MockEnvironment (org.apache.flink.runtime.operators.testutils.MockEnvironment)11 StreamConfig (org.apache.flink.streaming.api.graph.StreamConfig)11 OperatorSnapshotFutures (org.apache.flink.streaming.api.operators.OperatorSnapshotFutures)11 JobID (org.apache.flink.api.common.JobID)10 CheckpointException (org.apache.flink.runtime.checkpoint.CheckpointException)10 ExecutionAttemptID (org.apache.flink.runtime.executiongraph.ExecutionAttemptID)10 ExecutionException (java.util.concurrent.ExecutionException)9 OperatorID (org.apache.flink.runtime.jobgraph.OperatorID)9 CancelTaskException (org.apache.flink.runtime.execution.CancelTaskException)8 TestTaskStateManager (org.apache.flink.runtime.state.TestTaskStateManager)8 CheckpointResponder (org.apache.flink.runtime.taskmanager.CheckpointResponder)7 FlinkRuntimeException (org.apache.flink.util.FlinkRuntimeException)7 CompletableFuture (java.util.concurrent.CompletableFuture)6