Search in sources :

Example 36 with AcknowledgeCheckpoint

use of org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint in project flink by apache.

the class CheckpointCoordinatorRestoringTest method acknowledgeCheckpoint.

/**
 * Acknowledges {@code checkpointId} once per key-group partition of {@code jobVertex}.
 *
 * <p>The partitions are derived from the vertex's max parallelism and parallelism. For each
 * partition index a mocked subtask state is built and sent to the coordinator in an
 * {@link AcknowledgeCheckpoint} addressed with the current execution attempt of the task
 * vertex at the same index.
 *
 * @param coordinator coordinator that receives the acknowledge messages
 * @param executionGraph graph whose job ID is put into every message
 * @param jobVertex vertex whose subtasks acknowledge the checkpoint
 * @param checkpointId ID of the checkpoint being acknowledged
 * @throws Exception if the coordinator fails while handling an acknowledge message
 */
private static void acknowledgeCheckpoint(CheckpointCoordinator coordinator, ExecutionGraph executionGraph, ExecutionJobVertex jobVertex, long checkpointId) throws Exception {
    final List<KeyGroupRange> keyGroupRanges =
            StateAssignmentOperation.createKeyGroupPartitions(
                    jobVertex.getMaxParallelism(), jobVertex.getParallelism());

    int subtaskIndex = 0;
    for (KeyGroupRange keyGroupRange : keyGroupRanges) {
        TaskStateSnapshot mockedState =
                mockSubtaskState(jobVertex.getJobVertexId(), subtaskIndex, keyGroupRange);
        final AcknowledgeCheckpoint ack =
                new AcknowledgeCheckpoint(
                        executionGraph.getJobID(),
                        jobVertex.getTaskVertices()[subtaskIndex].getCurrentExecutionAttempt().getAttemptId(),
                        checkpointId,
                        new CheckpointMetrics(),
                        mockedState);
        coordinator.receiveAcknowledgeMessage(ack, TASK_MANAGER_LOCATION_INFO);
        subtaskIndex++;
    }
}
Also used : AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) KeyGroupRange(org.apache.flink.runtime.state.KeyGroupRange) AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint)

Example 37 with AcknowledgeCheckpoint

use of org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint in project flink by apache.

the class CheckpointCoordinatorRestoringTest method testJobGraphModificationsAreCheckedForSavepoint.

/**
 * Takes a savepoint with one coordinator, then restores it with a second coordinator whose
 * {@code VertexFinishedStateChecker} only records that
 * {@code validateOperatorsFinishedState()} was invoked, and asserts that the invocation
 * happened during {@code restoreSavepoint}.
 */
@Test
public void testJobGraphModificationsAreCheckedForSavepoint() throws Exception {
    final JobVertexID vertexId = new JobVertexID();
    ExecutionGraph executionGraph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .addJobVertex(vertexId, 1, 1)
                    .build();
    CheckpointCoordinator savepointCoordinator =
            new CheckpointCoordinatorBuilder()
                    .setExecutionGraph(executionGraph)
                    .setTimer(manuallyTriggeredScheduledExecutor)
                    .build();

    // Trigger a savepoint into a fresh temporary folder.
    File savepointDir = tmpFolder.newFolder();
    String targetLocation = "file://" + savepointDir.getAbsolutePath();
    CompletableFuture<CompletedCheckpoint> pendingSavepoint =
            savepointCoordinator.triggerSavepoint(targetLocation, SavepointFormatType.CANONICAL);
    manuallyTriggeredScheduledExecutor.triggerAll();

    // Acknowledge the savepoint for the single subtask so that the future completes.
    long savepointId =
            savepointCoordinator.getPendingCheckpoints().keySet().stream().findFirst().get();
    AcknowledgeCheckpoint savepointAck =
            new AcknowledgeCheckpoint(
                    executionGraph.getJobID(),
                    executionGraph.getJobVertex(vertexId).getTaskVertices()[0].getCurrentExecutionAttempt().getAttemptId(),
                    savepointId);
    savepointCoordinator.receiveAcknowledgeMessage(savepointAck, "localhost");
    assertTrue(pendingSavepoint.isDone());

    // The checker installed below does nothing except flip this flag when asked to validate.
    BooleanValue validationInvoked = new BooleanValue(false);
    CheckpointCoordinator restoringCoordinator =
            new CheckpointCoordinatorBuilder()
                    .setExecutionGraph(executionGraph)
                    .setVertexFinishedStateCheckerFactory(
                            (vertices, states) ->
                                    new VertexFinishedStateChecker(vertices, states) {

                                        @Override
                                        public void validateOperatorsFinishedState() {
                                            validationInvoked.set(true);
                                        }
                                    })
                    .build();

    SavepointRestoreSettings restoreSettings =
            SavepointRestoreSettings.forPath(pendingSavepoint.get().getExternalPointer());
    restoringCoordinator.restoreSavepoint(
            restoreSettings, executionGraph.getAllVertices(), getClass().getClassLoader());

    assertTrue("The finished states should be checked when job is restored on startup", validationInvoked.get());
}
Also used : CheckpointCoordinatorTestingUtils.generatePartitionableStateHandle(org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.generatePartitionableStateHandle) BooleanValue(org.apache.flink.types.BooleanValue) ManuallyTriggeredScheduledExecutor(org.apache.flink.util.concurrent.ManuallyTriggeredScheduledExecutor) Arrays(java.util.Arrays) KeyGroupsStateHandle(org.apache.flink.runtime.state.KeyGroupsStateHandle) ChainedStateHandle(org.apache.flink.runtime.state.ChainedStateHandle) Tuple2(org.apache.flink.api.java.tuple.Tuple2) ArgumentMatchers.eq(org.mockito.ArgumentMatchers.eq) CheckpointCoordinatorBuilder(org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder) SharedStateRegistry(org.apache.flink.runtime.state.SharedStateRegistry) Random(java.util.Random) CheckpointCoordinatorTestingUtils.mockSubtaskState(org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.mockSubtaskState) AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) Collections.singletonList(java.util.Collections.singletonList) Map(java.util.Map) TestLogger(org.apache.flink.util.TestLogger) StateHandleDummyUtil.createNewResultSubpartitionStateHandle(org.apache.flink.runtime.checkpoint.StateHandleDummyUtil.createNewResultSubpartitionStateHandle) Assert.fail(org.junit.Assert.fail) KeyGroupRange(org.apache.flink.runtime.state.KeyGroupRange) CheckpointCoordinatorConfigurationBuilder(org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration.CheckpointCoordinatorConfigurationBuilder) Collections.emptyList(java.util.Collections.emptyList) KeyedStateHandle(org.apache.flink.runtime.state.KeyedStateHandle) Collection(java.util.Collection) Set(java.util.Set) Collectors(java.util.stream.Collectors) Objects(java.util.Objects) List(java.util.List) 
StateHandleDummyUtil.createNewInputChannelStateHandle(org.apache.flink.runtime.checkpoint.StateHandleDummyUtil.createNewInputChannelStateHandle) Stream(java.util.stream.Stream) Assert.assertFalse(org.junit.Assert.assertFalse) OperatorID(org.apache.flink.runtime.jobgraph.OperatorID) ArgumentMatchers.any(org.mockito.ArgumentMatchers.any) SavepointFormatType(org.apache.flink.core.execution.SavepointFormatType) ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph) TestCompletedCheckpointStorageLocation(org.apache.flink.runtime.state.testutils.TestCompletedCheckpointStorageLocation) HashMap(java.util.HashMap) CompletableFuture(java.util.concurrent.CompletableFuture) CheckpointCoordinatorTestingUtils.generateChainedPartitionableStateHandle(org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.generateChainedPartitionableStateHandle) JobStatus(org.apache.flink.api.common.JobStatus) ArrayList(java.util.ArrayList) Execution(org.apache.flink.runtime.executiongraph.Execution) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) HashSet(java.util.HashSet) OperatorStateHandle(org.apache.flink.runtime.state.OperatorStateHandle) Iterables(org.apache.flink.shaded.guava30.com.google.common.collect.Iterables) Before(org.junit.Before) ExecutionJobVertex(org.apache.flink.runtime.executiongraph.ExecutionJobVertex) Assert.assertTrue(org.junit.Assert.assertTrue) Test(org.junit.Test) Mockito.times(org.mockito.Mockito.times) OperatorIDPair(org.apache.flink.runtime.OperatorIDPair) CheckpointCoordinatorTestingUtils.generateKeyGroupState(org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.generateKeyGroupState) File(java.io.File) Mockito.verify(org.mockito.Mockito.verify) CheckpointCoordinatorTestingUtils.comparePartitionableState(org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.comparePartitionableState) Executors(org.apache.flink.util.concurrent.Executors) Rule(org.junit.Rule) 
CommonTestUtils(org.apache.flink.runtime.testutils.CommonTestUtils) Assert(org.junit.Assert) CheckpointCoordinatorTestingUtils.compareKeyedState(org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.compareKeyedState) SavepointRestoreSettings(org.apache.flink.runtime.jobgraph.SavepointRestoreSettings) Collections(java.util.Collections) TemporaryFolder(org.junit.rules.TemporaryFolder) Assert.assertEquals(org.junit.Assert.assertEquals) CheckpointCoordinatorTestingUtils.verifyStateRestore(org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.verifyStateRestore) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) CheckpointCoordinatorBuilder(org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder) AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) BooleanValue(org.apache.flink.types.BooleanValue) ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph) File(java.io.File) Test(org.junit.Test)

Example 38 with AcknowledgeCheckpoint

use of org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint in project flink by apache.

the class CheckpointCoordinatorRestoringTest method testRestoreLatestCheckpointedStateWithoutInFlightData.

/**
 * Completes a checkpoint whose subtask states contain operator state, keyed state and
 * in-flight data (input channel / result subpartition state), then restores it with a
 * coordinator configured via {@code setCheckpointIdOfIgnoredInFlightData(1)}.
 *
 * <p>After the restore, every subtask must still carry its managed/raw operator state and
 * managed/raw keyed state, while the input channel and result subpartition state must be
 * empty.
 */
@Test
public void testRestoreLatestCheckpointedStateWithoutInFlightData() throws Exception {
    // given: Operator with not empty states.
    final JobVertexID jobVertexID = new JobVertexID();
    int parallelism1 = 3;
    int maxParallelism1 = 42;
    CompletedCheckpointStore completedCheckpointStore = new EmbeddedCompletedCheckpointStore();
    final ExecutionGraph graph = new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder().addJobVertex(jobVertexID, parallelism1, maxParallelism1).build();
    final ExecutionJobVertex jobVertex = graph.getJobVertex(jobVertexID);
    // set up the coordinator and validate the initial state
    CheckpointCoordinator coord = new CheckpointCoordinatorBuilder().setExecutionGraph(graph).setCompletedCheckpointStore(completedCheckpointStore).setCheckpointCoordinatorConfiguration(new CheckpointCoordinatorConfigurationBuilder().setCheckpointIdOfIgnoredInFlightData(1).build()).setTimer(manuallyTriggeredScheduledExecutor).build();
    // trigger the checkpoint
    coord.triggerCheckpoint(false);
    manuallyTriggeredScheduledExecutor.triggerAll();
    assertEquals(1, coord.getPendingCheckpoints().size());
    long checkpointId = Iterables.getOnlyElement(coord.getPendingCheckpoints().keySet());
    List<KeyGroupRange> keyGroupPartitions1 = StateAssignmentOperation.createKeyGroupPartitions(maxParallelism1, parallelism1);
    Random random = new Random();
    // fill the states and complete the checkpoint.
    for (int index = 0; index < jobVertex.getParallelism(); index++) {
        OperatorSubtaskState operatorSubtaskState = OperatorSubtaskState.builder().setManagedOperatorState(generatePartitionableStateHandle(jobVertexID, index, 2, 8, false)).setRawOperatorState(generatePartitionableStateHandle(jobVertexID, index, 2, 8, true)).setManagedKeyedState(generateKeyGroupState(jobVertexID, keyGroupPartitions1.get(index), false)).setRawKeyedState(generateKeyGroupState(jobVertexID, keyGroupPartitions1.get(index), true)).setInputChannelState(StateObjectCollection.singleton(createNewInputChannelStateHandle(3, random))).setResultSubpartitionState(StateObjectCollection.singleton(createNewResultSubpartitionStateHandle(3, random))).build();
        TaskStateSnapshot taskOperatorSubtaskStates = new TaskStateSnapshot();
        taskOperatorSubtaskStates.putSubtaskStateByOperatorID(OperatorID.fromJobVertexID(jobVertexID), operatorSubtaskState);
        AcknowledgeCheckpoint acknowledgeCheckpoint = new AcknowledgeCheckpoint(graph.getJobID(), jobVertex.getTaskVertices()[index].getCurrentExecutionAttempt().getAttemptId(), checkpointId, new CheckpointMetrics(), taskOperatorSubtaskStates);
        coord.receiveAcknowledgeMessage(acknowledgeCheckpoint, TASK_MANAGER_LOCATION_INFO);
    }
    assertEquals(1, coord.getSuccessfulCheckpoints().size());
    // when: Restore latest checkpoint without in-flight data.
    Set<ExecutionJobVertex> tasks = new HashSet<>();
    tasks.add(jobVertex);
    assertTrue(coord.restoreLatestCheckpointedStateToAll(tasks, false));
    // then: All states should be restored successfully except InputChannel and
    // ResultSubpartition which should be ignored.
    verifyStateRestore(jobVertexID, jobVertex, keyGroupPartitions1);
    for (int i = 0; i < jobVertex.getParallelism(); i++) {
        JobManagerTaskRestore taskRestore = jobVertex.getTaskVertices()[i].getCurrentExecutionAttempt().getTaskRestore();
        Assert.assertEquals(1L, taskRestore.getRestoreCheckpointId());
        TaskStateSnapshot stateSnapshot = taskRestore.getTaskStateSnapshot();
        OperatorSubtaskState operatorState = stateSnapshot.getSubtaskStateByOperatorID(OperatorID.fromJobVertexID(jobVertexID));
        // In-flight data must have been dropped on restore.
        assertTrue(operatorState.getInputChannelState().isEmpty());
        assertTrue(operatorState.getResultSubpartitionState().isEmpty());
        // All four regular state kinds set above must survive the restore.
        assertFalse(operatorState.getRawOperatorState().isEmpty());
        assertFalse(operatorState.getManagedOperatorState().isEmpty());
        assertFalse(operatorState.getRawKeyedState().isEmpty());
        // Previously this line re-checked the managed operator state, leaving the managed
        // keyed state unverified.
        assertFalse(operatorState.getManagedKeyedState().isEmpty());
    }
}
Also used : CheckpointCoordinatorConfigurationBuilder(org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration.CheckpointCoordinatorConfigurationBuilder) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) KeyGroupRange(org.apache.flink.runtime.state.KeyGroupRange) AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) CheckpointCoordinatorBuilder(org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder) AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) Random(java.util.Random) ExecutionJobVertex(org.apache.flink.runtime.executiongraph.ExecutionJobVertex) ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph) HashSet(java.util.HashSet) Test(org.junit.Test)

Example 39 with AcknowledgeCheckpoint

use of org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint in project flink by apache.

the class CheckpointCoordinatorTest method testStateCleanupForLateOrUnknownMessages.

/**
 * Tests that the state carried by late acknowledge checkpoint messages is properly cleaned up.
 * Furthermore, it tests that the state of acknowledge messages from unknown execution attempts
 * of the same job is cleaned up as well. In contrast, checkpointing messages from other jobs
 * should not be touched. A late acknowledge message is an acknowledge message which arrives
 * after the checkpoint has been declined.
 *
 * <p>"Cleaned up" is verified here through {@code discardState()} being invoked on the
 * (mocked) state attached to the message.
 *
 * @throws Exception if setting up or driving the checkpoint coordinator fails
 */
@Test
public void testStateCleanupForLateOrUnknownMessages() throws Exception {
    JobVertexID jobVertexID1 = new JobVertexID();
    JobVertexID jobVertexID2 = new JobVertexID();
    CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway gateway = new CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway();
    ExecutionGraph graph = new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder().addJobVertex(jobVertexID1).addJobVertex(jobVertexID2, false).setTaskManagerGateway(gateway).build();
    ExecutionVertex vertex1 = graph.getJobVertex(jobVertexID1).getTaskVertices()[0];
    ExecutionVertex vertex2 = graph.getJobVertex(jobVertexID2).getTaskVertices()[0];
    ExecutionAttemptID attemptID1 = vertex1.getCurrentExecutionAttempt().getAttemptId();
    ExecutionAttemptID attemptID2 = vertex2.getCurrentExecutionAttempt().getAttemptId();
    CheckpointCoordinatorConfiguration chkConfig = new CheckpointCoordinatorConfiguration.CheckpointCoordinatorConfigurationBuilder().setMaxConcurrentCheckpoints(1).build();
    CheckpointCoordinator checkpointCoordinator = new CheckpointCoordinatorBuilder().setExecutionGraph(graph).setCheckpointCoordinatorConfiguration(chkConfig).setTimer(manuallyTriggeredScheduledExecutor).build();
    final CompletableFuture<CompletedCheckpoint> checkpointFuture = checkpointCoordinator.triggerCheckpoint(false);
    manuallyTriggeredScheduledExecutor.triggerAll();
    FutureUtils.throwIfCompletedExceptionally(checkpointFuture);
    assertEquals(1, checkpointCoordinator.getNumberOfPendingCheckpoints());
    PendingCheckpoint pendingCheckpoint = checkpointCoordinator.getPendingCheckpoints().values().iterator().next();
    long checkpointId = pendingCheckpoint.getCheckpointId();
    OperatorID opIDtrigger = vertex1.getJobVertex().getOperatorIDs().get(0).getGeneratedOperatorID();
    // spy on a real snapshot holding a mocked operator state so that discardState() calls
    // on the contained state can be verified later
    TaskStateSnapshot taskOperatorSubtaskStatesTrigger = spy(new TaskStateSnapshot());
    OperatorSubtaskState subtaskStateTrigger = mock(OperatorSubtaskState.class);
    taskOperatorSubtaskStatesTrigger.putSubtaskStateByOperatorID(opIDtrigger, subtaskStateTrigger);
    // acknowledge the first trigger vertex
    checkpointCoordinator.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(graph.getJobID(), attemptID1, checkpointId, new CheckpointMetrics(), taskOperatorSubtaskStatesTrigger), TASK_MANAGER_LOCATION_INFO);
    // verify that the subtask state has not been discarded
    verify(subtaskStateTrigger, never()).discardState();
    TaskStateSnapshot unknownSubtaskState = mock(TaskStateSnapshot.class);
    // receive an acknowledge message for an unknown vertex
    checkpointCoordinator.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(graph.getJobID(), new ExecutionAttemptID(), checkpointId, new CheckpointMetrics(), unknownSubtaskState), TASK_MANAGER_LOCATION_INFO);
    // we should discard acknowledge messages from an unknown vertex belonging to our job
    verify(unknownSubtaskState, times(1)).discardState();
    TaskStateSnapshot differentJobSubtaskState = mock(TaskStateSnapshot.class);
    // receive an acknowledge message from an unknown job
    checkpointCoordinator.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(new JobID(), new ExecutionAttemptID(), checkpointId, new CheckpointMetrics(), differentJobSubtaskState), TASK_MANAGER_LOCATION_INFO);
    // we should not interfere with different jobs
    verify(differentJobSubtaskState, never()).discardState();
    // duplicate acknowledge message for the trigger vertex
    TaskStateSnapshot triggerSubtaskState = mock(TaskStateSnapshot.class);
    checkpointCoordinator.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(graph.getJobID(), attemptID1, checkpointId, new CheckpointMetrics(), triggerSubtaskState), TASK_MANAGER_LOCATION_INFO);
    // duplicate acknowledge messages for a known vertex should not trigger discarding the state
    verify(triggerSubtaskState, never()).discardState();
    // let the checkpoint fail at the first ack vertex
    // (reset the mock first so the times(1) verification below only counts interactions
    // that happen from the decline onwards)
    reset(subtaskStateTrigger);
    checkpointCoordinator.receiveDeclineMessage(new DeclineCheckpoint(graph.getJobID(), attemptID1, checkpointId, new CheckpointException(CHECKPOINT_DECLINED)), TASK_MANAGER_LOCATION_INFO);
    assertTrue(pendingCheckpoint.isDisposed());
    // check that we've cleaned up the already acknowledged state
    verify(subtaskStateTrigger, times(1)).discardState();
    TaskStateSnapshot ackSubtaskState = mock(TaskStateSnapshot.class);
    // late acknowledge message from the second ack vertex
    checkpointCoordinator.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(graph.getJobID(), attemptID2, checkpointId, new CheckpointMetrics(), ackSubtaskState), TASK_MANAGER_LOCATION_INFO);
    // check that we also cleaned up this state
    verify(ackSubtaskState, times(1)).discardState();
    // receive an acknowledge message from an unknown job
    // (the mock is reused from above, so reset it before verifying again)
    reset(differentJobSubtaskState);
    checkpointCoordinator.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(new JobID(), new ExecutionAttemptID(), checkpointId, new CheckpointMetrics(), differentJobSubtaskState), TASK_MANAGER_LOCATION_INFO);
    // we should not interfere with different jobs
    verify(differentJobSubtaskState, never()).discardState();
    TaskStateSnapshot unknownSubtaskState2 = mock(TaskStateSnapshot.class);
    // receive an acknowledge message for an unknown vertex
    checkpointCoordinator.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(graph.getJobID(), new ExecutionAttemptID(), checkpointId, new CheckpointMetrics(), unknownSubtaskState2), TASK_MANAGER_LOCATION_INFO);
    // we should discard acknowledge messages from an unknown vertex belonging to our job
    verify(unknownSubtaskState2, times(1)).discardState();
}
Also used : DeclineCheckpoint(org.apache.flink.runtime.messages.checkpoint.DeclineCheckpoint) ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) CheckpointCoordinatorConfiguration(org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration) OperatorID(org.apache.flink.runtime.jobgraph.OperatorID) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) CheckpointCoordinatorBuilder(org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder) AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph) JobID(org.apache.flink.api.common.JobID) Test(org.junit.Test)

Example 40 with AcknowledgeCheckpoint

use of org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint in project flink by apache.

the class CheckpointCoordinatorTest method testCompleteCheckpointFailureWithExternallyInducedSource.

/**
 * Verifies that a checkpoint ends exceptionally, and is not recorded as successful, when the
 * checkpoint storage throws while creating the metadata output stream.
 *
 * <p>The task acknowledgements for both vertices are sent from inside a master hook's
 * {@code triggerCheckpoint}, i.e. before that hook has returned its own result. The hook also
 * asserts that the operator coordinator checkpoint was already performed at that point.
 */
@Test
public void testCompleteCheckpointFailureWithExternallyInducedSource() throws Exception {
    JobVertexID jobVertexID1 = new JobVertexID();
    JobVertexID jobVertexID2 = new JobVertexID();
    ExecutionGraph graph = new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder().addJobVertex(jobVertexID1).addJobVertex(jobVertexID2).build();
    ExecutionVertex vertex1 = graph.getJobVertex(jobVertexID1).getTaskVertices()[0];
    ExecutionVertex vertex2 = graph.getJobVertex(jobVertexID2).getTaskVertices()[0];
    ExecutionAttemptID attemptID1 = vertex1.getCurrentExecutionAttempt().getAttemptId();
    ExecutionAttemptID attemptID2 = vertex2.getCurrentExecutionAttempt().getAttemptId();
    OperatorID opID1 = vertex1.getJobVertex().getOperatorIDs().get(0).getGeneratedOperatorID();
    OperatorID opID2 = vertex2.getJobVertex().getOperatorIDs().get(0).getGeneratedOperatorID();
    // One snapshot per vertex, each holding an empty operator subtask state.
    TaskStateSnapshot taskOperatorSubtaskStates1 = new TaskStateSnapshot();
    TaskStateSnapshot taskOperatorSubtaskStates2 = new TaskStateSnapshot();
    OperatorSubtaskState subtaskState1 = OperatorSubtaskState.builder().build();
    OperatorSubtaskState subtaskState2 = OperatorSubtaskState.builder().build();
    taskOperatorSubtaskStates1.putSubtaskStateByOperatorID(opID1, subtaskState1);
    taskOperatorSubtaskStates2.putSubtaskStateByOperatorID(opID2, subtaskState2);
    // Create a mock OperatorCoordinatorCheckpointContext which completes the checkpoint
    // immediately.
    AtomicBoolean coordCheckpointDone = new AtomicBoolean(false);
    OperatorCoordinatorCheckpointContext coordinatorCheckpointContext = new CheckpointCoordinatorTestingUtils.MockOperatorCheckpointCoordinatorContextBuilder().setOnCallingCheckpointCoordinator((checkpointId, result) -> {
        coordCheckpointDone.set(true);
        result.complete(new byte[0]);
    }).setOperatorID(opID1).build();
    // set up the coordinator and validate the initial state
    CheckpointCoordinator checkpointCoordinator = new CheckpointCoordinatorBuilder().setExecutionGraph(graph).setCheckpointCoordinatorConfiguration(CheckpointCoordinatorConfiguration.builder().setMaxConcurrentCheckpoints(Integer.MAX_VALUE).build()).setTimer(manuallyTriggeredScheduledExecutor).setCoordinatorsToCheckpoint(Collections.singleton(coordinatorCheckpointContext)).setCheckpointStorage(new JobManagerCheckpointStorage() {

        private static final long serialVersionUID = 8134582566514272546L;

        // Throw exception when finalizing the checkpoint.
        @Override
        public CheckpointStorageAccess createCheckpointStorage(JobID jobId) throws IOException {
            return new MemoryBackendCheckpointStorageAccess(jobId, null, null, 100) {

                @Override
                public CheckpointStorageLocation initializeLocationForCheckpoint(long checkpointId) throws IOException {
                    return new NonPersistentMetadataCheckpointStorageLocation(1000) {

                        // The injected failure: creating the metadata stream always throws.
                        @Override
                        public CheckpointMetadataOutputStream createMetadataOutputStream() throws IOException {
                            throw new IOException("Artificial Exception");
                        }
                    };
                }
            };
        }
    }).build();
    // NOTE(review): written by the master hook below but not read anywhere in this method.
    AtomicReference<Long> checkpointIdRef = new AtomicReference<>();
    // Add a master hook which triggers and acks the task checkpoint immediately.
    // In this case the task checkpoints would complete before the job master checkpoint
    // completes.
    checkpointCoordinator.addMasterHook(new MasterTriggerRestoreHook<Integer>() {

        @Override
        public String getIdentifier() {
            return "anything";
        }

        @Override
        @Nullable
        public CompletableFuture<Integer> triggerCheckpoint(long checkpointId, long timestamp, Executor executor) throws Exception {
            assertTrue("The coordinator checkpoint should have finished.", coordCheckpointDone.get());
            // Acknowledge the checkpoint in the master hooks so the task snapshots
            // complete before
            // the master state snapshot completes.
            checkpointIdRef.set(checkpointId);
            AcknowledgeCheckpoint acknowledgeCheckpoint1 = new AcknowledgeCheckpoint(graph.getJobID(), attemptID1, checkpointId, new CheckpointMetrics(), taskOperatorSubtaskStates1);
            AcknowledgeCheckpoint acknowledgeCheckpoint2 = new AcknowledgeCheckpoint(graph.getJobID(), attemptID2, checkpointId, new CheckpointMetrics(), taskOperatorSubtaskStates2);
            checkpointCoordinator.receiveAcknowledgeMessage(acknowledgeCheckpoint1, TASK_MANAGER_LOCATION_INFO);
            checkpointCoordinator.receiveAcknowledgeMessage(acknowledgeCheckpoint2, TASK_MANAGER_LOCATION_INFO);
            return null;
        }

        // Restore is not exercised by this test; intentionally a no-op.
        @Override
        public void restoreCheckpoint(long checkpointId, Integer checkpointData) throws Exception {
        }

        // Trivial serializer: writes an empty byte array and always reads back 1.
        @Override
        public SimpleVersionedSerializer<Integer> createCheckpointDataSerializer() {
            return new SimpleVersionedSerializer<Integer>() {

                @Override
                public int getVersion() {
                    return 0;
                }

                @Override
                public byte[] serialize(Integer obj) throws IOException {
                    return new byte[0];
                }

                @Override
                public Integer deserialize(int version, byte[] serialized) throws IOException {
                    return 1;
                }
            };
        }
    });
    // trigger the checkpoint. it is expected to fail (asserted below), presumably because
    // the storage configured above throws when the metadata output stream is created
    final CompletableFuture<CompletedCheckpoint> checkpointFuture = checkpointCoordinator.triggerCheckpoint(false);
    manuallyTriggeredScheduledExecutor.triggerAll();
    assertTrue(checkpointFuture.isCompletedExceptionally());
    assertTrue(checkpointCoordinator.getSuccessfulCheckpoints().isEmpty());
}
Also used : JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) OperatorID(org.apache.flink.runtime.jobgraph.OperatorID) NonPersistentMetadataCheckpointStorageLocation(org.apache.flink.runtime.state.memory.NonPersistentMetadataCheckpointStorageLocation) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) CompletableFuture(java.util.concurrent.CompletableFuture) ManuallyTriggeredScheduledExecutor(org.apache.flink.util.concurrent.ManuallyTriggeredScheduledExecutor) ScheduledExecutor(org.apache.flink.util.concurrent.ScheduledExecutor) Executor(java.util.concurrent.Executor) ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) SimpleVersionedSerializer(org.apache.flink.core.io.SimpleVersionedSerializer) MemoryBackendCheckpointStorageAccess(org.apache.flink.runtime.state.memory.MemoryBackendCheckpointStorageAccess) AtomicReference(java.util.concurrent.atomic.AtomicReference) IOException(java.io.IOException) CheckpointCoordinatorBuilder(org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder) JobManagerCheckpointStorage(org.apache.flink.runtime.state.storage.JobManagerCheckpointStorage) TriFunctionWithException(org.apache.flink.util.function.TriFunctionWithException) IOException(java.io.IOException) ExecutionException(java.util.concurrent.ExecutionException) RpcException(org.apache.flink.runtime.rpc.exceptions.RpcException) AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) DeclineCheckpoint(org.apache.flink.runtime.messages.checkpoint.DeclineCheckpoint) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph) AtomicLong(java.util.concurrent.atomic.AtomicLong) ArgumentMatchers.anyLong(org.mockito.ArgumentMatchers.anyLong) JobID(org.apache.flink.api.common.JobID) 
Nullable(javax.annotation.Nullable) Test(org.junit.Test)

Aggregations

AcknowledgeCheckpoint (org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint)45 JobVertexID (org.apache.flink.runtime.jobgraph.JobVertexID)35 Test (org.junit.Test)33 ExecutionVertex (org.apache.flink.runtime.executiongraph.ExecutionVertex)32 ExecutionAttemptID (org.apache.flink.runtime.executiongraph.ExecutionAttemptID)29 ExecutionGraph (org.apache.flink.runtime.executiongraph.ExecutionGraph)29 CheckpointCoordinatorBuilder (org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder)23 DeclineCheckpoint (org.apache.flink.runtime.messages.checkpoint.DeclineCheckpoint)18 JobID (org.apache.flink.api.common.JobID)15 OperatorID (org.apache.flink.runtime.jobgraph.OperatorID)14 KeyGroupRange (org.apache.flink.runtime.state.KeyGroupRange)13 ExecutionJobVertex (org.apache.flink.runtime.executiongraph.ExecutionJobVertex)12 HashMap (java.util.HashMap)9 IOException (java.io.IOException)8 ArrayList (java.util.ArrayList)8 StreamStateHandle (org.apache.flink.runtime.state.StreamStateHandle)8 KeyGroupsStateHandle (org.apache.flink.runtime.state.KeyGroupsStateHandle)7 ByteStreamStateHandle (org.apache.flink.runtime.state.memory.ByteStreamStateHandle)7 ManuallyTriggeredScheduledExecutor (org.apache.flink.util.concurrent.ManuallyTriggeredScheduledExecutor)7 HashSet (java.util.HashSet)6