Search in sources :

Example 41 with CheckpointCoordinatorBuilder

use of org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder in project flink by apache.

the class CheckpointCoordinatorTest method testSavepointScheduledInUnalignedMode.

/**
 * Sends a batch of checkpoint requests followed by a savepoint request to a coordinator that
 * has unaligned checkpoints enabled and allows a single concurrent checkpoint. After checkpoint
 * 1 is declined, the only pending checkpoint must be the savepoint, and it must not be a forced
 * checkpoint.
 */
@Test
public void testSavepointScheduledInUnalignedMode() throws Exception {
    final int maxConcurrentCheckpoints = 1;
    final int checkpointRequestsToSend = 10;
    ExecutionGraph graph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .addJobVertex(new JobVertexID())
                    .build();
    CheckpointCoordinator coordinator =
            new CheckpointCoordinatorBuilder()
                    .setCheckpointCoordinatorConfiguration(
                            CheckpointCoordinatorConfiguration.builder()
                                    .setUnalignedCheckpointsEnabled(true)
                                    .setMaxConcurrentCheckpoints(maxConcurrentCheckpoints)
                                    .build())
                    .setExecutionGraph(graph)
                    .setTimer(manuallyTriggeredScheduledExecutor)
                    .build();
    try {
        List<Future<?>> checkpointFutures = new ArrayList<>(checkpointRequestsToSend);
        coordinator.startCheckpointScheduler();

        // Tracks how many requests (checkpoints and savepoints) are outstanding.
        int outstandingRequests = 0;
        for (; outstandingRequests < checkpointRequestsToSend; outstandingRequests++) {
            checkpointFutures.add(coordinator.triggerCheckpoint(true));
        }
        int expectedQueued = outstandingRequests - maxConcurrentCheckpoints;
        assertEquals(expectedQueued, coordinator.getNumQueuedRequests());

        Future<?> savepointFuture =
                coordinator.triggerSavepoint("/tmp", SavepointFormatType.CANONICAL);
        manuallyTriggeredScheduledExecutor.triggerAll();
        // the savepoint request is now outstanding as well
        outstandingRequests += 1;
        expectedQueued = outstandingRequests - maxConcurrentCheckpoints;
        assertEquals(expectedQueued, coordinator.getNumQueuedRequests());

        DeclineCheckpoint declineMessage =
                new DeclineCheckpoint(
                        graph.getJobID(),
                        new ExecutionAttemptID(),
                        1L,
                        new CheckpointException(CHECKPOINT_DECLINED));
        coordinator.receiveDeclineMessage(declineMessage, "none");
        manuallyTriggeredScheduledExecutor.triggerAll();

        // savepoint triggered
        outstandingRequests -= 1;
        expectedQueued = outstandingRequests - maxConcurrentCheckpoints;
        assertEquals(expectedQueued, coordinator.getNumQueuedRequests());

        long completedCheckpointFutures = 0L;
        for (Future<?> checkpointFuture : checkpointFutures) {
            if (checkpointFuture.isDone()) {
                completedCheckpointFutures++;
            }
        }
        assertEquals(1, completedCheckpointFutures);
        assertFalse(savepointFuture.isDone());
        assertEquals(maxConcurrentCheckpoints, coordinator.getNumberOfPendingCheckpoints());

        PendingCheckpoint onlyPending =
                coordinator.getPendingCheckpoints().values().iterator().next();
        CheckpointProperties props = onlyPending.getProps();
        assertTrue(props.isSavepoint());
        assertFalse(props.forceCheckpoint());
    } finally {
        coordinator.shutdown();
    }
}
Also used : DeclineCheckpoint(org.apache.flink.runtime.messages.checkpoint.DeclineCheckpoint) ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) ArrayList(java.util.ArrayList) AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) DeclineCheckpoint(org.apache.flink.runtime.messages.checkpoint.DeclineCheckpoint) CheckpointCoordinatorBuilder(org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder) ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph) Future(java.util.concurrent.Future) ScheduledFuture(java.util.concurrent.ScheduledFuture) CompletableFuture(java.util.concurrent.CompletableFuture) Test(org.junit.Test)

Example 42 with CheckpointCoordinatorBuilder

use of org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder in project flink by apache.

the class CheckpointCoordinatorTest method testStateCleanupForLateOrUnknownMessages.

/**
 * Tests that late acknowledge checkpoint messages are properly cleaned up. Furthermore it tests
 * that unknown checkpoint messages for the same job are cleaned up as well. In contrast,
 * checkpointing messages from other jobs should not be touched. A late acknowledge message is
 * an acknowledge message which arrives after the checkpoint has been declined.
 *
 * <p>The scenarios "unknown vertex of our job" and "unknown job" are exercised twice: once
 * while the checkpoint is still pending and once after it has been declined.
 *
 * @throws Exception if triggering the checkpoint or delivering a message to the coordinator
 *     fails
 */
@Test
public void testStateCleanupForLateOrUnknownMessages() throws Exception {
    JobVertexID jobVertexID1 = new JobVertexID();
    JobVertexID jobVertexID2 = new JobVertexID();
    CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway gateway = new CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway();
    ExecutionGraph graph = new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder().addJobVertex(jobVertexID1).addJobVertex(jobVertexID2, false).setTaskManagerGateway(gateway).build();
    ExecutionVertex vertex1 = graph.getJobVertex(jobVertexID1).getTaskVertices()[0];
    ExecutionVertex vertex2 = graph.getJobVertex(jobVertexID2).getTaskVertices()[0];
    ExecutionAttemptID attemptID1 = vertex1.getCurrentExecutionAttempt().getAttemptId();
    ExecutionAttemptID attemptID2 = vertex2.getCurrentExecutionAttempt().getAttemptId();
    CheckpointCoordinatorConfiguration chkConfig = new CheckpointCoordinatorConfiguration.CheckpointCoordinatorConfigurationBuilder().setMaxConcurrentCheckpoints(1).build();
    CheckpointCoordinator checkpointCoordinator = new CheckpointCoordinatorBuilder().setExecutionGraph(graph).setCheckpointCoordinatorConfiguration(chkConfig).setTimer(manuallyTriggeredScheduledExecutor).build();
    // trigger a single checkpoint and make sure it is pending
    final CompletableFuture<CompletedCheckpoint> checkpointFuture = checkpointCoordinator.triggerCheckpoint(false);
    manuallyTriggeredScheduledExecutor.triggerAll();
    FutureUtils.throwIfCompletedExceptionally(checkpointFuture);
    assertEquals(1, checkpointCoordinator.getNumberOfPendingCheckpoints());
    PendingCheckpoint pendingCheckpoint = checkpointCoordinator.getPendingCheckpoints().values().iterator().next();
    long checkpointId = pendingCheckpoint.getCheckpointId();
    OperatorID opIDtrigger = vertex1.getJobVertex().getOperatorIDs().get(0).getGeneratedOperatorID();
    TaskStateSnapshot taskOperatorSubtaskStatesTrigger = spy(new TaskStateSnapshot());
    OperatorSubtaskState subtaskStateTrigger = mock(OperatorSubtaskState.class);
    taskOperatorSubtaskStatesTrigger.putSubtaskStateByOperatorID(opIDtrigger, subtaskStateTrigger);
    // acknowledge the first trigger vertex
    checkpointCoordinator.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(graph.getJobID(), attemptID1, checkpointId, new CheckpointMetrics(), taskOperatorSubtaskStatesTrigger), TASK_MANAGER_LOCATION_INFO);
    // verify that the subtask state has not been discarded
    verify(subtaskStateTrigger, never()).discardState();
    TaskStateSnapshot unknownSubtaskState = mock(TaskStateSnapshot.class);
    // receive an acknowledge message for an unknown vertex (fresh attempt id, same job id)
    checkpointCoordinator.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(graph.getJobID(), new ExecutionAttemptID(), checkpointId, new CheckpointMetrics(), unknownSubtaskState), TASK_MANAGER_LOCATION_INFO);
    // we should discard acknowledge messages from an unknown vertex belonging to our job
    verify(unknownSubtaskState, times(1)).discardState();
    TaskStateSnapshot differentJobSubtaskState = mock(TaskStateSnapshot.class);
    // receive an acknowledge message from an unknown job (fresh job id)
    checkpointCoordinator.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(new JobID(), new ExecutionAttemptID(), checkpointId, new CheckpointMetrics(), differentJobSubtaskState), TASK_MANAGER_LOCATION_INFO);
    // we should not interfere with different jobs
    verify(differentJobSubtaskState, never()).discardState();
    // duplicate acknowledge message for the trigger vertex
    TaskStateSnapshot triggerSubtaskState = mock(TaskStateSnapshot.class);
    checkpointCoordinator.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(graph.getJobID(), attemptID1, checkpointId, new CheckpointMetrics(), triggerSubtaskState), TASK_MANAGER_LOCATION_INFO);
    // duplicate acknowledge messages for a known vertex should not trigger discarding the state
    verify(triggerSubtaskState, never()).discardState();
    // let the checkpoint fail at the first ack vertex
    // (the mock is reset first so that the discard verification below only sees calls made
    // from here on)
    reset(subtaskStateTrigger);
    checkpointCoordinator.receiveDeclineMessage(new DeclineCheckpoint(graph.getJobID(), attemptID1, checkpointId, new CheckpointException(CHECKPOINT_DECLINED)), TASK_MANAGER_LOCATION_INFO);
    assertTrue(pendingCheckpoint.isDisposed());
    // check that we've cleaned up the already acknowledged state
    verify(subtaskStateTrigger, times(1)).discardState();
    TaskStateSnapshot ackSubtaskState = mock(TaskStateSnapshot.class);
    // late acknowledge message from the second ack vertex
    checkpointCoordinator.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(graph.getJobID(), attemptID2, checkpointId, new CheckpointMetrics(), ackSubtaskState), TASK_MANAGER_LOCATION_INFO);
    // check that we also cleaned up this state
    verify(ackSubtaskState, times(1)).discardState();
    // receive an acknowledge message from an unknown job, now that the checkpoint is disposed
    reset(differentJobSubtaskState);
    checkpointCoordinator.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(new JobID(), new ExecutionAttemptID(), checkpointId, new CheckpointMetrics(), differentJobSubtaskState), TASK_MANAGER_LOCATION_INFO);
    // we should not interfere with different jobs
    verify(differentJobSubtaskState, never()).discardState();
    TaskStateSnapshot unknownSubtaskState2 = mock(TaskStateSnapshot.class);
    // receive an acknowledge message for an unknown vertex, now that the checkpoint is disposed
    checkpointCoordinator.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(graph.getJobID(), new ExecutionAttemptID(), checkpointId, new CheckpointMetrics(), unknownSubtaskState2), TASK_MANAGER_LOCATION_INFO);
    // we should discard acknowledge messages from an unknown vertex belonging to our job
    verify(unknownSubtaskState2, times(1)).discardState();
}
Also used : DeclineCheckpoint(org.apache.flink.runtime.messages.checkpoint.DeclineCheckpoint) ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) CheckpointCoordinatorConfiguration(org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration) OperatorID(org.apache.flink.runtime.jobgraph.OperatorID) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) CheckpointCoordinatorBuilder(org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder) AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph) JobID(org.apache.flink.api.common.JobID) Test(org.junit.Test)

Example 43 with CheckpointCoordinatorBuilder

use of org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder in project flink by apache.

the class CheckpointCoordinatorTest method testCompleteCheckpointFailureWithExternallyInducedSource.

/**
 * Verifies that a checkpoint completes exceptionally, and that no successful checkpoint is
 * recorded, when both tasks acknowledge from within a master hook but the checkpoint storage
 * throws an {@link IOException} while creating the metadata output stream.
 */
@Test
public void testCompleteCheckpointFailureWithExternallyInducedSource() throws Exception {
    JobVertexID jobVertexID1 = new JobVertexID();
    JobVertexID jobVertexID2 = new JobVertexID();
    ExecutionGraph graph = new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder().addJobVertex(jobVertexID1).addJobVertex(jobVertexID2).build();
    ExecutionVertex vertex1 = graph.getJobVertex(jobVertexID1).getTaskVertices()[0];
    ExecutionVertex vertex2 = graph.getJobVertex(jobVertexID2).getTaskVertices()[0];
    ExecutionAttemptID attemptID1 = vertex1.getCurrentExecutionAttempt().getAttemptId();
    ExecutionAttemptID attemptID2 = vertex2.getCurrentExecutionAttempt().getAttemptId();
    OperatorID opID1 = vertex1.getJobVertex().getOperatorIDs().get(0).getGeneratedOperatorID();
    OperatorID opID2 = vertex2.getJobVertex().getOperatorIDs().get(0).getGeneratedOperatorID();
    TaskStateSnapshot taskOperatorSubtaskStates1 = new TaskStateSnapshot();
    TaskStateSnapshot taskOperatorSubtaskStates2 = new TaskStateSnapshot();
    OperatorSubtaskState subtaskState1 = OperatorSubtaskState.builder().build();
    OperatorSubtaskState subtaskState2 = OperatorSubtaskState.builder().build();
    taskOperatorSubtaskStates1.putSubtaskStateByOperatorID(opID1, subtaskState1);
    taskOperatorSubtaskStates2.putSubtaskStateByOperatorID(opID2, subtaskState2);
    // Create a mock OperatorCoordinatorCheckpointContext which completes the checkpoint
    // immediately and records that it has been called.
    AtomicBoolean coordCheckpointDone = new AtomicBoolean(false);
    OperatorCoordinatorCheckpointContext coordinatorCheckpointContext = new CheckpointCoordinatorTestingUtils.MockOperatorCheckpointCoordinatorContextBuilder().setOnCallingCheckpointCoordinator((checkpointId, result) -> {
        coordCheckpointDone.set(true);
        result.complete(new byte[0]);
    }).setOperatorID(opID1).build();
    // set up the coordinator with a checkpoint storage whose metadata output stream cannot
    // be created
    CheckpointCoordinator checkpointCoordinator = new CheckpointCoordinatorBuilder().setExecutionGraph(graph).setCheckpointCoordinatorConfiguration(CheckpointCoordinatorConfiguration.builder().setMaxConcurrentCheckpoints(Integer.MAX_VALUE).build()).setTimer(manuallyTriggeredScheduledExecutor).setCoordinatorsToCheckpoint(Collections.singleton(coordinatorCheckpointContext)).setCheckpointStorage(new JobManagerCheckpointStorage() {

        private static final long serialVersionUID = 8134582566514272546L;

        // Throw exception when finalizing the checkpoint.
        @Override
        public CheckpointStorageAccess createCheckpointStorage(JobID jobId) throws IOException {
            return new MemoryBackendCheckpointStorageAccess(jobId, null, null, 100) {

                @Override
                public CheckpointStorageLocation initializeLocationForCheckpoint(long checkpointId) throws IOException {
                    return new NonPersistentMetadataCheckpointStorageLocation(1000) {

                        @Override
                        public CheckpointMetadataOutputStream createMetadataOutputStream() throws IOException {
                            throw new IOException("Artificial Exception");
                        }
                    };
                }
            };
        }
    }).build();
    // NOTE(review): this reference is written by the master hook below but never read in this
    // test.
    AtomicReference<Long> checkpointIdRef = new AtomicReference<>();
    // Add a master hook which triggers and acks the task checkpoint immediately.
    // In this case the task checkpoints would complete before the job master checkpoint
    // completes.
    checkpointCoordinator.addMasterHook(new MasterTriggerRestoreHook<Integer>() {

        @Override
        public String getIdentifier() {
            return "anything";
        }

        @Override
        @Nullable
        public CompletableFuture<Integer> triggerCheckpoint(long checkpointId, long timestamp, Executor executor) throws Exception {
            assertTrue("The coordinator checkpoint should have finished.", coordCheckpointDone.get());
            // Acknowledge the checkpoint in the master hooks so the task snapshots
            // complete before the master state snapshot completes.
            checkpointIdRef.set(checkpointId);
            AcknowledgeCheckpoint acknowledgeCheckpoint1 = new AcknowledgeCheckpoint(graph.getJobID(), attemptID1, checkpointId, new CheckpointMetrics(), taskOperatorSubtaskStates1);
            AcknowledgeCheckpoint acknowledgeCheckpoint2 = new AcknowledgeCheckpoint(graph.getJobID(), attemptID2, checkpointId, new CheckpointMetrics(), taskOperatorSubtaskStates2);
            checkpointCoordinator.receiveAcknowledgeMessage(acknowledgeCheckpoint1, TASK_MANAGER_LOCATION_INFO);
            checkpointCoordinator.receiveAcknowledgeMessage(acknowledgeCheckpoint2, TASK_MANAGER_LOCATION_INFO);
            // this hook contributes no master state of its own
            return null;
        }

        @Override
        public void restoreCheckpoint(long checkpointId, Integer checkpointData) throws Exception {
            // intentionally empty: this test never restores
        }

        @Override
        public SimpleVersionedSerializer<Integer> createCheckpointDataSerializer() {
            // trivial serializer with fixed return values
            return new SimpleVersionedSerializer<Integer>() {

                @Override
                public int getVersion() {
                    return 0;
                }

                @Override
                public byte[] serialize(Integer obj) throws IOException {
                    return new byte[0];
                }

                @Override
                public Integer deserialize(int version, byte[] serialized) throws IOException {
                    return 1;
                }
            };
        }
    });
    // trigger the checkpoint. It is expected to fail because the storage above throws when the
    // metadata output stream is created, so nothing must be registered as successful.
    final CompletableFuture<CompletedCheckpoint> checkpointFuture = checkpointCoordinator.triggerCheckpoint(false);
    manuallyTriggeredScheduledExecutor.triggerAll();
    assertTrue(checkpointFuture.isCompletedExceptionally());
    assertTrue(checkpointCoordinator.getSuccessfulCheckpoints().isEmpty());
}
Also used : JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) OperatorID(org.apache.flink.runtime.jobgraph.OperatorID) NonPersistentMetadataCheckpointStorageLocation(org.apache.flink.runtime.state.memory.NonPersistentMetadataCheckpointStorageLocation) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) CompletableFuture(java.util.concurrent.CompletableFuture) ManuallyTriggeredScheduledExecutor(org.apache.flink.util.concurrent.ManuallyTriggeredScheduledExecutor) ScheduledExecutor(org.apache.flink.util.concurrent.ScheduledExecutor) Executor(java.util.concurrent.Executor) ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) SimpleVersionedSerializer(org.apache.flink.core.io.SimpleVersionedSerializer) MemoryBackendCheckpointStorageAccess(org.apache.flink.runtime.state.memory.MemoryBackendCheckpointStorageAccess) AtomicReference(java.util.concurrent.atomic.AtomicReference) IOException(java.io.IOException) CheckpointCoordinatorBuilder(org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder) JobManagerCheckpointStorage(org.apache.flink.runtime.state.storage.JobManagerCheckpointStorage) TriFunctionWithException(org.apache.flink.util.function.TriFunctionWithException) IOException(java.io.IOException) ExecutionException(java.util.concurrent.ExecutionException) RpcException(org.apache.flink.runtime.rpc.exceptions.RpcException) AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) DeclineCheckpoint(org.apache.flink.runtime.messages.checkpoint.DeclineCheckpoint) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph) AtomicLong(java.util.concurrent.atomic.AtomicLong) ArgumentMatchers.anyLong(org.mockito.ArgumentMatchers.anyLong) JobID(org.apache.flink.api.common.JobID) 
Nullable(javax.annotation.Nullable) Test(org.junit.Test)

Example 44 with CheckpointCoordinatorBuilder

use of org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder in project flink by apache.

the class CheckpointCoordinatorTest method testSuccessfulCheckpointSubsumesUnsuccessful.

/**
 * Triggers two concurrent checkpoints, partially acknowledges the first and fully acknowledges
 * the second, and verifies that the second becomes the only retained checkpoint while the
 * first is disposed and its acknowledged state is discarded. Also checks that a late ack for
 * the first checkpoint has its state discarded, and that the state of the second checkpoint is
 * discarded only after the coordinator and the completed checkpoint store are shut down.
 */
@Test
public void testSuccessfulCheckpointSubsumesUnsuccessful() throws Exception {
    JobVertexID jobVertexID1 = new JobVertexID();
    JobVertexID jobVertexID2 = new JobVertexID();
    JobVertexID jobVertexID3 = new JobVertexID();
    CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway gateway = new CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway();
    ExecutionGraph graph = new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder().addJobVertex(jobVertexID1).addJobVertex(jobVertexID2).addJobVertex(jobVertexID3, false).setTaskManagerGateway(gateway).build();
    ExecutionVertex vertex1 = graph.getJobVertex(jobVertexID1).getTaskVertices()[0];
    ExecutionVertex vertex2 = graph.getJobVertex(jobVertexID2).getTaskVertices()[0];
    ExecutionVertex vertex3 = graph.getJobVertex(jobVertexID3).getTaskVertices()[0];
    ExecutionAttemptID attemptID1 = vertex1.getCurrentExecutionAttempt().getAttemptId();
    ExecutionAttemptID attemptID2 = vertex2.getCurrentExecutionAttempt().getAttemptId();
    ExecutionAttemptID attemptID3 = vertex3.getCurrentExecutionAttempt().getAttemptId();
    // set up the coordinator and validate the initial state
    final StandaloneCompletedCheckpointStore completedCheckpointStore = new StandaloneCompletedCheckpointStore(10);
    CheckpointCoordinator checkpointCoordinator = new CheckpointCoordinatorBuilder().setExecutionGraph(graph).setCheckpointCoordinatorConfiguration(CheckpointCoordinatorConfiguration.builder().setMaxConcurrentCheckpoints(Integer.MAX_VALUE).build()).setCompletedCheckpointStore(completedCheckpointStore).setTimer(manuallyTriggeredScheduledExecutor).build();
    assertEquals(0, checkpointCoordinator.getNumberOfPendingCheckpoints());
    assertEquals(0, checkpointCoordinator.getNumberOfRetainedSuccessfulCheckpoints());
    // trigger the first checkpoint. Triggering should succeed and leave it pending
    final CompletableFuture<CompletedCheckpoint> checkpointFuture1 = checkpointCoordinator.triggerCheckpoint(false);
    manuallyTriggeredScheduledExecutor.triggerAll();
    FutureUtils.throwIfCompletedExceptionally(checkpointFuture1);
    assertEquals(1, checkpointCoordinator.getNumberOfPendingCheckpoints());
    assertEquals(0, checkpointCoordinator.getNumberOfRetainedSuccessfulCheckpoints());
    PendingCheckpoint pending1 = checkpointCoordinator.getPendingCheckpoints().values().iterator().next();
    long checkpointId1 = pending1.getCheckpointId();
    // trigger messages should have been sent to vertex1 and vertex2
    for (ExecutionVertex vertex : Arrays.asList(vertex1, vertex2)) {
        ExecutionAttemptID attemptId = vertex.getCurrentExecutionAttempt().getAttemptId();
        assertEquals(checkpointId1, gateway.getOnlyTriggeredCheckpoint(attemptId).checkpointId);
    }
    OperatorID opID1 = vertex1.getJobVertex().getOperatorIDs().get(0).getGeneratedOperatorID();
    OperatorID opID2 = vertex2.getJobVertex().getOperatorIDs().get(0).getGeneratedOperatorID();
    OperatorID opID3 = vertex3.getJobVertex().getOperatorIDs().get(0).getGeneratedOperatorID();
    // state reported for the first checkpoint; naming is <checkpoint number><vertex number>
    TaskStateSnapshot taskOperatorSubtaskStates11 = spy(new TaskStateSnapshot());
    TaskStateSnapshot taskOperatorSubtaskStates12 = spy(new TaskStateSnapshot());
    TaskStateSnapshot taskOperatorSubtaskStates13 = spy(new TaskStateSnapshot());
    OperatorSubtaskState subtaskState11 = mock(OperatorSubtaskState.class);
    OperatorSubtaskState subtaskState12 = mock(OperatorSubtaskState.class);
    OperatorSubtaskState subtaskState13 = mock(OperatorSubtaskState.class);
    taskOperatorSubtaskStates11.putSubtaskStateByOperatorID(opID1, subtaskState11);
    taskOperatorSubtaskStates12.putSubtaskStateByOperatorID(opID2, subtaskState12);
    taskOperatorSubtaskStates13.putSubtaskStateByOperatorID(opID3, subtaskState13);
    // acknowledge one of the three tasks (vertex2) for the first checkpoint
    checkpointCoordinator.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(graph.getJobID(), attemptID2, checkpointId1, new CheckpointMetrics(), taskOperatorSubtaskStates12), TASK_MANAGER_LOCATION_INFO);
    // start the second checkpoint
    gateway.resetCount();
    final CompletableFuture<CompletedCheckpoint> checkpointFuture2 = checkpointCoordinator.triggerCheckpoint(false);
    manuallyTriggeredScheduledExecutor.triggerAll();
    FutureUtils.throwIfCompletedExceptionally(checkpointFuture2);
    assertEquals(2, checkpointCoordinator.getNumberOfPendingCheckpoints());
    assertEquals(0, checkpointCoordinator.getNumberOfRetainedSuccessfulCheckpoints());
    // of the two pending checkpoints, pick the one that is not pending1
    PendingCheckpoint pending2;
    {
        Iterator<PendingCheckpoint> all = checkpointCoordinator.getPendingCheckpoints().values().iterator();
        PendingCheckpoint cc1 = all.next();
        PendingCheckpoint cc2 = all.next();
        pending2 = pending1 == cc1 ? cc2 : cc1;
    }
    long checkpointId2 = pending2.getCheckpointId();
    // state reported for the second checkpoint
    TaskStateSnapshot taskOperatorSubtaskStates21 = spy(new TaskStateSnapshot());
    TaskStateSnapshot taskOperatorSubtaskStates22 = spy(new TaskStateSnapshot());
    TaskStateSnapshot taskOperatorSubtaskStates23 = spy(new TaskStateSnapshot());
    OperatorSubtaskState subtaskState21 = mock(OperatorSubtaskState.class);
    OperatorSubtaskState subtaskState22 = mock(OperatorSubtaskState.class);
    OperatorSubtaskState subtaskState23 = mock(OperatorSubtaskState.class);
    taskOperatorSubtaskStates21.putSubtaskStateByOperatorID(opID1, subtaskState21);
    taskOperatorSubtaskStates22.putSubtaskStateByOperatorID(opID2, subtaskState22);
    taskOperatorSubtaskStates23.putSubtaskStateByOperatorID(opID3, subtaskState23);
    // trigger messages for the second checkpoint should have been sent to vertex1 and vertex2
    for (ExecutionVertex vertex : Arrays.asList(vertex1, vertex2)) {
        ExecutionAttemptID attemptId = vertex.getCurrentExecutionAttempt().getAttemptId();
        assertEquals(checkpointId2, gateway.getOnlyTriggeredCheckpoint(attemptId).checkpointId);
    }
    // we acknowledge one more task from the first checkpoint and the second
    // checkpoint completely. The second checkpoint should then subsume the first checkpoint
    checkpointCoordinator.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(graph.getJobID(), attemptID3, checkpointId2, new CheckpointMetrics(), taskOperatorSubtaskStates23), TASK_MANAGER_LOCATION_INFO);
    checkpointCoordinator.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(graph.getJobID(), attemptID1, checkpointId2, new CheckpointMetrics(), taskOperatorSubtaskStates21), TASK_MANAGER_LOCATION_INFO);
    checkpointCoordinator.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(graph.getJobID(), attemptID1, checkpointId1, new CheckpointMetrics(), taskOperatorSubtaskStates11), TASK_MANAGER_LOCATION_INFO);
    checkpointCoordinator.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(graph.getJobID(), attemptID2, checkpointId2, new CheckpointMetrics(), taskOperatorSubtaskStates22), TASK_MANAGER_LOCATION_INFO);
    // now, the second checkpoint should be confirmed, and the first discarded
    // actually both pending checkpoints are discarded, and the second has been transformed
    // into a successful checkpoint
    assertTrue(pending1.isDisposed());
    assertTrue(pending2.isDisposed());
    assertEquals(0, checkpointCoordinator.getNumberOfPendingCheckpoints());
    assertEquals(1, checkpointCoordinator.getNumberOfRetainedSuccessfulCheckpoints());
    // validate that all received subtask states in the first checkpoint have been discarded
    verify(subtaskState11, times(1)).discardState();
    verify(subtaskState12, times(1)).discardState();
    // validate that all subtask states in the second checkpoint are not discarded
    verify(subtaskState21, never()).discardState();
    verify(subtaskState22, never()).discardState();
    verify(subtaskState23, never()).discardState();
    // validate the committed checkpoints
    List<CompletedCheckpoint> scs = checkpointCoordinator.getSuccessfulCheckpoints();
    CompletedCheckpoint success = scs.get(0);
    assertEquals(checkpointId2, success.getCheckpointID());
    assertEquals(graph.getJobID(), success.getJobId());
    assertEquals(3, success.getOperatorStates().size());
    // the completion notification for the second checkpoint should have been sent to all
    // three tasks
    for (ExecutionVertex vertex : Arrays.asList(vertex1, vertex2, vertex3)) {
        ExecutionAttemptID attemptId = vertex.getCurrentExecutionAttempt().getAttemptId();
        assertEquals(checkpointId2, gateway.getOnlyNotifiedCompletedCheckpoint(attemptId).checkpointId);
    }
    // send the last remaining ack for the first checkpoint. That checkpoint is already
    // disposed, so the state carried by this late ack is expected to be discarded
    checkpointCoordinator.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(graph.getJobID(), attemptID3, checkpointId1, new CheckpointMetrics(), taskOperatorSubtaskStates13), TASK_MANAGER_LOCATION_INFO);
    verify(subtaskState13, times(1)).discardState();
    checkpointCoordinator.shutdown();
    completedCheckpointStore.shutdown(JobStatus.FINISHED, new CheckpointsCleaner());
    // validate that the states in the second checkpoint have been discarded
    verify(subtaskState21, times(1)).discardState();
    verify(subtaskState22, times(1)).discardState();
    verify(subtaskState23, times(1)).discardState();
}
Also used : ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) OperatorID(org.apache.flink.runtime.jobgraph.OperatorID) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) CheckpointCoordinatorBuilder(org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder) AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph) Iterator(java.util.Iterator) Test(org.junit.Test)

Example 45 with CheckpointCoordinatorBuilder

use of org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder in project flink by apache.

the class CheckpointCoordinatorTriggeringTest method testPeriodicTriggering.

/**
 * Verifies that the coordinator triggers checkpoints only while its periodic scheduler is
 * running, and that the scheduler can be started again after it has been stopped.
 *
 * <p>The test drives the scheduler by hand through {@code manuallyTriggeredScheduledExecutor}:
 * each loop iteration fires the periodic tasks and then drains the executor. Five iterations
 * must produce five triggered checkpoints for the task, and no more may appear once the
 * scheduler is stopped.
 *
 * @throws Exception if building the graph or coordinator, or shutting the coordinator down,
 *     fails; the exception is propagated unchanged so the test report keeps the original cause
 *     and stack trace
 */
@Test
public void testPeriodicTriggering() throws Exception {
    final long start = System.currentTimeMillis();
    CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway gateway = new CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway();
    JobVertexID jobVertexID = new JobVertexID();
    ExecutionGraph graph = new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder().addJobVertex(jobVertexID).setTaskManagerGateway(gateway).build();
    ExecutionVertex vertex = graph.getJobVertex(jobVertexID).getTaskVertices()[0];
    ExecutionAttemptID attemptID = vertex.getCurrentExecutionAttempt().getAttemptId();
    CheckpointCoordinatorConfiguration checkpointCoordinatorConfiguration =
            new CheckpointCoordinatorConfigurationBuilder()
                    // periodic interval is 10 ms
                    .setCheckpointInterval(10)
                    // timeout is very long (200 s)
                    .setCheckpointTimeout(200000)
                    .setMaxConcurrentCheckpoints(Integer.MAX_VALUE)
                    .build();
    CheckpointCoordinator checkpointCoordinator = new CheckpointCoordinatorBuilder().setExecutionGraph(graph).setCheckpointCoordinatorConfiguration(checkpointCoordinatorConfiguration).setCompletedCheckpointStore(new StandaloneCompletedCheckpointStore(2)).setTimer(manuallyTriggeredScheduledExecutor).build();
    try {
        checkpointCoordinator.startCheckpointScheduler();
        for (int i = 0; i < 5; ++i) {
            manuallyTriggeredScheduledExecutor.triggerPeriodicScheduledTasks();
            manuallyTriggeredScheduledExecutor.triggerAll();
        }
        checkRecordedTriggeredCheckpoints(5, start, gateway.getTriggeredCheckpoints(attemptID));
        checkpointCoordinator.stopCheckpointScheduler();
        // no further calls may come.
        manuallyTriggeredScheduledExecutor.triggerPeriodicScheduledTasks();
        manuallyTriggeredScheduledExecutor.triggerAll();
        assertEquals(5, gateway.getTriggeredCheckpoints(attemptID).size());
        // start another sequence of periodic scheduling
        gateway.resetCount();
        checkpointCoordinator.startCheckpointScheduler();
        for (int i = 0; i < 5; ++i) {
            manuallyTriggeredScheduledExecutor.triggerPeriodicScheduledTasks();
            manuallyTriggeredScheduledExecutor.triggerAll();
        }
        checkRecordedTriggeredCheckpoints(5, start, gateway.getTriggeredCheckpoints(attemptID));
        checkpointCoordinator.stopCheckpointScheduler();
        // no further calls may come
        manuallyTriggeredScheduledExecutor.triggerPeriodicScheduledTasks();
        manuallyTriggeredScheduledExecutor.triggerAll();
        assertEquals(5, gateway.getTriggeredCheckpoints(attemptID).size());
    } finally {
        // always release the coordinator, even when an assertion above fails
        checkpointCoordinator.shutdown();
    }
}
Also used : ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) CheckpointCoordinatorConfigurationBuilder(org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration.CheckpointCoordinatorConfigurationBuilder) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) CheckpointCoordinatorConfiguration(org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) CheckpointCoordinatorBuilder(org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder) AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) ExecutionException(java.util.concurrent.ExecutionException) ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph) Test(org.junit.Test)

Aggregations

CheckpointCoordinatorBuilder (org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder)46 ExecutionGraph (org.apache.flink.runtime.executiongraph.ExecutionGraph)41 JobVertexID (org.apache.flink.runtime.jobgraph.JobVertexID)40 Test (org.junit.Test)37 AcknowledgeCheckpoint (org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint)30 ExecutionVertex (org.apache.flink.runtime.executiongraph.ExecutionVertex)27 ExecutionAttemptID (org.apache.flink.runtime.executiongraph.ExecutionAttemptID)22 ExecutionJobVertex (org.apache.flink.runtime.executiongraph.ExecutionJobVertex)15 OperatorID (org.apache.flink.runtime.jobgraph.OperatorID)13 DeclineCheckpoint (org.apache.flink.runtime.messages.checkpoint.DeclineCheckpoint)13 HashSet (java.util.HashSet)12 CheckpointCoordinatorConfiguration (org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration)12 ManuallyTriggeredScheduledExecutor (org.apache.flink.util.concurrent.ManuallyTriggeredScheduledExecutor)10 CompletableFuture (java.util.concurrent.CompletableFuture)9 KeyGroupRange (org.apache.flink.runtime.state.KeyGroupRange)9 HashMap (java.util.HashMap)8 CheckpointCoordinatorConfigurationBuilder (org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration.CheckpointCoordinatorConfigurationBuilder)8 List (java.util.List)7 ExecutionException (java.util.concurrent.ExecutionException)7 JobID (org.apache.flink.api.common.JobID)7