Search in sources :

Example 11 with CheckpointCoordinatorBuilder

use of org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder in project flink by apache.

the class CheckpointCoordinatorTest method setupCheckpointCoordinatorWithInactiveTasks.

/**
 * Creates a coordinator for a single-vertex graph that was built with
 * {@code setTransitToRunning(false)}, starts the periodic scheduler, asserts that no
 * checkpoint is pending after the first round of triggers, then moves the task to
 * {@code RUNNING} and fires the triggers a second time.
 *
 * @param checkpointStorage storage passed through to the coordinator builder
 * @return the coordinator after the second round of manually fired scheduler tasks
 * @throws Exception if building the graph or the coordinator fails
 */
private CheckpointCoordinator setupCheckpointCoordinatorWithInactiveTasks(CheckpointStorage checkpointStorage) throws Exception {
    JobVertexID vertexId = new JobVertexID();
    ExecutionGraph executionGraph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .addJobVertex(vertexId)
                    .setTransitToRunning(false)
                    .build();
    ExecutionVertex inactiveVertex = executionGraph.getJobVertex(vertexId).getTaskVertices()[0];

    CheckpointCoordinatorConfiguration coordinatorConfig =
            new CheckpointCoordinatorConfiguration.CheckpointCoordinatorConfigurationBuilder()
                    .setCheckpointInterval(10) // periodic interval is 10 ms
                    .setCheckpointTimeout(200000) // timeout is very long (200 s)
                    .setMinPauseBetweenCheckpoints(0) // no extra delay
                    .setMaxConcurrentCheckpoints(2) // max two concurrent checkpoints
                    .build();

    CheckpointIDCounterWithOwner idCounter = new CheckpointIDCounterWithOwner();
    CheckpointCoordinator coordinator =
            new CheckpointCoordinatorBuilder()
                    .setExecutionGraph(executionGraph)
                    .setCheckpointCoordinatorConfiguration(coordinatorConfig)
                    .setCompletedCheckpointStore(new StandaloneCompletedCheckpointStore(2))
                    .setCheckpointStorage(checkpointStorage)
                    .setTimer(manuallyTriggeredScheduledExecutor)
                    .setCheckpointIDCounter(idCounter)
                    .build();
    // The counter can only learn its owner once the coordinator exists.
    idCounter.setOwner(coordinator);

    coordinator.startCheckpointScheduler();
    manuallyTriggeredScheduledExecutor.triggerPeriodicScheduledTasks();
    manuallyTriggeredScheduledExecutor.triggerAll();

    // While the task is not RUNNING, no checkpoint may have been started.
    assertEquals(0, coordinator.getNumberOfPendingCheckpoints());

    // Switch the task to RUNNING ...
    inactiveVertex.getCurrentExecutionAttempt().transitionState(ExecutionState.RUNNING);

    // ... and fire the periodic trigger again so the coordinator can start checkpointing.
    manuallyTriggeredScheduledExecutor.triggerPeriodicScheduledTasks();
    manuallyTriggeredScheduledExecutor.triggerAll();
    return coordinator;
}
Also used : JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph) CheckpointCoordinatorConfiguration(org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) CheckpointCoordinatorBuilder(org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder)

Example 12 with CheckpointCoordinatorBuilder

use of org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder in project flink by apache.

the class CheckpointCoordinatorTest method testExternallyInducedSourceWithOperatorCoordinator.

/**
 * Tests that the checkpoint still behaves correctly when the task checkpoint is triggered by
 * the master hooks and finishes before the master checkpoint. Also makes sure that the operator
 * coordinators are checkpointed before the task checkpoint starts.
 *
 * <p>The master hook registered below acknowledges the checkpoint for both tasks from inside
 * {@code triggerCheckpoint}, so the task acknowledgements arrive while the master state
 * snapshot is still in progress.
 *
 * @throws Exception if setting up the graph or the coordinator, or triggering the checkpoint,
 *     fails
 */
@Test
public void testExternallyInducedSourceWithOperatorCoordinator() throws Exception {
    JobVertexID jobVertexID1 = new JobVertexID();
    JobVertexID jobVertexID2 = new JobVertexID();
    CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway gateway = new CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway();
    ExecutionGraph graph = new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder().addJobVertex(jobVertexID1).addJobVertex(jobVertexID2).setTaskManagerGateway(gateway).build();
    ExecutionVertex vertex1 = graph.getJobVertex(jobVertexID1).getTaskVertices()[0];
    ExecutionVertex vertex2 = graph.getJobVertex(jobVertexID2).getTaskVertices()[0];
    ExecutionAttemptID attemptID1 = vertex1.getCurrentExecutionAttempt().getAttemptId();
    ExecutionAttemptID attemptID2 = vertex2.getCurrentExecutionAttempt().getAttemptId();
    OperatorID opID1 = vertex1.getJobVertex().getOperatorIDs().get(0).getGeneratedOperatorID();
    OperatorID opID2 = vertex2.getJobVertex().getOperatorIDs().get(0).getGeneratedOperatorID();
    // One snapshot per task, each carrying the state of that task's own operator.
    TaskStateSnapshot taskOperatorSubtaskStates1 = new TaskStateSnapshot();
    TaskStateSnapshot taskOperatorSubtaskStates2 = new TaskStateSnapshot();
    OperatorSubtaskState subtaskState1 = OperatorSubtaskState.builder().build();
    OperatorSubtaskState subtaskState2 = OperatorSubtaskState.builder().build();
    taskOperatorSubtaskStates1.putSubtaskStateByOperatorID(opID1, subtaskState1);
    // The state of opID2 belongs to the second task's snapshot, which is the one sent with the
    // acknowledgement for attemptID2 below.
    taskOperatorSubtaskStates2.putSubtaskStateByOperatorID(opID2, subtaskState2);
    // Create a mock OperatorCoordinatorCheckpointContext which completes the checkpoint
    // immediately.
    AtomicBoolean coordCheckpointDone = new AtomicBoolean(false);
    OperatorCoordinatorCheckpointContext coordinatorCheckpointContext = new CheckpointCoordinatorTestingUtils.MockOperatorCheckpointCoordinatorContextBuilder().setOnCallingCheckpointCoordinator((checkpointId, result) -> {
        coordCheckpointDone.set(true);
        result.complete(new byte[0]);
    }).setOperatorID(opID1).build();
    // set up the coordinator and validate the initial state
    CheckpointCoordinator checkpointCoordinator = new CheckpointCoordinatorBuilder().setExecutionGraph(graph).setCheckpointCoordinatorConfiguration(CheckpointCoordinatorConfiguration.builder().setMaxConcurrentCheckpoints(Integer.MAX_VALUE).build()).setTimer(manuallyTriggeredScheduledExecutor).setCoordinatorsToCheckpoint(Collections.singleton(coordinatorCheckpointContext)).build();
    // Records the checkpoint id seen by the master hook so it can be compared later.
    AtomicReference<Long> checkpointIdRef = new AtomicReference<>();
    // Add a master hook which triggers and acks the task checkpoint immediately.
    // In this case the task checkpoints would complete before the job master checkpoint
    // completes.
    checkpointCoordinator.addMasterHook(new MasterTriggerRestoreHook<Integer>() {

        @Override
        public String getIdentifier() {
            return "anything";
        }

        @Override
        @Nullable
        public CompletableFuture<Integer> triggerCheckpoint(long checkpointId, long timestamp, Executor executor) throws Exception {
            // The operator coordinator must already have been checkpointed at this point.
            assertTrue("The coordinator checkpoint should have finished.", coordCheckpointDone.get());
            // Acknowledge the checkpoint in the master hooks so the task snapshots
            // complete before the master state snapshot completes.
            checkpointIdRef.set(checkpointId);
            AcknowledgeCheckpoint acknowledgeCheckpoint1 = new AcknowledgeCheckpoint(graph.getJobID(), attemptID1, checkpointId, new CheckpointMetrics(), taskOperatorSubtaskStates1);
            AcknowledgeCheckpoint acknowledgeCheckpoint2 = new AcknowledgeCheckpoint(graph.getJobID(), attemptID2, checkpointId, new CheckpointMetrics(), taskOperatorSubtaskStates2);
            checkpointCoordinator.receiveAcknowledgeMessage(acknowledgeCheckpoint1, TASK_MANAGER_LOCATION_INFO);
            checkpointCoordinator.receiveAcknowledgeMessage(acknowledgeCheckpoint2, TASK_MANAGER_LOCATION_INFO);
            // This hook contributes no master state of its own.
            return null;
        }

        @Override
        public void restoreCheckpoint(long checkpointId, Integer checkpointData) throws Exception {
            // Restore is not exercised by this test.
        }

        @Override
        public SimpleVersionedSerializer<Integer> createCheckpointDataSerializer() {
            // Trivial serializer: writes nothing and always reads back the value 1.
            return new SimpleVersionedSerializer<Integer>() {

                @Override
                public int getVersion() {
                    return 0;
                }

                @Override
                public byte[] serialize(Integer obj) throws IOException {
                    return new byte[0];
                }

                @Override
                public Integer deserialize(int version, byte[] serialized) throws IOException {
                    return 1;
                }
            };
        }
    });
    // Verify initial state.
    assertEquals(0, checkpointCoordinator.getNumberOfPendingCheckpoints());
    assertEquals(0, checkpointCoordinator.getNumberOfRetainedSuccessfulCheckpoints());
    assertEquals(0, manuallyTriggeredScheduledExecutor.getActiveScheduledTasks().size());
    // trigger the first checkpoint. this should succeed
    final CompletableFuture<CompletedCheckpoint> checkpointFuture = checkpointCoordinator.triggerCheckpoint(false);
    manuallyTriggeredScheduledExecutor.triggerAll();
    FutureUtils.throwIfCompletedExceptionally(checkpointFuture);
    // now we should have a completed checkpoint
    assertEquals(1, checkpointCoordinator.getNumberOfRetainedSuccessfulCheckpoints());
    assertEquals(0, checkpointCoordinator.getNumberOfPendingCheckpoints());
    // the canceler should be removed now
    assertEquals(0, manuallyTriggeredScheduledExecutor.getActiveScheduledTasks().size());
    // validate that the gateway recorded exactly this checkpoint as triggered for both tasks
    long checkpointId = checkpointIdRef.get();
    for (ExecutionVertex vertex : Arrays.asList(vertex1, vertex2)) {
        ExecutionAttemptID attemptId = vertex.getCurrentExecutionAttempt().getAttemptId();
        assertEquals(checkpointId, gateway.getOnlyTriggeredCheckpoint(attemptId).checkpointId);
    }
    // the completed checkpoint belongs to this job and holds one operator state per vertex
    CompletedCheckpoint success = checkpointCoordinator.getSuccessfulCheckpoints().get(0);
    assertEquals(graph.getJobID(), success.getJobId());
    assertEquals(2, success.getOperatorStates().size());
    checkpointCoordinator.shutdown();
}
Also used : JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) OperatorID(org.apache.flink.runtime.jobgraph.OperatorID) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) CompletableFuture(java.util.concurrent.CompletableFuture) ManuallyTriggeredScheduledExecutor(org.apache.flink.util.concurrent.ManuallyTriggeredScheduledExecutor) ScheduledExecutor(org.apache.flink.util.concurrent.ScheduledExecutor) Executor(java.util.concurrent.Executor) ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) SimpleVersionedSerializer(org.apache.flink.core.io.SimpleVersionedSerializer) AtomicReference(java.util.concurrent.atomic.AtomicReference) IOException(java.io.IOException) CheckpointCoordinatorBuilder(org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder) TriFunctionWithException(org.apache.flink.util.function.TriFunctionWithException) IOException(java.io.IOException) ExecutionException(java.util.concurrent.ExecutionException) RpcException(org.apache.flink.runtime.rpc.exceptions.RpcException) AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) DeclineCheckpoint(org.apache.flink.runtime.messages.checkpoint.DeclineCheckpoint) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph) AtomicLong(java.util.concurrent.atomic.AtomicLong) ArgumentMatchers.anyLong(org.mockito.ArgumentMatchers.anyLong) Nullable(javax.annotation.Nullable) Test(org.junit.Test)

Example 13 with CheckpointCoordinatorBuilder

use of org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder in project flink by apache.

the class CheckpointCoordinatorTest method testExternalizedCheckpoints.

/**
 * Verifies that a coordinator configured with
 * {@link CheckpointRetentionPolicy#RETAIN_ON_FAILURE} hands checkpoint properties matching that
 * retention policy to every pending checkpoint it creates.
 *
 * @throws Exception if setting up the graph or coordinator, or triggering the checkpoint, fails
 */
@Test
public void testExternalizedCheckpoints() throws Exception {
    ExecutionGraph executionGraph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .addJobVertex(new JobVertexID())
                    .build();

    // Configure the coordinator with the retention policy under test.
    CheckpointCoordinatorConfiguration retainOnFailureConfig =
            new CheckpointCoordinatorConfiguration.CheckpointCoordinatorConfigurationBuilder()
                    .setCheckpointRetentionPolicy(CheckpointRetentionPolicy.RETAIN_ON_FAILURE)
                    .build();
    CheckpointCoordinator coordinator =
            new CheckpointCoordinatorBuilder()
                    .setExecutionGraph(executionGraph)
                    .setCheckpointCoordinatorConfiguration(retainOnFailureConfig)
                    .setTimer(manuallyTriggeredScheduledExecutor)
                    .build();

    // Trigger a checkpoint and make sure the trigger itself did not fail.
    CompletableFuture<CompletedCheckpoint> triggerResult = coordinator.triggerCheckpoint(false);
    manuallyTriggeredScheduledExecutor.triggerAll();
    FutureUtils.throwIfCompletedExceptionally(triggerResult);

    // Every checkpoint that is still pending must carry the configured retention policy.
    for (PendingCheckpoint pending : coordinator.getPendingCheckpoints().values()) {
        CheckpointProperties actualProps = pending.getProps();
        CheckpointProperties expectedProps =
                CheckpointProperties.forCheckpoint(CheckpointRetentionPolicy.RETAIN_ON_FAILURE);
        assertEquals(expectedProps, actualProps);
    }

    // Shut the coordinator down at the end of the test.
    coordinator.shutdown();
}
Also used : JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph) CheckpointCoordinatorConfiguration(org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration) CheckpointCoordinatorBuilder(org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder) Test(org.junit.Test)

Example 14 with CheckpointCoordinatorBuilder

use of org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder in project flink by apache.

the class CheckpointCoordinatorTest method testCheckpointAbortsIfTriggerTasksAreFinishedAndIOException.

/**
 * Verifies that triggering a checkpoint completes exceptionally, and leaves neither pending nor
 * retained checkpoints behind, when all tasks of the first job vertex have been marked finished
 * and the coordinator uses an {@code IOExceptionCheckpointStorage}.
 *
 * @throws Exception if setting up the graph or the coordinator fails
 */
@Test
public void testCheckpointAbortsIfTriggerTasksAreFinishedAndIOException() throws Exception {
    JobVertexID finishedVertexId = new JobVertexID();
    JobVertexID otherVertexId = new JobVertexID();
    ExecutionGraph executionGraph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .addJobVertex(finishedVertexId)
                    .addJobVertex(otherVertexId, false)
                    .build();

    // Coordinator backed by the IOException checkpoint storage.
    CheckpointCoordinator coordinator =
            new CheckpointCoordinatorBuilder()
                    .setExecutionGraph(executionGraph)
                    .setCheckpointStorage(new IOExceptionCheckpointStorage())
                    .setTimer(manuallyTriggeredScheduledExecutor)
                    .build();

    // Mark every task of the first vertex as finished.
    for (ExecutionVertex task : executionGraph.getJobVertex(finishedVertexId).getTaskVertices()) {
        task.getCurrentExecutionAttempt().markFinished();
    }

    // Before anything is triggered there must be no checkpoints at all.
    assertEquals(0, coordinator.getNumberOfPendingCheckpoints());
    assertEquals(0, coordinator.getNumberOfRetainedSuccessfulCheckpoints());

    coordinator.startCheckpointScheduler();

    // Trigger a checkpoint; it is expected to fail.
    final CompletableFuture<CompletedCheckpoint> triggerResult = coordinator.triggerCheckpoint(false);
    manuallyTriggeredScheduledExecutor.triggerAll();
    assertTrue(triggerResult.isCompletedExceptionally());

    // The failed attempt must not leave any checkpoint behind.
    assertEquals(0, coordinator.getNumberOfPendingCheckpoints());
    assertEquals(0, coordinator.getNumberOfRetainedSuccessfulCheckpoints());

    coordinator.shutdown();
}
Also used : JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph) CheckpointCoordinatorBuilder(org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder) Test(org.junit.Test)

Example 15 with CheckpointCoordinatorBuilder

use of org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder in project flink by apache.

the class CheckpointCoordinatorTest method testCheckpointTimeoutIsolated.

/**
 * Verifies that a checkpoint acknowledged by only one task is disposed once the scheduled
 * tasks (the timeout canceller) are fired: nothing stays pending or retained, the state that
 * was already received is discarded, and no task is notified of a completed checkpoint.
 *
 * @throws Exception if setting up the graph or coordinator, or triggering the checkpoint, fails
 */
@Test
public void testCheckpointTimeoutIsolated() throws Exception {
    JobVertexID firstVertexId = new JobVertexID();
    JobVertexID secondVertexId = new JobVertexID();
    CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway recordingGateway =
            new CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway();
    ExecutionGraph executionGraph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .addJobVertex(firstVertexId)
                    .addJobVertex(secondVertexId, false)
                    .setTaskManagerGateway(recordingGateway)
                    .build();
    ExecutionVertex firstVertex = executionGraph.getJobVertex(firstVertexId).getTaskVertices()[0];
    ExecutionVertex secondVertex = executionGraph.getJobVertex(secondVertexId).getTaskVertices()[0];
    ExecutionAttemptID firstAttemptId = firstVertex.getCurrentExecutionAttempt().getAttemptId();

    // Coordinator under test.
    CheckpointCoordinator coordinator =
            new CheckpointCoordinatorBuilder()
                    .setExecutionGraph(executionGraph)
                    .setCompletedCheckpointStore(new StandaloneCompletedCheckpointStore(2))
                    .setTimer(manuallyTriggeredScheduledExecutor)
                    .build();

    // Start a checkpoint; it stays pending because nothing has been acknowledged yet.
    final CompletableFuture<CompletedCheckpoint> triggerResult = coordinator.triggerCheckpoint(false);
    manuallyTriggeredScheduledExecutor.triggerAll();
    FutureUtils.throwIfCompletedExceptionally(triggerResult);
    assertEquals(1, coordinator.getNumberOfPendingCheckpoints());
    PendingCheckpoint pending = coordinator.getPendingCheckpoints().values().iterator().next();
    assertFalse(pending.isDisposed());

    // Acknowledge on behalf of the first task only, using a mocked subtask state so that its
    // disposal can be verified afterwards.
    OperatorID firstOperatorId = firstVertex.getJobVertex().getOperatorIDs().get(0).getGeneratedOperatorID();
    TaskStateSnapshot firstTaskSnapshot = spy(new TaskStateSnapshot());
    OperatorSubtaskState mockedSubtaskState = mock(OperatorSubtaskState.class);
    firstTaskSnapshot.putSubtaskStateByOperatorID(firstOperatorId, mockedSubtaskState);
    AcknowledgeCheckpoint partialAck =
            new AcknowledgeCheckpoint(
                    executionGraph.getJobID(),
                    firstAttemptId,
                    pending.getCheckpointId(),
                    new CheckpointMetrics(),
                    firstTaskSnapshot);
    coordinator.receiveAcknowledgeMessage(partialAck, TASK_MANAGER_LOCATION_INFO);

    // Firing the scheduled tasks runs the canceller that times the checkpoint out.
    manuallyTriggeredScheduledExecutor.triggerScheduledTasks();
    assertTrue("Checkpoint was not canceled by the timeout", pending.isDisposed());
    assertEquals(0, coordinator.getNumberOfPendingCheckpoints());
    assertEquals(0, coordinator.getNumberOfRetainedSuccessfulCheckpoints());

    // The state received so far must have been discarded exactly once.
    verify(mockedSubtaskState, times(1)).discardState();

    // Neither task may have been told that a checkpoint completed.
    for (ExecutionVertex task : Arrays.asList(firstVertex, secondVertex)) {
        ExecutionAttemptID taskAttemptId = task.getCurrentExecutionAttempt().getAttemptId();
        assertEquals(0, recordingGateway.getNotifiedCompletedCheckpoints(taskAttemptId).size());
    }

    coordinator.shutdown();
}
Also used : ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) OperatorID(org.apache.flink.runtime.jobgraph.OperatorID) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) CheckpointCoordinatorBuilder(org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder) AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph) Test(org.junit.Test)

Aggregations

CheckpointCoordinatorBuilder (org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder)46 ExecutionGraph (org.apache.flink.runtime.executiongraph.ExecutionGraph)41 JobVertexID (org.apache.flink.runtime.jobgraph.JobVertexID)40 Test (org.junit.Test)37 AcknowledgeCheckpoint (org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint)30 ExecutionVertex (org.apache.flink.runtime.executiongraph.ExecutionVertex)27 ExecutionAttemptID (org.apache.flink.runtime.executiongraph.ExecutionAttemptID)22 ExecutionJobVertex (org.apache.flink.runtime.executiongraph.ExecutionJobVertex)15 OperatorID (org.apache.flink.runtime.jobgraph.OperatorID)13 DeclineCheckpoint (org.apache.flink.runtime.messages.checkpoint.DeclineCheckpoint)13 HashSet (java.util.HashSet)12 CheckpointCoordinatorConfiguration (org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration)12 ManuallyTriggeredScheduledExecutor (org.apache.flink.util.concurrent.ManuallyTriggeredScheduledExecutor)10 CompletableFuture (java.util.concurrent.CompletableFuture)9 KeyGroupRange (org.apache.flink.runtime.state.KeyGroupRange)9 HashMap (java.util.HashMap)8 CheckpointCoordinatorConfigurationBuilder (org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration.CheckpointCoordinatorConfigurationBuilder)8 List (java.util.List)7 ExecutionException (java.util.concurrent.ExecutionException)7 JobID (org.apache.flink.api.common.JobID)7