Search in sources :

Example 21 with CheckpointCoordinatorConfiguration

use of org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration in project flink by apache.

the class CheckpointCoordinatorTest method testStateCleanupForLateOrUnknownMessages.

/**
 * Verifies how the coordinator disposes of state carried by acknowledge messages it cannot use.
 *
 * <p>While the checkpoint is still pending, state acknowledged by an unknown execution attempt
 * of this job is discarded, whereas state sent for a different job and state from a duplicate
 * acknowledgement of an already acknowledged attempt are left untouched. After the checkpoint
 * has been declined, the state that had already been acknowledged is discarded, a late
 * acknowledgement from the second vertex is discarded, state from an unknown attempt of this job
 * is discarded, and state sent for a different job is still left untouched.
 *
 * @throws Exception if any step of the test setup or message handling fails
 */
@Test
public void testStateCleanupForLateOrUnknownMessages() throws Exception {
    final JobVertexID firstVertexId = new JobVertexID();
    final JobVertexID secondVertexId = new JobVertexID();
    final CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway recorder =
            new CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway();
    final ExecutionGraph graph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .addJobVertex(firstVertexId)
                    .addJobVertex(secondVertexId, false)
                    .setTaskManagerGateway(recorder)
                    .build();

    final ExecutionVertex firstVertex = graph.getJobVertex(firstVertexId).getTaskVertices()[0];
    final ExecutionVertex secondVertex = graph.getJobVertex(secondVertexId).getTaskVertices()[0];
    final ExecutionAttemptID firstAttempt =
            firstVertex.getCurrentExecutionAttempt().getAttemptId();
    final ExecutionAttemptID secondAttempt =
            secondVertex.getCurrentExecutionAttempt().getAttemptId();

    final CheckpointCoordinatorConfiguration coordinatorConfig =
            new CheckpointCoordinatorConfiguration.CheckpointCoordinatorConfigurationBuilder()
                    .setMaxConcurrentCheckpoints(1)
                    .build();
    final CheckpointCoordinator coordinator =
            new CheckpointCoordinatorBuilder()
                    .setExecutionGraph(graph)
                    .setCheckpointCoordinatorConfiguration(coordinatorConfig)
                    .setTimer(manuallyTriggeredScheduledExecutor)
                    .build();

    // Trigger one checkpoint and make sure it is pending before sending any messages.
    final CompletableFuture<CompletedCheckpoint> triggerFuture =
            coordinator.triggerCheckpoint(false);
    manuallyTriggeredScheduledExecutor.triggerAll();
    FutureUtils.throwIfCompletedExceptionally(triggerFuture);
    assertEquals(1, coordinator.getNumberOfPendingCheckpoints());

    final PendingCheckpoint pending =
            coordinator.getPendingCheckpoints().values().iterator().next();
    final long pendingId = pending.getCheckpointId();

    final OperatorID firstOperatorId =
            firstVertex.getJobVertex().getOperatorIDs().get(0).getGeneratedOperatorID();
    final TaskStateSnapshot firstVertexSnapshot = spy(new TaskStateSnapshot());
    final OperatorSubtaskState firstVertexSubtaskState = mock(OperatorSubtaskState.class);
    firstVertexSnapshot.putSubtaskStateByOperatorID(firstOperatorId, firstVertexSubtaskState);

    // Regular acknowledgement from the first vertex: its state must be kept.
    final AcknowledgeCheckpoint firstAck =
            new AcknowledgeCheckpoint(
                    graph.getJobID(),
                    firstAttempt,
                    pendingId,
                    new CheckpointMetrics(),
                    firstVertexSnapshot);
    coordinator.receiveAcknowledgeMessage(firstAck, TASK_MANAGER_LOCATION_INFO);
    verify(firstVertexSubtaskState, never()).discardState();

    // Acknowledgement from an unknown attempt of our own job: its state must be discarded.
    final TaskStateSnapshot unknownAttemptState = mock(TaskStateSnapshot.class);
    final AcknowledgeCheckpoint unknownAttemptAck =
            new AcknowledgeCheckpoint(
                    graph.getJobID(),
                    new ExecutionAttemptID(),
                    pendingId,
                    new CheckpointMetrics(),
                    unknownAttemptState);
    coordinator.receiveAcknowledgeMessage(unknownAttemptAck, TASK_MANAGER_LOCATION_INFO);
    verify(unknownAttemptState, times(1)).discardState();

    // Acknowledgement addressed to another job: we must not interfere with its state.
    final TaskStateSnapshot otherJobState = mock(TaskStateSnapshot.class);
    final AcknowledgeCheckpoint otherJobAck =
            new AcknowledgeCheckpoint(
                    new JobID(),
                    new ExecutionAttemptID(),
                    pendingId,
                    new CheckpointMetrics(),
                    otherJobState);
    coordinator.receiveAcknowledgeMessage(otherJobAck, TASK_MANAGER_LOCATION_INFO);
    verify(otherJobState, never()).discardState();

    // Duplicate acknowledgement from the first vertex: must not cause a discard.
    final TaskStateSnapshot duplicateAckState = mock(TaskStateSnapshot.class);
    final AcknowledgeCheckpoint duplicateAck =
            new AcknowledgeCheckpoint(
                    graph.getJobID(),
                    firstAttempt,
                    pendingId,
                    new CheckpointMetrics(),
                    duplicateAckState);
    coordinator.receiveAcknowledgeMessage(duplicateAck, TASK_MANAGER_LOCATION_INFO);
    verify(duplicateAckState, never()).discardState();

    // Decline the checkpoint from the first vertex; clear recorded interactions first so the
    // verification below only sees what the decline causes.
    reset(firstVertexSubtaskState);
    final DeclineCheckpoint decline =
            new DeclineCheckpoint(
                    graph.getJobID(),
                    firstAttempt,
                    pendingId,
                    new CheckpointException(CHECKPOINT_DECLINED));
    coordinator.receiveDeclineMessage(decline, TASK_MANAGER_LOCATION_INFO);
    assertTrue(pending.isDisposed());

    // The state that had already been acknowledged must now be cleaned up.
    verify(firstVertexSubtaskState, times(1)).discardState();

    // Late acknowledgement from the second vertex: its state must be cleaned up as well.
    final TaskStateSnapshot lateAckState = mock(TaskStateSnapshot.class);
    final AcknowledgeCheckpoint lateAck =
            new AcknowledgeCheckpoint(
                    graph.getJobID(),
                    secondAttempt,
                    pendingId,
                    new CheckpointMetrics(),
                    lateAckState);
    coordinator.receiveAcknowledgeMessage(lateAck, TASK_MANAGER_LOCATION_INFO);
    verify(lateAckState, times(1)).discardState();

    // Another job's acknowledgement is still none of our business after the decline.
    reset(otherJobState);
    final AcknowledgeCheckpoint otherJobLateAck =
            new AcknowledgeCheckpoint(
                    new JobID(),
                    new ExecutionAttemptID(),
                    pendingId,
                    new CheckpointMetrics(),
                    otherJobState);
    coordinator.receiveAcknowledgeMessage(otherJobLateAck, TASK_MANAGER_LOCATION_INFO);
    verify(otherJobState, never()).discardState();

    // An unknown attempt of our own job is still cleaned up after the decline.
    final TaskStateSnapshot unknownAttemptLateState = mock(TaskStateSnapshot.class);
    final AcknowledgeCheckpoint unknownAttemptLateAck =
            new AcknowledgeCheckpoint(
                    graph.getJobID(),
                    new ExecutionAttemptID(),
                    pendingId,
                    new CheckpointMetrics(),
                    unknownAttemptLateState);
    coordinator.receiveAcknowledgeMessage(unknownAttemptLateAck, TASK_MANAGER_LOCATION_INFO);
    verify(unknownAttemptLateState, times(1)).discardState();
}
Also used : DeclineCheckpoint(org.apache.flink.runtime.messages.checkpoint.DeclineCheckpoint) ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) CheckpointCoordinatorConfiguration(org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration) OperatorID(org.apache.flink.runtime.jobgraph.OperatorID) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) CheckpointCoordinatorBuilder(org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder) AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph) JobID(org.apache.flink.api.common.JobID) Test(org.junit.Test)

Example 22 with CheckpointCoordinatorConfiguration

use of org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration in project flink by apache.

the class CheckpointCoordinatorTriggeringTest method testPeriodicTriggering.

/**
 * Runs the periodic checkpoint scheduler through two start/stop cycles. In each cycle the
 * periodic tasks are fired five times and five triggered checkpoints are expected to be
 * recorded for the task; after the scheduler has been stopped, firing the periodic tasks again
 * must not add any further triggered checkpoint.
 *
 * <p>Exceptions propagate to the test runner unchanged so that the full stack trace and cause
 * are reported, and the coordinator is shut down even when an assertion fails.
 *
 * @throws Exception if setting up or driving the coordinator fails
 */
@Test
public void testPeriodicTriggering() throws Exception {
    final long start = System.currentTimeMillis();
    CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway gateway =
            new CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway();
    JobVertexID jobVertexID = new JobVertexID();
    ExecutionGraph graph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .addJobVertex(jobVertexID)
                    .setTaskManagerGateway(gateway)
                    .build();
    ExecutionVertex vertex = graph.getJobVertex(jobVertexID).getTaskVertices()[0];
    ExecutionAttemptID attemptID = vertex.getCurrentExecutionAttempt().getAttemptId();

    CheckpointCoordinatorConfiguration checkpointCoordinatorConfiguration =
            new CheckpointCoordinatorConfigurationBuilder()
                    // periodic interval is 10 ms
                    .setCheckpointInterval(10)
                    // timeout is very long (200 s)
                    .setCheckpointTimeout(200000)
                    .setMaxConcurrentCheckpoints(Integer.MAX_VALUE)
                    .build();
    CheckpointCoordinator checkpointCoordinator =
            new CheckpointCoordinatorBuilder()
                    .setExecutionGraph(graph)
                    .setCheckpointCoordinatorConfiguration(checkpointCoordinatorConfiguration)
                    .setCompletedCheckpointStore(new StandaloneCompletedCheckpointStore(2))
                    .setTimer(manuallyTriggeredScheduledExecutor)
                    .build();

    try {
        checkpointCoordinator.startCheckpointScheduler();
        for (int i = 0; i < 5; ++i) {
            manuallyTriggeredScheduledExecutor.triggerPeriodicScheduledTasks();
            manuallyTriggeredScheduledExecutor.triggerAll();
        }
        checkRecordedTriggeredCheckpoints(5, start, gateway.getTriggeredCheckpoints(attemptID));

        checkpointCoordinator.stopCheckpointScheduler();
        // no further calls may come.
        manuallyTriggeredScheduledExecutor.triggerPeriodicScheduledTasks();
        manuallyTriggeredScheduledExecutor.triggerAll();
        assertEquals(5, gateway.getTriggeredCheckpoints(attemptID).size());

        // start another sequence of periodic scheduling
        gateway.resetCount();
        checkpointCoordinator.startCheckpointScheduler();
        for (int i = 0; i < 5; ++i) {
            manuallyTriggeredScheduledExecutor.triggerPeriodicScheduledTasks();
            manuallyTriggeredScheduledExecutor.triggerAll();
        }
        checkRecordedTriggeredCheckpoints(5, start, gateway.getTriggeredCheckpoints(attemptID));

        checkpointCoordinator.stopCheckpointScheduler();
        // no further calls may come
        manuallyTriggeredScheduledExecutor.triggerPeriodicScheduledTasks();
        manuallyTriggeredScheduledExecutor.triggerAll();
        assertEquals(5, gateway.getTriggeredCheckpoints(attemptID).size());
    } finally {
        // Runs on assertion failures too; the original skipped shutdown in that case
        // because AssertionError is not an Exception.
        checkpointCoordinator.shutdown();
    }
}
Also used : ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) CheckpointCoordinatorConfigurationBuilder(org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration.CheckpointCoordinatorConfigurationBuilder) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) CheckpointCoordinatorConfiguration(org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) CheckpointCoordinatorBuilder(org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder) AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) ExecutionException(java.util.concurrent.ExecutionException) ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph) Test(org.junit.Test)

Example 23 with CheckpointCoordinatorConfiguration

use of org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration in project flink by apache.

the class CheckpointCoordinatorTriggeringTest method testMinTimeBetweenCheckpointsInterval.

/**
 * Verifies that, after a checkpoint has been acknowledged, the next checkpoint is not triggered
 * before the configured minimum pause between checkpoints has elapsed.
 *
 * @throws Exception if setting up or driving the coordinator fails
 */
@Test
public void testMinTimeBetweenCheckpointsInterval() throws Exception {
    final JobVertexID vertexId = new JobVertexID();
    final CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway recorder =
            new CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway();
    final ExecutionGraph graph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .addJobVertex(vertexId)
                    .setTaskManagerGateway(recorder)
                    .build();
    final ExecutionAttemptID attemptID =
            graph.getJobVertex(vertexId)
                    .getTaskVertices()[0]
                    .getCurrentExecutionAttempt()
                    .getAttemptId();

    final long delay = 50;
    final long checkpointInterval = 12;

    final CheckpointCoordinatorConfiguration coordinatorConfig =
            new CheckpointCoordinatorConfigurationBuilder()
                    // periodic interval is 12 ms
                    .setCheckpointInterval(checkpointInterval)
                    // timeout is very long (200 s)
                    .setCheckpointTimeout(200_000)
                    // 50 ms delay between checkpoints
                    .setMinPauseBetweenCheckpoints(delay)
                    .setMaxConcurrentCheckpoints(1)
                    .build();
    final CheckpointCoordinator coordinator =
            new CheckpointCoordinatorBuilder()
                    .setExecutionGraph(graph)
                    .setCheckpointCoordinatorConfiguration(coordinatorConfig)
                    .setCompletedCheckpointStore(new StandaloneCompletedCheckpointStore(2))
                    .setTimer(manuallyTriggeredScheduledExecutor)
                    .build();

    try {
        coordinator.startCheckpointScheduler();
        manuallyTriggeredScheduledExecutor.triggerPeriodicScheduledTasks();
        manuallyTriggeredScheduledExecutor.triggerAll();

        // The first periodic tick must have triggered checkpoint 1.
        final Long firstTriggeredId = recorder.getTriggeredCheckpoints(attemptID).get(0).checkpointId;
        assertEquals(1L, firstTriggeredId.longValue());

        // Acknowledge checkpoint 1 and remember when we did so.
        final AcknowledgeCheckpoint acknowledgement =
                new AcknowledgeCheckpoint(graph.getJobID(), attemptID, 1L);
        final long ackTime = System.nanoTime();
        coordinator.receiveAcknowledgeMessage(acknowledgement, TASK_MANAGER_LOCATION_INFO);

        recorder.resetCount();
        manuallyTriggeredScheduledExecutor.triggerPeriodicScheduledTasks();
        manuallyTriggeredScheduledExecutor.triggerAll();

        // Keep firing the periodic tasks, one interval apart, until a checkpoint is triggered.
        while (recorder.getTriggeredCheckpoints(attemptID).isEmpty()) {
            Thread.sleep(checkpointInterval);
            manuallyTriggeredScheduledExecutor.triggerPeriodicScheduledTasks();
            manuallyTriggeredScheduledExecutor.triggerAll();
        }

        final Long secondTriggeredId = recorder.getTriggeredCheckpoints(attemptID).get(0).checkpointId;
        final long nextCheckpointTime = System.nanoTime();
        assertEquals(2L, secondTriggeredId.longValue());

        final long elapsedNanos = nextCheckpointTime - ackTime;
        final long delayMillis = elapsedNanos / 1_000_000;

        // One millisecond of slack absorbs the truncation of the nanosecond difference.
        if (delayMillis + 1 < delay) {
            fail("checkpoint came too early: delay was " + delayMillis + " but should have been at least " + delay);
        }
    } finally {
        coordinator.stopCheckpointScheduler();
        coordinator.shutdown();
    }
}
Also used : ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) CheckpointCoordinatorConfigurationBuilder(org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration.CheckpointCoordinatorConfigurationBuilder) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) CheckpointCoordinatorConfiguration(org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) CheckpointCoordinatorBuilder(org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder) AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph) Test(org.junit.Test)

Example 24 with CheckpointCoordinatorConfiguration

use of org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration in project flink by apache.

the class JobMasterStopWithSavepointITCase method setUpJobGraph.

/**
 * Prepares the fixture for a test: re-creates the latches and savepoint bookkeeping, builds a
 * streaming job graph with a single vertex running {@code invokable}, submits it through the
 * mini cluster client, waits up to 60 seconds on {@code invokeLatch} and finally calls
 * {@code waitForJob()}.
 *
 * <p>The setup is guarded by a JUnit assumption that the cluster client is a
 * {@link MiniClusterClient}.
 *
 * @param invokable the invokable class assigned to the job's only vertex
 * @param restartStrategy the restart strategy placed into the job's execution config
 * @throws Exception if creating the savepoint folder, submitting the job or waiting fails
 */
private void setUpJobGraph(final Class<? extends TaskInvokable> invokable, final RestartStrategies.RestartStrategyConfiguration restartStrategy) throws Exception {
    // Fresh synchronisation state for every test run.
    finishingLatch = new OneShotLatch();
    invokeLatch = new CountDownLatch(PARALLELISM);
    numberOfRestarts = new CountDownLatch(2);
    checkpointsToWaitFor = new CountDownLatch(10);
    syncSavepointId.set(-1);

    savepointDirectory = temporaryFolder.newFolder().toPath();

    Assume.assumeTrue(
            "ClusterClient is not an instance of MiniClusterClient",
            MINI_CLUSTER_RESOURCE.getClusterClient() instanceof MiniClusterClient);
    clusterClient = (MiniClusterClient) MINI_CLUSTER_RESOURCE.getClusterClient();

    final ExecutionConfig executionConfig = new ExecutionConfig();
    executionConfig.setRestartStrategy(restartStrategy);

    final JobVertex testVertex = new JobVertex("testVertex");
    testVertex.setInvokableClass(invokable);
    testVertex.setParallelism(PARALLELISM);

    // Positional constructor arguments are kept exactly as in the original call.
    final CheckpointCoordinatorConfiguration coordinatorConfig =
            new CheckpointCoordinatorConfiguration(
                    CHECKPOINT_INTERVAL,
                    60_000,
                    10,
                    1,
                    CheckpointRetentionPolicy.NEVER_RETAIN_AFTER_TERMINATION,
                    true,
                    false,
                    0,
                    0);
    final JobCheckpointingSettings checkpointingSettings =
            new JobCheckpointingSettings(coordinatorConfig, null);

    jobGraph =
            JobGraphBuilder.newStreamingJobGraphBuilder()
                    .setExecutionConfig(executionConfig)
                    .addJobVertex(testVertex)
                    .setJobCheckpointingSettings(checkpointingSettings)
                    .build();

    // Block until the submission has been acknowledged.
    clusterClient.submitJob(jobGraph).get();
    assertTrue(invokeLatch.await(60, TimeUnit.SECONDS));
    waitForJob();
}
Also used : JobVertex(org.apache.flink.runtime.jobgraph.JobVertex) OneShotLatch(org.apache.flink.core.testutils.OneShotLatch) JobCheckpointingSettings(org.apache.flink.runtime.jobgraph.tasks.JobCheckpointingSettings) CheckpointCoordinatorConfiguration(org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration) ExecutionConfig(org.apache.flink.api.common.ExecutionConfig) CountDownLatch(java.util.concurrent.CountDownLatch) MiniClusterClient(org.apache.flink.client.program.MiniClusterClient)

Example 25 with CheckpointCoordinatorConfiguration

use of org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration in project flink by apache.

the class JobMasterTriggerSavepointITCase method setUpWithCheckpointInterval.

/**
 * Prepares the fixture for a test: re-creates the latches, builds a streaming job graph with a
 * single {@code NoOpBlockingInvokable} vertex of parallelism 1 and the given checkpoint
 * interval, submits it through the mini cluster client, waits up to 60 seconds on
 * {@code invokeLatch} and finally calls {@code waitForJob()}.
 *
 * <p>The setup is guarded by a JUnit assumption that the cluster client is a
 * {@link MiniClusterClient}.
 *
 * @param checkpointInterval value passed as the first argument of the checkpoint coordinator
 *     configuration
 * @throws Exception if creating the savepoint folder, submitting the job or waiting fails
 */
private void setUpWithCheckpointInterval(long checkpointInterval) throws Exception {
    // Fresh synchronisation state for every test run.
    invokeLatch = new CountDownLatch(1);
    triggerCheckpointLatch = new CountDownLatch(1);

    savepointDirectory = temporaryFolder.newFolder().toPath();

    Assume.assumeTrue(
            "ClusterClient is not an instance of MiniClusterClient",
            MINI_CLUSTER_RESOURCE.getClusterClient() instanceof MiniClusterClient);
    clusterClient = (MiniClusterClient) MINI_CLUSTER_RESOURCE.getClusterClient();

    final JobVertex testVertex = new JobVertex("testVertex");
    testVertex.setInvokableClass(NoOpBlockingInvokable.class);
    testVertex.setParallelism(1);

    // Positional constructor arguments are kept exactly as in the original call.
    final CheckpointCoordinatorConfiguration coordinatorConfig =
            new CheckpointCoordinatorConfiguration(
                    checkpointInterval,
                    60_000,
                    10,
                    1,
                    CheckpointRetentionPolicy.NEVER_RETAIN_AFTER_TERMINATION,
                    true,
                    false,
                    0,
                    0);
    final JobCheckpointingSettings checkpointingSettings =
            new JobCheckpointingSettings(coordinatorConfig, null);

    jobGraph =
            JobGraphBuilder.newStreamingJobGraphBuilder()
                    .addJobVertex(testVertex)
                    .setJobCheckpointingSettings(checkpointingSettings)
                    .build();

    // Block until the submission has been acknowledged.
    clusterClient.submitJob(jobGraph).get();
    assertTrue(invokeLatch.await(60, TimeUnit.SECONDS));
    waitForJob();
}
Also used : JobVertex(org.apache.flink.runtime.jobgraph.JobVertex) JobCheckpointingSettings(org.apache.flink.runtime.jobgraph.tasks.JobCheckpointingSettings) CheckpointCoordinatorConfiguration(org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration) CountDownLatch(java.util.concurrent.CountDownLatch) MiniClusterClient(org.apache.flink.client.program.MiniClusterClient)

Aggregations

CheckpointCoordinatorConfiguration (org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration)25 JobCheckpointingSettings (org.apache.flink.runtime.jobgraph.tasks.JobCheckpointingSettings)13 ExecutionGraph (org.apache.flink.runtime.executiongraph.ExecutionGraph)10 JobVertexID (org.apache.flink.runtime.jobgraph.JobVertexID)10 Test (org.junit.Test)10 CheckpointCoordinatorBuilder (org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder)9 JobVertex (org.apache.flink.runtime.jobgraph.JobVertex)8 ExecutionVertex (org.apache.flink.runtime.executiongraph.ExecutionVertex)7 ExecutionAttemptID (org.apache.flink.runtime.executiongraph.ExecutionAttemptID)6 JobGraph (org.apache.flink.runtime.jobgraph.JobGraph)6 AcknowledgeCheckpoint (org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint)6 DeclineCheckpoint (org.apache.flink.runtime.messages.checkpoint.DeclineCheckpoint)4 IOException (java.io.IOException)3 JobID (org.apache.flink.api.common.JobID)3 ArrayList (java.util.ArrayList)2 CountDownLatch (java.util.concurrent.CountDownLatch)2 ExecutionException (java.util.concurrent.ExecutionException)2 ExecutionConfig (org.apache.flink.api.common.ExecutionConfig)2 Time (org.apache.flink.api.common.time.Time)2 MiniClusterClient (org.apache.flink.client.program.MiniClusterClient)2