use of org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration in project flink by apache.
the class CheckpointCoordinatorTest method testStateCleanupForLateOrUnknownMessages.
/**
 * Tests that late acknowledge checkpoint messages are properly cleaned up. Furthermore, it tests
 * that unknown checkpoint messages for the same job are cleaned up as well. In contrast,
 * checkpoint messages from other jobs must not be touched. A late acknowledge message is an
 * acknowledge message which arrives after the checkpoint has been declined.
 *
 * @throws Exception if triggering the checkpoint or delivering one of the messages fails
 */
@Test
public void testStateCleanupForLateOrUnknownMessages() throws Exception {
    final JobVertexID firstVertexId = new JobVertexID();
    final JobVertexID secondVertexId = new JobVertexID();
    final CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway recorderGateway =
            new CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway();
    final ExecutionGraph executionGraph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .addJobVertex(firstVertexId)
                    .addJobVertex(secondVertexId, false)
                    .setTaskManagerGateway(recorderGateway)
                    .build();
    final JobID jobId = executionGraph.getJobID();

    final ExecutionVertex firstVertex = executionGraph.getJobVertex(firstVertexId).getTaskVertices()[0];
    final ExecutionVertex secondVertex = executionGraph.getJobVertex(secondVertexId).getTaskVertices()[0];
    final ExecutionAttemptID firstAttempt = firstVertex.getCurrentExecutionAttempt().getAttemptId();
    final ExecutionAttemptID secondAttempt = secondVertex.getCurrentExecutionAttempt().getAttemptId();

    final CheckpointCoordinatorConfiguration coordinatorConfig =
            new CheckpointCoordinatorConfiguration.CheckpointCoordinatorConfigurationBuilder()
                    .setMaxConcurrentCheckpoints(1)
                    .build();
    final CheckpointCoordinator coordinator =
            new CheckpointCoordinatorBuilder()
                    .setExecutionGraph(executionGraph)
                    .setCheckpointCoordinatorConfiguration(coordinatorConfig)
                    .setTimer(manuallyTriggeredScheduledExecutor)
                    .build();

    // Trigger a single checkpoint and make sure it is pending before sending any message.
    final CompletableFuture<CompletedCheckpoint> triggerFuture = coordinator.triggerCheckpoint(false);
    manuallyTriggeredScheduledExecutor.triggerAll();
    FutureUtils.throwIfCompletedExceptionally(triggerFuture);
    assertEquals(1, coordinator.getNumberOfPendingCheckpoints());

    final PendingCheckpoint pending = coordinator.getPendingCheckpoints().values().iterator().next();
    final long pendingId = pending.getCheckpointId();

    final OperatorID firstOperatorId =
            firstVertex.getJobVertex().getOperatorIDs().get(0).getGeneratedOperatorID();
    final TaskStateSnapshot firstVertexSnapshot = spy(new TaskStateSnapshot());
    final OperatorSubtaskState firstVertexSubtaskState = mock(OperatorSubtaskState.class);
    firstVertexSnapshot.putSubtaskStateByOperatorID(firstOperatorId, firstVertexSubtaskState);

    // Regular acknowledge from the first vertex: its state must be kept.
    coordinator.receiveAcknowledgeMessage(
            new AcknowledgeCheckpoint(
                    jobId, firstAttempt, pendingId, new CheckpointMetrics(), firstVertexSnapshot),
            TASK_MANAGER_LOCATION_INFO);
    verify(firstVertexSubtaskState, never()).discardState();

    // Acknowledge from an unknown execution attempt of our own job: its state must be discarded.
    final TaskStateSnapshot unknownAttemptState = mock(TaskStateSnapshot.class);
    coordinator.receiveAcknowledgeMessage(
            new AcknowledgeCheckpoint(
                    jobId,
                    new ExecutionAttemptID(),
                    pendingId,
                    new CheckpointMetrics(),
                    unknownAttemptState),
            TASK_MANAGER_LOCATION_INFO);
    verify(unknownAttemptState, times(1)).discardState();

    // Acknowledge that belongs to a different job: we must not interfere with it.
    final TaskStateSnapshot foreignJobState = mock(TaskStateSnapshot.class);
    coordinator.receiveAcknowledgeMessage(
            new AcknowledgeCheckpoint(
                    new JobID(),
                    new ExecutionAttemptID(),
                    pendingId,
                    new CheckpointMetrics(),
                    foreignJobState),
            TASK_MANAGER_LOCATION_INFO);
    verify(foreignJobState, never()).discardState();

    // Duplicate acknowledge from the already acknowledged first vertex: nothing is discarded.
    final TaskStateSnapshot duplicateAckState = mock(TaskStateSnapshot.class);
    coordinator.receiveAcknowledgeMessage(
            new AcknowledgeCheckpoint(
                    jobId, firstAttempt, pendingId, new CheckpointMetrics(), duplicateAckState),
            TASK_MANAGER_LOCATION_INFO);
    verify(duplicateAckState, never()).discardState();

    // Decline the checkpoint at the first vertex; the mock is reset so that only
    // interactions caused by the decline are counted below.
    reset(firstVertexSubtaskState);
    coordinator.receiveDeclineMessage(
            new DeclineCheckpoint(
                    jobId, firstAttempt, pendingId, new CheckpointException(CHECKPOINT_DECLINED)),
            TASK_MANAGER_LOCATION_INFO);
    assertTrue(pending.isDisposed());
    // The state that had already been acknowledged must have been cleaned up.
    verify(firstVertexSubtaskState, times(1)).discardState();

    // Late acknowledge from the second vertex, arriving after the decline: state is discarded.
    final TaskStateSnapshot lateAckState = mock(TaskStateSnapshot.class);
    coordinator.receiveAcknowledgeMessage(
            new AcknowledgeCheckpoint(
                    jobId, secondAttempt, pendingId, new CheckpointMetrics(), lateAckState),
            TASK_MANAGER_LOCATION_INFO);
    verify(lateAckState, times(1)).discardState();

    // After the decline, messages from a different job are still left untouched.
    reset(foreignJobState);
    coordinator.receiveAcknowledgeMessage(
            new AcknowledgeCheckpoint(
                    new JobID(),
                    new ExecutionAttemptID(),
                    pendingId,
                    new CheckpointMetrics(),
                    foreignJobState),
            TASK_MANAGER_LOCATION_INFO);
    verify(foreignJobState, never()).discardState();

    // After the decline, an unknown execution attempt of our job still gets its state discarded.
    final TaskStateSnapshot unknownAttemptStateAfterDecline = mock(TaskStateSnapshot.class);
    coordinator.receiveAcknowledgeMessage(
            new AcknowledgeCheckpoint(
                    jobId,
                    new ExecutionAttemptID(),
                    pendingId,
                    new CheckpointMetrics(),
                    unknownAttemptStateAfterDecline),
            TASK_MANAGER_LOCATION_INFO);
    verify(unknownAttemptStateAfterDecline, times(1)).discardState();
}
use of org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration in project flink by apache.
the class CheckpointCoordinatorTriggeringTest method testPeriodicTriggering.
/**
 * Drives the periodic checkpoint scheduler by hand and checks that checkpoints are only
 * triggered while the scheduler is started.
 *
 * <p>The scheduler is started, driven for five rounds, stopped, and driven once more; the same
 * sequence is then repeated after resetting the gateway's recorded triggers. Each time, exactly
 * five triggered checkpoints are expected for the vertex's execution attempt.
 *
 * @throws Exception if building, driving or shutting down the coordinator fails; the exception
 *     is propagated unchanged so the test report keeps the full stack trace and cause
 */
@Test
public void testPeriodicTriggering() throws Exception {
    final long start = System.currentTimeMillis();
    CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway gateway =
            new CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway();
    JobVertexID jobVertexID = new JobVertexID();
    ExecutionGraph graph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .addJobVertex(jobVertexID)
                    .setTaskManagerGateway(gateway)
                    .build();
    ExecutionVertex vertex = graph.getJobVertex(jobVertexID).getTaskVertices()[0];
    ExecutionAttemptID attemptID = vertex.getCurrentExecutionAttempt().getAttemptId();
    CheckpointCoordinatorConfiguration checkpointCoordinatorConfiguration =
            new CheckpointCoordinatorConfigurationBuilder()
                    // periodic interval is 10 ms
                    .setCheckpointInterval(10)
                    // timeout is very long (200 s)
                    .setCheckpointTimeout(200000)
                    .setMaxConcurrentCheckpoints(Integer.MAX_VALUE)
                    .build();
    CheckpointCoordinator checkpointCoordinator =
            new CheckpointCoordinatorBuilder()
                    .setExecutionGraph(graph)
                    .setCheckpointCoordinatorConfiguration(checkpointCoordinatorConfiguration)
                    .setCompletedCheckpointStore(new StandaloneCompletedCheckpointStore(2))
                    .setTimer(manuallyTriggeredScheduledExecutor)
                    .build();
    try {
        checkpointCoordinator.startCheckpointScheduler();
        for (int i = 0; i < 5; ++i) {
            manuallyTriggeredScheduledExecutor.triggerPeriodicScheduledTasks();
            manuallyTriggeredScheduledExecutor.triggerAll();
        }
        checkRecordedTriggeredCheckpoints(5, start, gateway.getTriggeredCheckpoints(attemptID));
        checkpointCoordinator.stopCheckpointScheduler();
        // no further calls may come.
        manuallyTriggeredScheduledExecutor.triggerPeriodicScheduledTasks();
        manuallyTriggeredScheduledExecutor.triggerAll();
        assertEquals(5, gateway.getTriggeredCheckpoints(attemptID).size());

        // start another sequence of periodic scheduling
        gateway.resetCount();
        checkpointCoordinator.startCheckpointScheduler();
        for (int i = 0; i < 5; ++i) {
            manuallyTriggeredScheduledExecutor.triggerPeriodicScheduledTasks();
            manuallyTriggeredScheduledExecutor.triggerAll();
        }
        checkRecordedTriggeredCheckpoints(5, start, gateway.getTriggeredCheckpoints(attemptID));
        checkpointCoordinator.stopCheckpointScheduler();
        // no further calls may come
        manuallyTriggeredScheduledExecutor.triggerPeriodicScheduledTasks();
        manuallyTriggeredScheduledExecutor.triggerAll();
        assertEquals(5, gateway.getTriggeredCheckpoints(attemptID).size());
    } finally {
        // Shut the coordinator down even when an assertion above fails.
        checkpointCoordinator.shutdown();
    }
}
use of org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration in project flink by apache.
the class CheckpointCoordinatorTriggeringTest method testMinTimeBetweenCheckpointsInterval.
/**
 * Verifies that, after a checkpoint has been acknowledged, at least the configured minimum
 * pause elapses before the next checkpoint is triggered.
 *
 * @throws Exception if setting up or driving the checkpoint coordinator fails
 */
@Test
public void testMinTimeBetweenCheckpointsInterval() throws Exception {
    final JobVertexID vertexId = new JobVertexID();
    final CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway recorderGateway =
            new CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway();
    final ExecutionGraph executionGraph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .addJobVertex(vertexId)
                    .setTaskManagerGateway(recorderGateway)
                    .build();
    final ExecutionVertex executionVertex = executionGraph.getJobVertex(vertexId).getTaskVertices()[0];
    final ExecutionAttemptID attempt = executionVertex.getCurrentExecutionAttempt().getAttemptId();

    final long minPauseMillis = 50;
    final long intervalMillis = 12;
    final CheckpointCoordinatorConfiguration coordinatorConfig =
            new CheckpointCoordinatorConfigurationBuilder()
                    // periodic interval is 12 ms
                    .setCheckpointInterval(intervalMillis)
                    // timeout is very long (200 s)
                    .setCheckpointTimeout(200_000)
                    // 50 ms delay between checkpoints
                    .setMinPauseBetweenCheckpoints(minPauseMillis)
                    .setMaxConcurrentCheckpoints(1)
                    .build();
    final CheckpointCoordinator coordinator =
            new CheckpointCoordinatorBuilder()
                    .setExecutionGraph(executionGraph)
                    .setCheckpointCoordinatorConfiguration(coordinatorConfig)
                    .setCompletedCheckpointStore(new StandaloneCompletedCheckpointStore(2))
                    .setTimer(manuallyTriggeredScheduledExecutor)
                    .build();

    try {
        coordinator.startCheckpointScheduler();
        manuallyTriggeredScheduledExecutor.triggerPeriodicScheduledTasks();
        manuallyTriggeredScheduledExecutor.triggerAll();

        // The first scheduling round must have triggered checkpoint 1.
        final long firstTriggeredId = recorderGateway.getTriggeredCheckpoints(attempt).get(0).checkpointId;
        assertEquals(1L, firstTriggeredId);

        final AcknowledgeCheckpoint acknowledgement =
                new AcknowledgeCheckpoint(executionGraph.getJobID(), attempt, 1L);

        // Take the timestamp just before acknowledging, so the measured delay below
        // can only over-estimate the real pause.
        final long acknowledgedAtNanos = System.nanoTime();
        coordinator.receiveAcknowledgeMessage(acknowledgement, TASK_MANAGER_LOCATION_INFO);

        recorderGateway.resetCount();
        manuallyTriggeredScheduledExecutor.triggerPeriodicScheduledTasks();
        manuallyTriggeredScheduledExecutor.triggerAll();
        while (recorderGateway.getTriggeredCheckpoints(attempt).isEmpty()) {
            // Sleep one interval per round to simulate periodic scheduling.
            Thread.sleep(intervalMillis);
            manuallyTriggeredScheduledExecutor.triggerPeriodicScheduledTasks();
            manuallyTriggeredScheduledExecutor.triggerAll();
        }

        // The loop above ends once the next checkpoint has been triggered.
        final long secondTriggeredId = recorderGateway.getTriggeredCheckpoints(attempt).get(0).checkpointId;
        final long retriggeredAtNanos = System.nanoTime();
        assertEquals(2L, secondTriggeredId);

        final long delayMillis = (retriggeredAtNanos - acknowledgedAtNanos) / 1_000_000;
        // One millisecond of slack accounts for the truncating nanos-to-millis division.
        if (delayMillis + 1 < minPauseMillis) {
            fail("checkpoint came too early: delay was " + delayMillis + " but should have been at least " + minPauseMillis);
        }
    } finally {
        coordinator.stopCheckpointScheduler();
        coordinator.shutdown();
    }
}
use of org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration in project flink by apache.
the class JobMasterStopWithSavepointITCase method setUpJobGraph.
/**
 * Re-initializes the per-test latches and savepoint bookkeeping, builds a streaming job graph
 * consisting of a single vertex running {@code invokable}, submits it through the mini cluster
 * client, and then waits for the invoke latch followed by {@code waitForJob()}.
 *
 * <p>If the cluster client is not a {@link MiniClusterClient}, the JUnit assumption fails and
 * the calling test is skipped.
 *
 * @param invokable the invokable class set on the single job vertex
 * @param restartStrategy the restart strategy set on the job's execution config
 * @throws Exception if creating the savepoint folder, submitting the job or waiting fails
 */
private void setUpJobGraph(final Class<? extends TaskInvokable> invokable, final RestartStrategies.RestartStrategyConfiguration restartStrategy) throws Exception {
    // Fresh synchronization state for this run.
    finishingLatch = new OneShotLatch();
    invokeLatch = new CountDownLatch(PARALLELISM);
    numberOfRestarts = new CountDownLatch(2);
    checkpointsToWaitFor = new CountDownLatch(10);
    syncSavepointId.set(-1);
    savepointDirectory = temporaryFolder.newFolder().toPath();

    Assume.assumeTrue(
            "ClusterClient is not an instance of MiniClusterClient",
            MINI_CLUSTER_RESOURCE.getClusterClient() instanceof MiniClusterClient);
    clusterClient = (MiniClusterClient) MINI_CLUSTER_RESOURCE.getClusterClient();

    final ExecutionConfig executionConfig = new ExecutionConfig();
    executionConfig.setRestartStrategy(restartStrategy);

    final JobVertex testVertex = new JobVertex("testVertex");
    testVertex.setInvokableClass(invokable);
    testVertex.setParallelism(PARALLELISM);

    final CheckpointCoordinatorConfiguration coordinatorConfig =
            new CheckpointCoordinatorConfiguration(
                    CHECKPOINT_INTERVAL,
                    60_000,
                    10,
                    1,
                    CheckpointRetentionPolicy.NEVER_RETAIN_AFTER_TERMINATION,
                    true,
                    false,
                    0,
                    0);
    final JobCheckpointingSettings checkpointingSettings =
            new JobCheckpointingSettings(coordinatorConfig, null);

    jobGraph =
            JobGraphBuilder.newStreamingJobGraphBuilder()
                    .setExecutionConfig(executionConfig)
                    .addJobVertex(testVertex)
                    .setJobCheckpointingSettings(checkpointingSettings)
                    .build();

    // Block until the submission future completes, then until the invoke latch is released.
    clusterClient.submitJob(jobGraph).get();
    assertTrue(invokeLatch.await(60, TimeUnit.SECONDS));
    waitForJob();
}
use of org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration in project flink by apache.
the class JobMasterTriggerSavepointITCase method setUpWithCheckpointInterval.
/**
 * Re-initializes the test latches, builds a streaming job graph with one
 * {@link NoOpBlockingInvokable} vertex of parallelism 1 using the given checkpoint interval,
 * submits it through the mini cluster client, and then waits for the invoke latch followed by
 * {@code waitForJob()}.
 *
 * <p>If the cluster client is not a {@link MiniClusterClient}, the JUnit assumption fails and
 * the calling test is skipped.
 *
 * @param checkpointInterval value passed as the first argument of the checkpoint coordinator
 *     configuration
 * @throws Exception if creating the savepoint folder, submitting the job or waiting fails
 */
private void setUpWithCheckpointInterval(long checkpointInterval) throws Exception {
    // Fresh synchronization state for this run.
    invokeLatch = new CountDownLatch(1);
    triggerCheckpointLatch = new CountDownLatch(1);
    savepointDirectory = temporaryFolder.newFolder().toPath();

    Assume.assumeTrue(
            "ClusterClient is not an instance of MiniClusterClient",
            MINI_CLUSTER_RESOURCE.getClusterClient() instanceof MiniClusterClient);
    clusterClient = (MiniClusterClient) MINI_CLUSTER_RESOURCE.getClusterClient();

    final JobVertex testVertex = new JobVertex("testVertex");
    testVertex.setInvokableClass(NoOpBlockingInvokable.class);
    testVertex.setParallelism(1);

    final CheckpointCoordinatorConfiguration coordinatorConfig =
            new CheckpointCoordinatorConfiguration(
                    checkpointInterval,
                    60_000,
                    10,
                    1,
                    CheckpointRetentionPolicy.NEVER_RETAIN_AFTER_TERMINATION,
                    true,
                    false,
                    0,
                    0);
    final JobCheckpointingSettings checkpointingSettings =
            new JobCheckpointingSettings(coordinatorConfig, null);

    jobGraph =
            JobGraphBuilder.newStreamingJobGraphBuilder()
                    .addJobVertex(testVertex)
                    .setJobCheckpointingSettings(checkpointingSettings)
                    .build();

    // Block until the submission future completes, then until the invoke latch is released.
    clusterClient.submitJob(jobGraph).get();
    assertTrue(invokeLatch.await(60, TimeUnit.SECONDS));
    waitForJob();
}
Aggregations