use of org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration.CheckpointCoordinatorConfigurationBuilder in project flink by apache.
the class CheckpointCoordinatorRestoringTest method testRestoreFinishedStateWithoutInFlightData.
@Test
public void testRestoreFinishedStateWithoutInFlightData() throws Exception {
    // given: Operator with non-empty state.
    OperatorIDPair op1 = OperatorIDPair.generatedIDOnly(new OperatorID());
    final JobVertexID jobVertexID = new JobVertexID();
    ExecutionGraph graph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .addJobVertex(jobVertexID, 1, 1, singletonList(op1), true)
                    .build();
    CompletedCheckpointStore completedCheckpointStore = new EmbeddedCompletedCheckpointStore();
    Map<OperatorID, OperatorState> operatorStates = new HashMap<>();
    operatorStates.put(
            op1.getGeneratedOperatorID(),
            new FullyFinishedOperatorState(op1.getGeneratedOperatorID(), 1, 1));
    CompletedCheckpoint completedCheckpoint =
            new CompletedCheckpoint(
                    graph.getJobID(),
                    2,
                    System.currentTimeMillis(),
                    System.currentTimeMillis() + 3000,
                    operatorStates,
                    Collections.emptyList(),
                    CheckpointProperties.forCheckpoint(
                            CheckpointRetentionPolicy.NEVER_RETAIN_AFTER_TERMINATION),
                    new TestCompletedCheckpointStorageLocation());
    completedCheckpointStore.addCheckpointAndSubsumeOldestOne(
            completedCheckpoint, new CheckpointsCleaner(), () -> {});
    CheckpointCoordinator coord =
            new CheckpointCoordinatorBuilder()
                    .setExecutionGraph(graph)
                    .setCheckpointCoordinatorConfiguration(
                            new CheckpointCoordinatorConfigurationBuilder()
                                    .setCheckpointIdOfIgnoredInFlightData(2)
                                    .build())
                    .setCompletedCheckpointStore(completedCheckpointStore)
                    .build();
    ExecutionJobVertex vertex = graph.getJobVertex(jobVertexID);
    coord.restoreInitialCheckpointIfPresent(Collections.singleton(vertex));
    TaskStateSnapshot restoredState =
            vertex.getTaskVertices()[0]
                    .getCurrentExecutionAttempt()
                    .getTaskRestore()
                    .getTaskStateSnapshot();
    assertTrue(restoredState.isTaskDeployedAsFinished());
}
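The central builder call in this test is setCheckpointIdOfIgnoredInFlightData(2), which tells the restored coordinator to discard in-flight (channel) data for that checkpoint. Below is a minimal sketch that isolates this configuration, using only builder methods that already appear in this section; the comment reflects a reading of the test, not Flink documentation.

// Minimal sketch: build a coordinator configuration that ignores in-flight data
// for checkpoint 2 (the id of the CompletedCheckpoint stored in the test above).
CheckpointCoordinatorConfiguration configWithoutInFlightData =
        new CheckpointCoordinatorConfigurationBuilder()
                .setCheckpointIdOfIgnoredInFlightData(2)
                .build();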
use of org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration.CheckpointCoordinatorConfigurationBuilder in project flink by apache.
the class CheckpointCoordinatorRestoringTest method testRestoreLatestCheckpointedStateWithoutInFlightData.
@Test
public void testRestoreLatestCheckpointedStateWithoutInFlightData() throws Exception {
    // given: Operator with non-empty state.
    final JobVertexID jobVertexID = new JobVertexID();
    int parallelism1 = 3;
    int maxParallelism1 = 42;
    CompletedCheckpointStore completedCheckpointStore = new EmbeddedCompletedCheckpointStore();
    final ExecutionGraph graph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .addJobVertex(jobVertexID, parallelism1, maxParallelism1)
                    .build();
    final ExecutionJobVertex jobVertex = graph.getJobVertex(jobVertexID);
    // set up the coordinator and validate the initial state
    CheckpointCoordinator coord =
            new CheckpointCoordinatorBuilder()
                    .setExecutionGraph(graph)
                    .setCompletedCheckpointStore(completedCheckpointStore)
                    .setCheckpointCoordinatorConfiguration(
                            new CheckpointCoordinatorConfigurationBuilder()
                                    .setCheckpointIdOfIgnoredInFlightData(1)
                                    .build())
                    .setTimer(manuallyTriggeredScheduledExecutor)
                    .build();
    // trigger the checkpoint
    coord.triggerCheckpoint(false);
    manuallyTriggeredScheduledExecutor.triggerAll();
    assertEquals(1, coord.getPendingCheckpoints().size());
    long checkpointId = Iterables.getOnlyElement(coord.getPendingCheckpoints().keySet());
    List<KeyGroupRange> keyGroupPartitions1 =
            StateAssignmentOperation.createKeyGroupPartitions(maxParallelism1, parallelism1);
    Random random = new Random();
    // fill the states and complete the checkpoint.
    for (int index = 0; index < jobVertex.getParallelism(); index++) {
        OperatorSubtaskState operatorSubtaskState =
                OperatorSubtaskState.builder()
                        .setManagedOperatorState(
                                generatePartitionableStateHandle(jobVertexID, index, 2, 8, false))
                        .setRawOperatorState(
                                generatePartitionableStateHandle(jobVertexID, index, 2, 8, true))
                        .setManagedKeyedState(
                                generateKeyGroupState(jobVertexID, keyGroupPartitions1.get(index), false))
                        .setRawKeyedState(
                                generateKeyGroupState(jobVertexID, keyGroupPartitions1.get(index), true))
                        .setInputChannelState(
                                StateObjectCollection.singleton(createNewInputChannelStateHandle(3, random)))
                        .setResultSubpartitionState(
                                StateObjectCollection.singleton(createNewResultSubpartitionStateHandle(3, random)))
                        .build();
        TaskStateSnapshot taskOperatorSubtaskStates = new TaskStateSnapshot();
        taskOperatorSubtaskStates.putSubtaskStateByOperatorID(
                OperatorID.fromJobVertexID(jobVertexID), operatorSubtaskState);
        AcknowledgeCheckpoint acknowledgeCheckpoint =
                new AcknowledgeCheckpoint(
                        graph.getJobID(),
                        jobVertex.getTaskVertices()[index].getCurrentExecutionAttempt().getAttemptId(),
                        checkpointId,
                        new CheckpointMetrics(),
                        taskOperatorSubtaskStates);
        coord.receiveAcknowledgeMessage(acknowledgeCheckpoint, TASK_MANAGER_LOCATION_INFO);
    }
    assertEquals(1, coord.getSuccessfulCheckpoints().size());
    // when: Restore latest checkpoint without in-flight data.
    Set<ExecutionJobVertex> tasks = new HashSet<>();
    tasks.add(jobVertex);
    assertTrue(coord.restoreLatestCheckpointedStateToAll(tasks, false));
    // then: All states should be restored successfully except the InputChannel and
    // ResultSubpartition states, which should be ignored.
    verifyStateRestore(jobVertexID, jobVertex, keyGroupPartitions1);
    for (int i = 0; i < jobVertex.getParallelism(); i++) {
        JobManagerTaskRestore taskRestore =
                jobVertex.getTaskVertices()[i].getCurrentExecutionAttempt().getTaskRestore();
        Assert.assertEquals(1L, taskRestore.getRestoreCheckpointId());
        TaskStateSnapshot stateSnapshot = taskRestore.getTaskStateSnapshot();
        OperatorSubtaskState operatorState =
                stateSnapshot.getSubtaskStateByOperatorID(OperatorID.fromJobVertexID(jobVertexID));
        assertTrue(operatorState.getInputChannelState().isEmpty());
        assertTrue(operatorState.getResultSubpartitionState().isEmpty());
        assertFalse(operatorState.getRawOperatorState().isEmpty());
        assertFalse(operatorState.getManagedOperatorState().isEmpty());
        assertFalse(operatorState.getRawKeyedState().isEmpty());
        assertFalse(operatorState.getManagedKeyedState().isEmpty());
    }
}
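The test relies on StateAssignmentOperation.createKeyGroupPartitions(maxParallelism1, parallelism1) to split the 42 key groups across the 3 subtasks before generating keyed state handles. The hypothetical helper below illustrates one way such an even split can be computed; it is a sketch of the idea, not Flink's implementation, and splitKeyGroups is an invented name.

// Hypothetical illustration (not Flink's code): split maxParallelism key groups evenly
// across parallelism subtasks, returning inclusive [start, end] ranges per subtask.
static int[][] splitKeyGroups(int maxParallelism, int parallelism) {
    int[][] ranges = new int[parallelism][2];
    for (int i = 0; i < parallelism; i++) {
        ranges[i][0] = (i * maxParallelism + parallelism - 1) / parallelism;
        ranges[i][1] = ((i + 1) * maxParallelism + parallelism - 1) / parallelism - 1;
    }
    return ranges; // for (42, 3) this yields [0,13], [14,27], [28,41]
}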
use of org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration.CheckpointCoordinatorConfigurationBuilder in project flink by apache.
the class CheckpointCoordinatorTriggeringTest method testPeriodicTriggering.
@Test
public void testPeriodicTriggering() {
    try {
        final long start = System.currentTimeMillis();
        CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway gateway =
                new CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway();
        JobVertexID jobVertexID = new JobVertexID();
        ExecutionGraph graph =
                new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                        .addJobVertex(jobVertexID)
                        .setTaskManagerGateway(gateway)
                        .build();
        ExecutionVertex vertex = graph.getJobVertex(jobVertexID).getTaskVertices()[0];
        ExecutionAttemptID attemptID = vertex.getCurrentExecutionAttempt().getAttemptId();
        CheckpointCoordinatorConfiguration checkpointCoordinatorConfiguration =
                new CheckpointCoordinatorConfigurationBuilder()
                        .setCheckpointInterval(10) // periodic interval is 10 ms
                        .setCheckpointTimeout(200000) // timeout is very long (200 s)
                        .setMaxConcurrentCheckpoints(Integer.MAX_VALUE)
                        .build();
        CheckpointCoordinator checkpointCoordinator =
                new CheckpointCoordinatorBuilder()
                        .setExecutionGraph(graph)
                        .setCheckpointCoordinatorConfiguration(checkpointCoordinatorConfiguration)
                        .setCompletedCheckpointStore(new StandaloneCompletedCheckpointStore(2))
                        .setTimer(manuallyTriggeredScheduledExecutor)
                        .build();
        checkpointCoordinator.startCheckpointScheduler();
        for (int i = 0; i < 5; ++i) {
            manuallyTriggeredScheduledExecutor.triggerPeriodicScheduledTasks();
            manuallyTriggeredScheduledExecutor.triggerAll();
        }
        checkRecordedTriggeredCheckpoints(5, start, gateway.getTriggeredCheckpoints(attemptID));
        checkpointCoordinator.stopCheckpointScheduler();
        // no further calls may come.
        manuallyTriggeredScheduledExecutor.triggerPeriodicScheduledTasks();
        manuallyTriggeredScheduledExecutor.triggerAll();
        assertEquals(5, gateway.getTriggeredCheckpoints(attemptID).size());
        // start another sequence of periodic scheduling
        gateway.resetCount();
        checkpointCoordinator.startCheckpointScheduler();
        for (int i = 0; i < 5; ++i) {
            manuallyTriggeredScheduledExecutor.triggerPeriodicScheduledTasks();
            manuallyTriggeredScheduledExecutor.triggerAll();
        }
        checkRecordedTriggeredCheckpoints(5, start, gateway.getTriggeredCheckpoints(attemptID));
        checkpointCoordinator.stopCheckpointScheduler();
        // no further calls may come
        manuallyTriggeredScheduledExecutor.triggerPeriodicScheduledTasks();
        manuallyTriggeredScheduledExecutor.triggerAll();
        assertEquals(5, gateway.getTriggeredCheckpoints(attemptID).size());
        checkpointCoordinator.shutdown();
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
}
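The test drives the checkpoint timer by hand: each loop iteration fires the periodic trigger task (triggerPeriodicScheduledTasks) and then runs any follow-up work it scheduled (triggerAll). A small helper along the following lines could factor out that loop; fireNPeriodicTriggers is a hypothetical name, and the executor type is assumed to be the ManuallyTriggeredScheduledExecutor test utility behind the manuallyTriggeredScheduledExecutor field.

// Hypothetical helper (not part of the Flink test): fire the periodic checkpoint
// trigger n times on the manually driven timer, draining follow-up tasks each round.
private static void fireNPeriodicTriggers(ManuallyTriggeredScheduledExecutor executor, int n) {
    for (int i = 0; i < n; i++) {
        executor.triggerPeriodicScheduledTasks();
        executor.triggerAll();
    }
}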
use of org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration.CheckpointCoordinatorConfigurationBuilder in project flink by apache.
the class CheckpointCoordinatorTriggeringTest method testMinTimeBetweenCheckpointsInterval.
/**
 * Verifies that, after a checkpoint completes, the configured minimum pause elapses before the
 * next checkpoint is triggered.
 */
@Test
public void testMinTimeBetweenCheckpointsInterval() throws Exception {
    JobVertexID jobVertexID = new JobVertexID();
    CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway gateway =
            new CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway();
    ExecutionGraph graph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .addJobVertex(jobVertexID)
                    .setTaskManagerGateway(gateway)
                    .build();
    ExecutionVertex vertex = graph.getJobVertex(jobVertexID).getTaskVertices()[0];
    ExecutionAttemptID attemptID = vertex.getCurrentExecutionAttempt().getAttemptId();
    final long delay = 50;
    final long checkpointInterval = 12;
    CheckpointCoordinatorConfiguration checkpointCoordinatorConfiguration =
            new CheckpointCoordinatorConfigurationBuilder()
                    .setCheckpointInterval(checkpointInterval) // periodic interval is 12 ms
                    .setCheckpointTimeout(200_000) // timeout is very long (200 s)
                    .setMinPauseBetweenCheckpoints(delay) // 50 ms delay between checkpoints
                    .setMaxConcurrentCheckpoints(1)
                    .build();
    final CheckpointCoordinator checkpointCoordinator =
            new CheckpointCoordinatorBuilder()
                    .setExecutionGraph(graph)
                    .setCheckpointCoordinatorConfiguration(checkpointCoordinatorConfiguration)
                    .setCompletedCheckpointStore(new StandaloneCompletedCheckpointStore(2))
                    .setTimer(manuallyTriggeredScheduledExecutor)
                    .build();
    try {
        checkpointCoordinator.startCheckpointScheduler();
        manuallyTriggeredScheduledExecutor.triggerPeriodicScheduledTasks();
        manuallyTriggeredScheduledExecutor.triggerAll();
        // wait until the first checkpoint was triggered
        Long firstCallId = gateway.getTriggeredCheckpoints(attemptID).get(0).checkpointId;
        assertEquals(1L, firstCallId.longValue());
        AcknowledgeCheckpoint ackMsg = new AcknowledgeCheckpoint(graph.getJobID(), attemptID, 1L);
        // tell the coordinator that the checkpoint is done
        final long ackTime = System.nanoTime();
        checkpointCoordinator.receiveAcknowledgeMessage(ackMsg, TASK_MANAGER_LOCATION_INFO);
        gateway.resetCount();
        manuallyTriggeredScheduledExecutor.triggerPeriodicScheduledTasks();
        manuallyTriggeredScheduledExecutor.triggerAll();
        while (gateway.getTriggeredCheckpoints(attemptID).isEmpty()) {
            // sleeps for a while to simulate periodic scheduling
            Thread.sleep(checkpointInterval);
            manuallyTriggeredScheduledExecutor.triggerPeriodicScheduledTasks();
            manuallyTriggeredScheduledExecutor.triggerAll();
        }
        // wait until the next checkpoint is triggered
        Long nextCallId = gateway.getTriggeredCheckpoints(attemptID).get(0).checkpointId;
        final long nextCheckpointTime = System.nanoTime();
        assertEquals(2L, nextCallId.longValue());
        final long delayMillis = (nextCheckpointTime - ackTime) / 1_000_000;
        // we need to add one ms here to account for rounding errors
        if (delayMillis + 1 < delay) {
            fail("checkpoint came too early: delay was " + delayMillis + " but should have been at least " + delay);
        }
    } finally {
        checkpointCoordinator.stopCheckpointScheduler();
        checkpointCoordinator.shutdown();
    }
}
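The final check converts the System.nanoTime() difference to milliseconds and allows one extra millisecond for truncation, so with delay = 50 the second trigger must land no earlier than roughly 49 ms after the acknowledgement. The same guard can be written as a single assertion; the sketch below uses JUnit's assertTrue, and the message text is illustrative rather than taken from the test.

// Equivalent guard (sketch): truncate nanoseconds to milliseconds and allow 1 ms slack.
long elapsedMillis = (nextCheckpointTime - ackTime) / 1_000_000;
assertTrue(
        "checkpoint came too early: " + elapsedMillis + " ms, expected at least " + delay + " ms",
        elapsedMillis + 1 >= delay);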