use of org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint in project flink by apache.
the class CheckpointCoordinatorTest method testSuccessfulCheckpointSubsumesUnsuccessful.
/**
 * Verifies that a checkpoint which completes while an older checkpoint is still pending
 * subsumes that older checkpoint.
 *
 * <p>Two checkpoints are triggered back to back. The first one only ever receives
 * acknowledgements from two of the three tasks before the second one is acknowledged by all
 * three. The test then asserts that both pending checkpoints are disposed, that exactly one
 * completed checkpoint (the second) is retained, that the subtask states already reported for
 * the first checkpoint are discarded while those of the second are kept, and that a late
 * acknowledgement for the first checkpoint only results in its state being discarded.
 */
@Test
public void testSuccessfulCheckpointSubsumesUnsuccessful() throws Exception {
JobVertexID jobVertexID1 = new JobVertexID();
JobVertexID jobVertexID2 = new JobVertexID();
JobVertexID jobVertexID3 = new JobVertexID();
// the recording gateway lets the test inspect which trigger / notify messages were sent
CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway gateway = new CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway();
// NOTE(review): the `false` flag on the third vertex presumably marks it as a vertex that is
// not sent trigger messages - confirm against CheckpointExecutionGraphBuilder. The trigger
// assertions below only cover vertex1 and vertex2.
ExecutionGraph graph = new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder().addJobVertex(jobVertexID1).addJobVertex(jobVertexID2).addJobVertex(jobVertexID3, false).setTaskManagerGateway(gateway).build();
ExecutionVertex vertex1 = graph.getJobVertex(jobVertexID1).getTaskVertices()[0];
ExecutionVertex vertex2 = graph.getJobVertex(jobVertexID2).getTaskVertices()[0];
ExecutionVertex vertex3 = graph.getJobVertex(jobVertexID3).getTaskVertices()[0];
ExecutionAttemptID attemptID1 = vertex1.getCurrentExecutionAttempt().getAttemptId();
ExecutionAttemptID attemptID2 = vertex2.getCurrentExecutionAttempt().getAttemptId();
ExecutionAttemptID attemptID3 = vertex3.getCurrentExecutionAttempt().getAttemptId();
// set up the coordinator and validate the initial state
// (the concurrency limit is lifted so that two checkpoints can be pending at the same time)
final StandaloneCompletedCheckpointStore completedCheckpointStore = new StandaloneCompletedCheckpointStore(10);
CheckpointCoordinator checkpointCoordinator = new CheckpointCoordinatorBuilder().setExecutionGraph(graph).setCheckpointCoordinatorConfiguration(CheckpointCoordinatorConfiguration.builder().setMaxConcurrentCheckpoints(Integer.MAX_VALUE).build()).setCompletedCheckpointStore(completedCheckpointStore).setTimer(manuallyTriggeredScheduledExecutor).build();
assertEquals(0, checkpointCoordinator.getNumberOfPendingCheckpoints());
assertEquals(0, checkpointCoordinator.getNumberOfRetainedSuccessfulCheckpoints());
// trigger the first checkpoint; the trigger itself should succeed
final CompletableFuture<CompletedCheckpoint> checkpointFuture1 = checkpointCoordinator.triggerCheckpoint(false);
// the timer is driven manually, so queued work only runs once it is triggered here
manuallyTriggeredScheduledExecutor.triggerAll();
FutureUtils.throwIfCompletedExceptionally(checkpointFuture1);
assertEquals(1, checkpointCoordinator.getNumberOfPendingCheckpoints());
assertEquals(0, checkpointCoordinator.getNumberOfRetainedSuccessfulCheckpoints());
PendingCheckpoint pending1 = checkpointCoordinator.getPendingCheckpoints().values().iterator().next();
long checkpointId1 = pending1.getCheckpointId();
// trigger messages should have been sent
for (ExecutionVertex vertex : Arrays.asList(vertex1, vertex2)) {
ExecutionAttemptID attemptId = vertex.getCurrentExecutionAttempt().getAttemptId();
assertEquals(checkpointId1, gateway.getOnlyTriggeredCheckpoint(attemptId).checkpointId);
}
OperatorID opID1 = vertex1.getJobVertex().getOperatorIDs().get(0).getGeneratedOperatorID();
OperatorID opID2 = vertex2.getJobVertex().getOperatorIDs().get(0).getGeneratedOperatorID();
OperatorID opID3 = vertex3.getJobVertex().getOperatorIDs().get(0).getGeneratedOperatorID();
// per-task state for the first checkpoint; the mocked subtask states make it possible to
// verify later whether discardState() was invoked on them
TaskStateSnapshot taskOperatorSubtaskStates11 = spy(new TaskStateSnapshot());
TaskStateSnapshot taskOperatorSubtaskStates12 = spy(new TaskStateSnapshot());
TaskStateSnapshot taskOperatorSubtaskStates13 = spy(new TaskStateSnapshot());
OperatorSubtaskState subtaskState11 = mock(OperatorSubtaskState.class);
OperatorSubtaskState subtaskState12 = mock(OperatorSubtaskState.class);
OperatorSubtaskState subtaskState13 = mock(OperatorSubtaskState.class);
taskOperatorSubtaskStates11.putSubtaskStateByOperatorID(opID1, subtaskState11);
taskOperatorSubtaskStates12.putSubtaskStateByOperatorID(opID2, subtaskState12);
taskOperatorSubtaskStates13.putSubtaskStateByOperatorID(opID3, subtaskState13);
// acknowledge one of the three tasks (task 2) for the first checkpoint
checkpointCoordinator.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(graph.getJobID(), attemptID2, checkpointId1, new CheckpointMetrics(), taskOperatorSubtaskStates12), TASK_MANAGER_LOCATION_INFO);
// start the second checkpoint
// (the gateway's recorded messages are reset first so that getOnlyTriggeredCheckpoint below
// refers to the second trigger)
gateway.resetCount();
final CompletableFuture<CompletedCheckpoint> checkpointFuture2 = checkpointCoordinator.triggerCheckpoint(false);
manuallyTriggeredScheduledExecutor.triggerAll();
FutureUtils.throwIfCompletedExceptionally(checkpointFuture2);
assertEquals(2, checkpointCoordinator.getNumberOfPendingCheckpoints());
assertEquals(0, checkpointCoordinator.getNumberOfRetainedSuccessfulCheckpoints());
// pick whichever of the two pending checkpoints is not the first one, without relying on
// the iteration order of the pending-checkpoint map
PendingCheckpoint pending2;
{
Iterator<PendingCheckpoint> all = checkpointCoordinator.getPendingCheckpoints().values().iterator();
PendingCheckpoint cc1 = all.next();
PendingCheckpoint cc2 = all.next();
pending2 = pending1 == cc1 ? cc2 : cc1;
}
long checkpointId2 = pending2.getCheckpointId();
// per-task state for the second checkpoint
TaskStateSnapshot taskOperatorSubtaskStates21 = spy(new TaskStateSnapshot());
TaskStateSnapshot taskOperatorSubtaskStates22 = spy(new TaskStateSnapshot());
TaskStateSnapshot taskOperatorSubtaskStates23 = spy(new TaskStateSnapshot());
OperatorSubtaskState subtaskState21 = mock(OperatorSubtaskState.class);
OperatorSubtaskState subtaskState22 = mock(OperatorSubtaskState.class);
OperatorSubtaskState subtaskState23 = mock(OperatorSubtaskState.class);
taskOperatorSubtaskStates21.putSubtaskStateByOperatorID(opID1, subtaskState21);
taskOperatorSubtaskStates22.putSubtaskStateByOperatorID(opID2, subtaskState22);
taskOperatorSubtaskStates23.putSubtaskStateByOperatorID(opID3, subtaskState23);
// trigger messages should have been sent
for (ExecutionVertex vertex : Arrays.asList(vertex1, vertex2)) {
ExecutionAttemptID attemptId = vertex.getCurrentExecutionAttempt().getAttemptId();
assertEquals(checkpointId2, gateway.getOnlyTriggeredCheckpoint(attemptId).checkpointId);
}
// we acknowledge one more task from the first checkpoint and the second
// checkpoint completely. The second checkpoint should then subsume the first checkpoint
// (the ack for the first checkpoint is deliberately interleaved with the acks for the
// second; the first checkpoint still lacks the ack from task 3 when the second completes)
checkpointCoordinator.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(graph.getJobID(), attemptID3, checkpointId2, new CheckpointMetrics(), taskOperatorSubtaskStates23), TASK_MANAGER_LOCATION_INFO);
checkpointCoordinator.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(graph.getJobID(), attemptID1, checkpointId2, new CheckpointMetrics(), taskOperatorSubtaskStates21), TASK_MANAGER_LOCATION_INFO);
checkpointCoordinator.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(graph.getJobID(), attemptID1, checkpointId1, new CheckpointMetrics(), taskOperatorSubtaskStates11), TASK_MANAGER_LOCATION_INFO);
checkpointCoordinator.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(graph.getJobID(), attemptID2, checkpointId2, new CheckpointMetrics(), taskOperatorSubtaskStates22), TASK_MANAGER_LOCATION_INFO);
// now, the second checkpoint should be confirmed, and the first discarded
// actually both pending checkpoints are discarded, and the second has been transformed
// into a successful checkpoint
assertTrue(pending1.isDisposed());
assertTrue(pending2.isDisposed());
assertEquals(0, checkpointCoordinator.getNumberOfPendingCheckpoints());
assertEquals(1, checkpointCoordinator.getNumberOfRetainedSuccessfulCheckpoints());
// validate that all received subtask states in the first checkpoint have been discarded
verify(subtaskState11, times(1)).discardState();
verify(subtaskState12, times(1)).discardState();
// validate that all subtask states in the second checkpoint are not discarded
verify(subtaskState21, never()).discardState();
verify(subtaskState22, never()).discardState();
verify(subtaskState23, never()).discardState();
// validate the committed checkpoints
List<CompletedCheckpoint> scs = checkpointCoordinator.getSuccessfulCheckpoints();
CompletedCheckpoint success = scs.get(0);
assertEquals(checkpointId2, success.getCheckpointID());
assertEquals(graph.getJobID(), success.getJobId());
assertEquals(3, success.getOperatorStates().size());
// a completion notification for the second checkpoint should have been sent to every task,
// including the third one
for (ExecutionVertex vertex : Arrays.asList(vertex1, vertex2, vertex3)) {
ExecutionAttemptID attemptId = vertex.getCurrentExecutionAttempt().getAttemptId();
assertEquals(checkpointId2, gateway.getOnlyNotifiedCompletedCheckpoint(attemptId).checkpointId);
}
// send the last remaining ack for the first checkpoint. This should not do anything
// apart from discarding the state that came with the late message
checkpointCoordinator.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(graph.getJobID(), attemptID3, checkpointId1, new CheckpointMetrics(), taskOperatorSubtaskStates13), TASK_MANAGER_LOCATION_INFO);
verify(subtaskState13, times(1)).discardState();
checkpointCoordinator.shutdown();
completedCheckpointStore.shutdown(JobStatus.FINISHED, new CheckpointsCleaner());
// validate that the states in the second checkpoint have been discarded
// (exactly once, after the coordinator and the store were shut down above)
verify(subtaskState21, times(1)).discardState();
verify(subtaskState22, times(1)).discardState();
verify(subtaskState23, times(1)).discardState();
}
use of org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint in project flink by apache.
the class CheckpointCoordinatorTest method testTriggerAndDeclineCheckpointThenFailureManagerThrowsException.
/**
 * Verifies that declining a pending checkpoint surfaces an exception carrying the failure
 * manager's error message.
 *
 * <p>One task acknowledges the checkpoint, the other declines it. The decline is expected to
 * throw; reaching the statement after it fails the test.
 */
@Test
public void testTriggerAndDeclineCheckpointThenFailureManagerThrowsException() throws Exception {
    final JobVertexID decliningVertexId = new JobVertexID();
    final JobVertexID acknowledgingVertexId = new JobVertexID();

    final ExecutionGraph graph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .addJobVertex(decliningVertexId)
                    .addJobVertex(acknowledgingVertexId)
                    .build();

    final ExecutionAttemptID decliningAttempt =
            graph.getJobVertex(decliningVertexId)
                    .getTaskVertices()[0]
                    .getCurrentExecutionAttempt()
                    .getAttemptId();
    final ExecutionAttemptID acknowledgingAttempt =
            graph.getJobVertex(acknowledgingVertexId)
                    .getTaskVertices()[0]
                    .getCurrentExecutionAttempt()
                    .getAttemptId();

    // NOTE(review): getCheckpointFailureManager presumably builds a manager that throws with
    // this message once a checkpoint failure is reported - confirm against the helper.
    final String errorMsg = "Exceeded checkpoint failure tolerance number!";
    final CheckpointFailureManager failureManager = getCheckpointFailureManager(errorMsg);

    final CheckpointCoordinator coordinator = getCheckpointCoordinator(graph, failureManager);

    try {
        // triggering itself is expected to work
        final CompletableFuture<CompletedCheckpoint> triggerResult =
                coordinator.triggerCheckpoint(false);
        manuallyTriggeredScheduledExecutor.triggerAll();
        FutureUtils.throwIfCompletedExceptionally(triggerResult);

        final long pendingId = coordinator.getPendingCheckpoints().keySet().iterator().next();
        final PendingCheckpoint pending = coordinator.getPendingCheckpoints().get(pendingId);

        // one task acknowledges: the checkpoint must remain pending and incomplete
        final AcknowledgeCheckpoint ack =
                new AcknowledgeCheckpoint(graph.getJobID(), acknowledgingAttempt, pendingId);
        coordinator.receiveAcknowledgeMessage(ack, TASK_MANAGER_LOCATION_INFO);
        assertFalse(pending.isDisposed());
        assertFalse(pending.areTasksFullyAcknowledged());

        // the other task declines: this call is expected to throw
        final DeclineCheckpoint decline =
                new DeclineCheckpoint(
                        graph.getJobID(),
                        decliningAttempt,
                        pendingId,
                        new CheckpointException(CHECKPOINT_DECLINED));
        coordinator.receiveDeclineMessage(decline, TASK_MANAGER_LOCATION_INFO);

        // fail(...) raises an AssertionError, which is an Error rather than an Exception and
        // therefore is not swallowed by the catch clause below
        fail("Test failed.");
    } catch (Exception expected) {
        ExceptionUtils.assertThrowableWithMessage(expected, errorMsg);
    } finally {
        coordinator.shutdown();
    }
}
use of org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint in project flink by apache.
the class CheckpointCoordinatorTriggeringTest method testTriggeringFullSnapshotAfterJobmasterFailover.
/**
 * Verifies that the first checkpoint triggered after a simulated JobManager failover is a
 * {@link CheckpointType#FULL_CHECKPOINT} when the job was originally restored from a savepoint
 * in {@link RestoreMode#NO_CLAIM} mode.
 *
 * <p>The failover is imitated by shutting down the coordinator that performed the savepoint
 * restore and creating a fresh coordinator on top of the same checkpoint store and ID counter.
 * Both coordinators are released in {@code finally} blocks so that a failing step does not
 * leave a coordinator with a started scheduler behind.
 */
@Test
public void testTriggeringFullSnapshotAfterJobmasterFailover() throws Exception {
    CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway gateway =
            new CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway();
    JobVertexID jobVertexID = new JobVertexID();
    ExecutionGraph graph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .addJobVertex(jobVertexID)
                    .setTaskManagerGateway(gateway)
                    .build();
    ExecutionVertex vertex = graph.getJobVertex(jobVertexID).getTaskVertices()[0];
    ExecutionAttemptID attemptID = vertex.getCurrentExecutionAttempt().getAttemptId();

    // create a savepoint that we can restore from later
    final CompletedCheckpoint savepoint = takeSavepoint(graph, attemptID);

    // the store and the ID counter are shared by both coordinators, i.e. they outlive the
    // simulated failover
    final StandaloneCompletedCheckpointStore checkpointStore =
            new StandaloneCompletedCheckpointStore(1);
    final StandaloneCheckpointIDCounter checkpointIDCounter =
            new StandaloneCheckpointIDCounter();

    // restore from the savepoint in NO_CLAIM mode
    final CheckpointCoordinator restoringCoordinator =
            createCheckpointCoordinator(graph, checkpointStore, checkpointIDCounter);
    try {
        restoringCoordinator.restoreSavepoint(
                SavepointRestoreSettings.forPath(
                        savepoint.getExternalPointer(), true, RestoreMode.NO_CLAIM),
                graph.getAllVertices(),
                this.getClass().getClassLoader());
    } finally {
        restoringCoordinator.shutdown();
    }

    // imitate job manager failover
    gateway.resetCount();
    final CheckpointCoordinator failedOverCoordinator =
            createCheckpointCoordinator(graph, checkpointStore, checkpointIDCounter);
    try {
        failedOverCoordinator.restoreLatestCheckpointedStateToAll(
                new HashSet<>(graph.getAllVertices().values()), true);
        failedOverCoordinator.startCheckpointScheduler();

        final CompletableFuture<CompletedCheckpoint> checkpoint =
                failedOverCoordinator.triggerCheckpoint(true);
        manuallyTriggeredScheduledExecutor.triggerAll();

        // NOTE(review): ID 2 presumably follows from the savepoint above having used ID 1 -
        // confirm against takeSavepoint / the restore logic. With a different ID the
        // acknowledgement would not complete the checkpoint and get() below would block.
        final long expectedCheckpointId = 2;
        failedOverCoordinator.receiveAcknowledgeMessage(
                new AcknowledgeCheckpoint(graph.getJobID(), attemptID, expectedCheckpointId),
                TASK_MANAGER_LOCATION_INFO);
        checkpoint.get();

        assertThat(
                gateway.getOnlyTriggeredCheckpoint(attemptID)
                        .checkpointOptions
                        .getCheckpointType(),
                is(CheckpointType.FULL_CHECKPOINT));
    } finally {
        // mirror the cleanup of the other scheduler-based tests: stop the scheduler that was
        // started above and release the coordinator even if an assertion failed
        failedOverCoordinator.stopCheckpointScheduler();
        failedOverCoordinator.shutdown();
    }
}
use of org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint in project flink by apache.
the class CheckpointCoordinatorTriggeringTest method testMinTimeBetweenCheckpointsInterval.
/**
 * Verifies that, after a checkpoint has completed, the configured minimum pause elapses before
 * the next checkpoint is triggered.
 *
 * <p>The periodic trigger is driven manually; the test keeps firing it (sleeping one checkpoint
 * interval between attempts) until the second checkpoint shows up and then compares the elapsed
 * wall-clock time with the minimum pause.
 */
@Test
public void testMinTimeBetweenCheckpointsInterval() throws Exception {
    final JobVertexID vertexId = new JobVertexID();
    final CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway gateway =
            new CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway();
    final ExecutionGraph graph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .addJobVertex(vertexId)
                    .setTaskManagerGateway(gateway)
                    .build();
    final ExecutionAttemptID attemptID =
            graph.getJobVertex(vertexId)
                    .getTaskVertices()[0]
                    .getCurrentExecutionAttempt()
                    .getAttemptId();

    final long minPauseMillis = 50;
    final long intervalMillis = 12;

    final CheckpointCoordinatorConfiguration coordinatorConfig =
            new CheckpointCoordinatorConfigurationBuilder()
                    // periodic interval is 12 ms
                    .setCheckpointInterval(intervalMillis)
                    // timeout is very long (200 s)
                    .setCheckpointTimeout(200_000)
                    // 50 ms delay between checkpoints
                    .setMinPauseBetweenCheckpoints(minPauseMillis)
                    .setMaxConcurrentCheckpoints(1)
                    .build();

    final CheckpointCoordinator coordinator =
            new CheckpointCoordinatorBuilder()
                    .setExecutionGraph(graph)
                    .setCheckpointCoordinatorConfiguration(coordinatorConfig)
                    .setCompletedCheckpointStore(new StandaloneCompletedCheckpointStore(2))
                    .setTimer(manuallyTriggeredScheduledExecutor)
                    .build();

    try {
        coordinator.startCheckpointScheduler();
        manuallyTriggeredScheduledExecutor.triggerPeriodicScheduledTasks();
        manuallyTriggeredScheduledExecutor.triggerAll();

        // the first periodic trigger must have produced checkpoint 1
        final long firstTriggeredId = gateway.getTriggeredCheckpoints(attemptID).get(0).checkpointId;
        assertEquals(1L, firstTriggeredId);

        final AcknowledgeCheckpoint firstAck =
                new AcknowledgeCheckpoint(graph.getJobID(), attemptID, 1L);

        // complete the checkpoint and remember when that happened
        final long ackNanos = System.nanoTime();
        coordinator.receiveAcknowledgeMessage(firstAck, TASK_MANAGER_LOCATION_INFO);

        gateway.resetCount();

        // keep firing the periodic trigger until the next checkpoint is recorded; the sleep
        // between attempts simulates periodic scheduling
        while (true) {
            manuallyTriggeredScheduledExecutor.triggerPeriodicScheduledTasks();
            manuallyTriggeredScheduledExecutor.triggerAll();
            if (!gateway.getTriggeredCheckpoints(attemptID).isEmpty()) {
                break;
            }
            Thread.sleep(intervalMillis);
        }

        final long secondTriggeredId = gateway.getTriggeredCheckpoints(attemptID).get(0).checkpointId;
        final long secondTriggerNanos = System.nanoTime();
        assertEquals(2L, secondTriggeredId);

        final long elapsedMillis = (secondTriggerNanos - ackNanos) / 1_000_000;

        // one millisecond of slack accounts for the truncating nanos-to-millis division
        final boolean cameTooEarly = elapsedMillis + 1 < minPauseMillis;
        if (cameTooEarly) {
            fail("checkpoint came too early: delay was " + elapsedMillis + " but should have been at least " + minPauseMillis);
        }
    } finally {
        coordinator.stopCheckpointScheduler();
        coordinator.shutdown();
    }
}
use of org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint in project flink by apache.
the class CheckpointCoordinatorMasterHooksTest method testHooksAreCalledOnTrigger.
// ------------------------------------------------------------------------
// trigger / restore behavior
// ------------------------------------------------------------------------
/**
 * Verifies that every registered master hook is invoked when a checkpoint is triggered and that
 * the state returned by the stateful hooks ends up, serialized, in the completed checkpoint.
 *
 * <p>Three hooks are registered: two that return state (a String and a Long) and one without
 * stubbed state. After the single task acknowledges, the completed checkpoint is expected to
 * hold exactly two master states, matching the two stateful hooks by name, bytes and
 * serializer version.
 */
@Test
public void testHooksAreCalledOnTrigger() throws Exception {
    // identifiers and expected (serialized) state of the two stateful hooks
    final String id1 = "id1";
    final String id2 = "id2";

    final String stringState = "the-test-string-state";
    final byte[] stringStateBytes = new StringSerializer().serialize(stringState);

    final long longState = 987654321L;
    final byte[] longStateBytes = new LongSerializer().serialize(longState);

    final MasterTriggerRestoreHook<String> stringHook =
            mockGeneric(MasterTriggerRestoreHook.class);
    when(stringHook.getIdentifier()).thenReturn(id1);
    when(stringHook.createCheckpointDataSerializer()).thenReturn(new StringSerializer());
    when(stringHook.triggerCheckpoint(anyLong(), anyLong(), any(Executor.class)))
            .thenReturn(CompletableFuture.completedFuture(stringState));

    final MasterTriggerRestoreHook<Long> longHook = mockGeneric(MasterTriggerRestoreHook.class);
    when(longHook.getIdentifier()).thenReturn(id2);
    when(longHook.createCheckpointDataSerializer()).thenReturn(new LongSerializer());
    when(longHook.triggerCheckpoint(anyLong(), anyLong(), any(Executor.class)))
            .thenReturn(CompletableFuture.completedFuture(longState));

    // only the identifier is stubbed for this hook
    final MasterTriggerRestoreHook<Void> statelessHook =
            mockGeneric(MasterTriggerRestoreHook.class);
    when(statelessHook.getIdentifier()).thenReturn("some-id");

    // build a single-vertex graph and the coordinator under test
    final JobVertexID vertexId = new JobVertexID();
    final ExecutionGraph graph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .addJobVertex(vertexId)
                    .build();
    final ManuallyTriggeredScheduledExecutor timer = new ManuallyTriggeredScheduledExecutor();
    final CheckpointCoordinator coordinator = instantiateCheckpointCoordinator(graph, timer);

    coordinator.addMasterHook(stringHook);
    coordinator.addMasterHook(statelessHook);
    coordinator.addMasterHook(longHook);

    // trigger a checkpoint and let the manually driven executor run the queued work
    final CompletableFuture<CompletedCheckpoint> triggerResult =
            coordinator.triggerCheckpoint(false);
    timer.triggerAll();
    assertFalse(triggerResult.isCompletedExceptionally());
    assertEquals(1, coordinator.getNumberOfPendingCheckpoints());

    // each hook must have been asked exactly once
    verify(stringHook, times(1)).triggerCheckpoint(anyLong(), anyLong(), any(Executor.class));
    verify(longHook, times(1)).triggerCheckpoint(anyLong(), anyLong(), any(Executor.class));
    verify(statelessHook, times(1)).triggerCheckpoint(anyLong(), anyLong(), any(Executor.class));

    // acknowledge from the only task, which completes the checkpoint
    final ExecutionAttemptID attemptID =
            graph.getJobVertex(vertexId)
                    .getTaskVertices()[0]
                    .getCurrentExecutionAttempt()
                    .getAttemptId();
    final long pendingId =
            coordinator.getPendingCheckpoints().values().iterator().next().getCheckpointId();
    final AcknowledgeCheckpoint ack =
            new AcknowledgeCheckpoint(graph.getJobID(), attemptID, pendingId);
    coordinator.receiveAcknowledgeMessage(ack, "Unknown location");

    assertEquals(0, coordinator.getNumberOfPendingCheckpoints());
    assertEquals(1, coordinator.getNumberOfRetainedSuccessfulCheckpoints());

    // only the two stateful hooks are expected to have contributed master state
    final CompletedCheckpoint completed = coordinator.getCheckpointStore().getLatestCheckpoint();
    final Collection<MasterState> hookStates = completed.getMasterHookStates();
    assertEquals(2, hookStates.size());

    for (MasterState hookState : hookStates) {
        final String stateName = hookState.name();
        if (stateName.equals(id1)) {
            assertArrayEquals(stringStateBytes, hookState.bytes());
            assertEquals(StringSerializer.VERSION, hookState.version());
        } else if (stateName.equals(id2)) {
            assertArrayEquals(longStateBytes, hookState.bytes());
            assertEquals(LongSerializer.VERSION, hookState.version());
        } else {
            fail("unrecognized state name: " + hookState.name());
        }
    }
}
Aggregations