Use of org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint in the Apache Flink project.
Class ActorGatewayCheckpointResponder, method acknowledgeCheckpoint.
/**
 * Packs the given checkpoint acknowledgement data into an {@link AcknowledgeCheckpoint}
 * message and hands it to the actor gateway.
 *
 * @param jobID id of the job the checkpoint belongs to
 * @param executionAttemptID id of the execution attempt that acknowledges the checkpoint
 * @param checkpointId id of the acknowledged checkpoint
 * @param checkpointMetrics metrics forwarded with the acknowledgement
 * @param checkpointStateHandles state handles forwarded with the acknowledgement
 */
@Override
public void acknowledgeCheckpoint(
        JobID jobID,
        ExecutionAttemptID executionAttemptID,
        long checkpointId,
        CheckpointMetrics checkpointMetrics,
        SubtaskState checkpointStateHandles) {
    // The message is sent via tell(); no reply is read here.
    actorGateway.tell(
            new AcknowledgeCheckpoint(
                    jobID,
                    executionAttemptID,
                    checkpointId,
                    checkpointMetrics,
                    checkpointStateHandles));
}
Use of org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint in the Apache Flink project.
Class CheckpointStateRestoreTest, method testSetState.
/**
 * Tests that on restore the task state is reset for each stateful task.
 *
 * <p>The test completes one checkpoint in which only the three subtasks of the stateful vertex
 * report state, restores from it, and then checks that exactly those subtasks received a
 * {@code TaskStateSnapshot} while the subtasks of the stateless vertex received none.
 *
 * <p>Unexpected exceptions are propagated to the test runner instead of being caught and
 * converted via {@code fail(e.getMessage())}, so the original exception type and stack trace
 * show up in the test report.
 *
 * @throws Exception if building the graph, triggering, acknowledging or restoring fails
 */
@Test
public void testSetState() throws Exception {
    KeyGroupRange keyGroupRange = KeyGroupRange.of(0, 0);
    List<SerializableObject> testStates = Collections.singletonList(new SerializableObject());
    final KeyedStateHandle serializedKeyGroupStates =
            CheckpointCoordinatorTestingUtils.generateKeyGroupState(keyGroupRange, testStates);

    final JobVertexID statefulId = new JobVertexID();
    final JobVertexID statelessId = new JobVertexID();

    ExecutionGraph graph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .addJobVertex(statefulId, 3, 256)
                    .addJobVertex(statelessId, 2, 256)
                    .build();

    ExecutionJobVertex stateful = graph.getJobVertex(statefulId);
    ExecutionJobVertex stateless = graph.getJobVertex(statelessId);

    ExecutionVertex stateful1 = stateful.getTaskVertices()[0];
    ExecutionVertex stateful2 = stateful.getTaskVertices()[1];
    ExecutionVertex stateful3 = stateful.getTaskVertices()[2];
    ExecutionVertex stateless1 = stateless.getTaskVertices()[0];
    ExecutionVertex stateless2 = stateless.getTaskVertices()[1];

    Execution statefulExec1 = stateful1.getCurrentExecutionAttempt();
    Execution statefulExec2 = stateful2.getCurrentExecutionAttempt();
    Execution statefulExec3 = stateful3.getCurrentExecutionAttempt();
    Execution statelessExec1 = stateless1.getCurrentExecutionAttempt();
    Execution statelessExec2 = stateless2.getCurrentExecutionAttempt();

    ManuallyTriggeredScheduledExecutor manuallyTriggeredScheduledExecutor =
            new ManuallyTriggeredScheduledExecutor();

    CheckpointCoordinator coord =
            new CheckpointCoordinatorBuilder()
                    .setExecutionGraph(graph)
                    .setTimer(manuallyTriggeredScheduledExecutor)
                    .build();

    // create ourselves a checkpoint with state
    coord.triggerCheckpoint(false);
    manuallyTriggeredScheduledExecutor.triggerAll();

    PendingCheckpoint pending = coord.getPendingCheckpoints().values().iterator().next();
    final long checkpointId = pending.getCheckpointId();

    final TaskStateSnapshot subtaskStates = new TaskStateSnapshot();
    subtaskStates.putSubtaskStateByOperatorID(
            OperatorID.fromJobVertexID(statefulId),
            OperatorSubtaskState.builder()
                    .setManagedKeyedState(serializedKeyGroupStates)
                    .build());

    // the stateful subtasks acknowledge with state attached
    coord.receiveAcknowledgeMessage(
            new AcknowledgeCheckpoint(
                    graph.getJobID(),
                    statefulExec1.getAttemptId(),
                    checkpointId,
                    new CheckpointMetrics(),
                    subtaskStates),
            TASK_MANAGER_LOCATION_INFO);
    coord.receiveAcknowledgeMessage(
            new AcknowledgeCheckpoint(
                    graph.getJobID(),
                    statefulExec2.getAttemptId(),
                    checkpointId,
                    new CheckpointMetrics(),
                    subtaskStates),
            TASK_MANAGER_LOCATION_INFO);
    coord.receiveAcknowledgeMessage(
            new AcknowledgeCheckpoint(
                    graph.getJobID(),
                    statefulExec3.getAttemptId(),
                    checkpointId,
                    new CheckpointMetrics(),
                    subtaskStates),
            TASK_MANAGER_LOCATION_INFO);

    // the stateless subtasks acknowledge without any state
    coord.receiveAcknowledgeMessage(
            new AcknowledgeCheckpoint(
                    graph.getJobID(), statelessExec1.getAttemptId(), checkpointId),
            TASK_MANAGER_LOCATION_INFO);
    coord.receiveAcknowledgeMessage(
            new AcknowledgeCheckpoint(
                    graph.getJobID(), statelessExec2.getAttemptId(), checkpointId),
            TASK_MANAGER_LOCATION_INFO);

    assertEquals(1, coord.getNumberOfRetainedSuccessfulCheckpoints());
    assertEquals(0, coord.getNumberOfPendingCheckpoints());

    // let the coordinator inject the state
    assertTrue(
            coord.restoreLatestCheckpointedStateToAll(
                    new HashSet<>(Arrays.asList(stateful, stateless)), false));

    // verify that each stateful vertex got the state
    assertEquals(subtaskStates, statefulExec1.getTaskRestore().getTaskStateSnapshot());
    assertEquals(subtaskStates, statefulExec2.getTaskRestore().getTaskStateSnapshot());
    assertEquals(subtaskStates, statefulExec3.getTaskRestore().getTaskStateSnapshot());

    // the stateless subtasks must not have been handed anything to restore
    assertNull(statelessExec1.getTaskRestore());
    assertNull(statelessExec2.getTaskRestore());
}
Use of org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint in the Apache Flink project.
Class CheckpointCoordinatorTest, method testTriggerAndConfirmSimpleCheckpoint.
/**
 * Walks a checkpoint through its lifecycle on a graph with two job vertices: trigger, one
 * acknowledgement, a duplicate of that acknowledgement, the final acknowledgement, and the
 * conversion of the pending checkpoint into a completed one. Afterwards a second checkpoint is
 * triggered and acknowledged without state, and the test checks that it is then the only
 * retained completed checkpoint.
 *
 * @throws Exception if any step of graph construction, triggering or acknowledging fails
 */
@Test
public void testTriggerAndConfirmSimpleCheckpoint() throws Exception {
JobVertexID jobVertexID1 = new JobVertexID();
JobVertexID jobVertexID2 = new JobVertexID();
// the recording gateway is queried below for the trigger / completion messages sent to tasks
CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway gateway = new CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway();
ExecutionGraph graph = new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder().addJobVertex(jobVertexID1).addJobVertex(jobVertexID2).setTaskManagerGateway(gateway).build();
ExecutionVertex vertex1 = graph.getJobVertex(jobVertexID1).getTaskVertices()[0];
ExecutionVertex vertex2 = graph.getJobVertex(jobVertexID2).getTaskVertices()[0];
ExecutionAttemptID attemptID1 = vertex1.getCurrentExecutionAttempt().getAttemptId();
ExecutionAttemptID attemptID2 = vertex2.getCurrentExecutionAttempt().getAttemptId();
CheckpointCoordinator checkpointCoordinator = getCheckpointCoordinator(graph);
// nothing is pending, retained or scheduled before the first trigger
assertEquals(0, checkpointCoordinator.getNumberOfPendingCheckpoints());
assertEquals(0, checkpointCoordinator.getNumberOfRetainedSuccessfulCheckpoints());
assertEquals(0, manuallyTriggeredScheduledExecutor.getActiveScheduledTasks().size());
// trigger the first checkpoint. this should succeed
final CompletableFuture<CompletedCheckpoint> checkpointFuture = checkpointCoordinator.triggerCheckpoint(false);
manuallyTriggeredScheduledExecutor.triggerAll();
FutureUtils.throwIfCompletedExceptionally(checkpointFuture);
// validate that we have a pending checkpoint
assertEquals(1, checkpointCoordinator.getNumberOfPendingCheckpoints());
assertEquals(0, checkpointCoordinator.getNumberOfRetainedSuccessfulCheckpoints());
// one scheduled task is now active (see the "canceler" check further down)
assertEquals(1, manuallyTriggeredScheduledExecutor.getActiveScheduledTasks().size());
long checkpointId = checkpointCoordinator.getPendingCheckpoints().entrySet().iterator().next().getKey();
PendingCheckpoint checkpoint = checkpointCoordinator.getPendingCheckpoints().get(checkpointId);
assertNotNull(checkpoint);
assertEquals(checkpointId, checkpoint.getCheckpointId());
assertEquals(graph.getJobID(), checkpoint.getJobId());
assertEquals(2, checkpoint.getNumberOfNonAcknowledgedTasks());
assertEquals(0, checkpoint.getNumberOfAcknowledgedTasks());
assertEquals(0, checkpoint.getOperatorStates().size());
assertFalse(checkpoint.isDisposed());
assertFalse(checkpoint.areTasksFullyAcknowledged());
// check that the vertices received the trigger checkpoint message
for (ExecutionVertex vertex : Arrays.asList(vertex1, vertex2)) {
ExecutionAttemptID attemptId = vertex.getCurrentExecutionAttempt().getAttemptId();
assertEquals(checkpointId, gateway.getOnlyTriggeredCheckpoint(attemptId).checkpointId);
}
OperatorID opID1 = vertex1.getJobVertex().getOperatorIDs().get(0).getGeneratedOperatorID();
OperatorID opID2 = vertex2.getJobVertex().getOperatorIDs().get(0).getGeneratedOperatorID();
// mocked subtask states so that registerSharedStates invocations can be counted
OperatorSubtaskState subtaskState1 = mock(OperatorSubtaskState.class);
OperatorSubtaskState subtaskState2 = mock(OperatorSubtaskState.class);
TaskStateSnapshot taskOperatorSubtaskStates1 = new TaskStateSnapshot(singletonMap(opID1, subtaskState1));
TaskStateSnapshot taskOperatorSubtaskStates2 = new TaskStateSnapshot(singletonMap(opID2, subtaskState2));
// acknowledge from one of the tasks
AcknowledgeCheckpoint acknowledgeCheckpoint1 = new AcknowledgeCheckpoint(graph.getJobID(), attemptID2, checkpointId, new CheckpointMetrics(), taskOperatorSubtaskStates2);
checkpointCoordinator.receiveAcknowledgeMessage(acknowledgeCheckpoint1, TASK_MANAGER_LOCATION_INFO);
assertEquals(1, checkpoint.getNumberOfAcknowledgedTasks());
assertEquals(1, checkpoint.getNumberOfNonAcknowledgedTasks());
assertFalse(checkpoint.isDisposed());
assertFalse(checkpoint.areTasksFullyAcknowledged());
verify(subtaskState2, times(1)).registerSharedStates(any(SharedStateRegistry.class), eq(checkpointId));
// acknowledge the same task again (should not matter)
checkpointCoordinator.receiveAcknowledgeMessage(acknowledgeCheckpoint1, TASK_MANAGER_LOCATION_INFO);
assertFalse(checkpoint.isDisposed());
assertFalse(checkpoint.areTasksFullyAcknowledged());
// the duplicate acknowledgement is expected to cause a second registerSharedStates call
verify(subtaskState2, times(2)).registerSharedStates(any(SharedStateRegistry.class), eq(checkpointId));
// acknowledge the other task.
checkpointCoordinator.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(graph.getJobID(), attemptID1, checkpointId, new CheckpointMetrics(), taskOperatorSubtaskStates1), TASK_MANAGER_LOCATION_INFO);
// the checkpoint is internally converted to a successful checkpoint and the
// pending checkpoint object is disposed
assertTrue(checkpoint.isDisposed());
// now we should have a completed checkpoint
assertEquals(1, checkpointCoordinator.getNumberOfRetainedSuccessfulCheckpoints());
assertEquals(0, checkpointCoordinator.getNumberOfPendingCheckpoints());
// the canceler should be removed now
assertEquals(0, manuallyTriggeredScheduledExecutor.getActiveScheduledTasks().size());
// validate that the subtasks states have registered their shared states.
{
verify(subtaskState1, times(1)).registerSharedStates(any(SharedStateRegistry.class), eq(checkpointId));
verify(subtaskState2, times(2)).registerSharedStates(any(SharedStateRegistry.class), eq(checkpointId));
}
// validate that the relevant tasks got a confirmation message
for (ExecutionVertex vertex : Arrays.asList(vertex1, vertex2)) {
ExecutionAttemptID attemptId = vertex.getCurrentExecutionAttempt().getAttemptId();
assertEquals(checkpointId, gateway.getOnlyNotifiedCompletedCheckpoint(attemptId).checkpointId);
}
CompletedCheckpoint success = checkpointCoordinator.getSuccessfulCheckpoints().get(0);
assertEquals(graph.getJobID(), success.getJobId());
assertEquals(checkpoint.getCheckpointId(), success.getCheckpointID());
assertEquals(2, success.getOperatorStates().size());
// ---------------
// trigger another checkpoint and see that this one replaces the other checkpoint
// ---------------
// reset the recorder so the getOnly* lookups below only see the second checkpoint's messages
// NOTE(review): presumably resetCount() clears the recorded messages - confirm against the gateway
gateway.resetCount();
checkpointCoordinator.triggerCheckpoint(false);
manuallyTriggeredScheduledExecutor.triggerAll();
long checkpointIdNew = checkpointCoordinator.getPendingCheckpoints().entrySet().iterator().next().getKey();
// both tasks acknowledge the second checkpoint without attaching any state
checkpointCoordinator.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(graph.getJobID(), attemptID1, checkpointIdNew), TASK_MANAGER_LOCATION_INFO);
checkpointCoordinator.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(graph.getJobID(), attemptID2, checkpointIdNew), TASK_MANAGER_LOCATION_INFO);
// still exactly one retained checkpoint: the new one has taken the place of the first
assertEquals(0, checkpointCoordinator.getNumberOfPendingCheckpoints());
assertEquals(1, checkpointCoordinator.getNumberOfRetainedSuccessfulCheckpoints());
assertEquals(0, manuallyTriggeredScheduledExecutor.getActiveScheduledTasks().size());
CompletedCheckpoint successNew = checkpointCoordinator.getSuccessfulCheckpoints().get(0);
assertEquals(graph.getJobID(), successNew.getJobId());
assertEquals(checkpointIdNew, successNew.getCheckpointID());
assertEquals(2, successNew.getOperatorStates().size());
// every operator state of the second checkpoint must satisfy the hasNoSubState predicate
assertTrue(successNew.getOperatorStates().values().stream().allMatch(this::hasNoSubState));
// validate that the relevant tasks got a confirmation message
for (ExecutionVertex vertex : Arrays.asList(vertex1, vertex2)) {
ExecutionAttemptID attemptId = vertex.getCurrentExecutionAttempt().getAttemptId();
assertEquals(checkpointIdNew, gateway.getOnlyTriggeredCheckpoint(attemptId).checkpointId);
assertEquals(checkpointIdNew, gateway.getOnlyNotifiedCompletedCheckpoint(attemptId).checkpointId);
}
checkpointCoordinator.shutdown();
}
Use of org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint in the Apache Flink project.
Class CheckpointCoordinatorTest, method testReportLatestCompletedCheckpointIdWithAbort.
/**
 * Completes one checkpoint, lets a second one be declined, and checks that the abort
 * notification received by the task manager gateway carries the id of the first (completed)
 * checkpoint as its "latest completed checkpoint id".
 *
 * @throws Exception if building the graph or driving the coordinator fails
 */
@Test
public void testReportLatestCompletedCheckpointIdWithAbort() throws Exception {
    final JobVertexID vertexId = new JobVertexID();
    final ExecutionGraph graph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .addJobVertex(vertexId)
                    .setTransitToRunning(false)
                    .build();
    final ExecutionVertex task = graph.getJobVertex(vertexId).getTaskVertices()[0];

    // Gateway that remembers the latest-completed id passed along with an abort notification.
    final AtomicLong lastReportedCompletedId = new AtomicLong(-1);
    final SimpleAckingTaskManagerGateway recordingGateway =
            new SimpleAckingTaskManagerGateway() {
                @Override
                public void notifyCheckpointAborted(
                        ExecutionAttemptID executionAttemptID,
                        JobID jobId,
                        long checkpointId,
                        long latestCompletedCheckpointId,
                        long timestamp) {
                    lastReportedCompletedId.set(latestCompletedCheckpointId);
                }
            };
    final LogicalSlot slot =
            new TestingLogicalSlotBuilder()
                    .setTaskManagerGateway(recordingGateway)
                    .createTestingLogicalSlot();

    // The graph was built without transitioning to running, so assign the slot and switch
    // the single task to RUNNING by hand.
    ExecutionGraphTestUtils.setVertexResource(task, slot);
    task.getCurrentExecutionAttempt().transitionState(ExecutionState.RUNNING);

    final JobID jobId = graph.getJobID();
    final ExecutionAttemptID attemptId = task.getCurrentExecutionAttempt().getAttemptId();

    final CheckpointCoordinator coordinator =
            new CheckpointCoordinatorBuilder()
                    .setExecutionGraph(graph)
                    .setTimer(manuallyTriggeredScheduledExecutor)
                    .setAllowCheckpointsAfterTasksFinished(true)
                    .build();

    // First checkpoint: trigger it and acknowledge it so that it completes.
    final CompletableFuture<CompletedCheckpoint> completedFuture =
            coordinator.triggerCheckpoint(false);
    manuallyTriggeredScheduledExecutor.triggerAll();
    final long completedCheckpointId =
            coordinator.getPendingCheckpoints().entrySet().iterator().next().getKey();
    coordinator.receiveAcknowledgeMessage(
            new AcknowledgeCheckpoint(
                    jobId,
                    attemptId,
                    completedCheckpointId,
                    new CheckpointMetrics(),
                    new TaskStateSnapshot()),
            "localhost");
    assertTrue(completedFuture.isDone());
    assertFalse(completedFuture.isCompletedExceptionally());

    // Second checkpoint: trigger it and have the task decline it.
    final CompletableFuture<CompletedCheckpoint> abortedFuture =
            coordinator.triggerCheckpoint(false);
    manuallyTriggeredScheduledExecutor.triggerAll();
    final long abortedCheckpointId =
            coordinator.getPendingCheckpoints().entrySet().iterator().next().getKey();
    coordinator.receiveDeclineMessage(
            new DeclineCheckpoint(
                    jobId,
                    attemptId,
                    abortedCheckpointId,
                    new CheckpointException(CHECKPOINT_EXPIRED)),
            "localhost");
    assertTrue(abortedFuture.isCompletedExceptionally());

    // The abort notification must have reported the first checkpoint as latest completed.
    assertEquals(completedCheckpointId, lastReportedCompletedId.get());
}
Use of org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint in the Apache Flink project.
Class CheckpointCoordinatorTest, method testConcurrentSavepoints.
/**
 * Tests that the savepoints can be triggered concurrently.
 *
 * <p>Several savepoints are requested while the coordinator is configured with a maximum of
 * one concurrent checkpoint; every one of them must still complete once acknowledged.
 *
 * @throws Exception if setup fails or a savepoint future completes exceptionally
 */
@Test
public void testConcurrentSavepoints() throws Exception {
    final int savepointCount = 5;

    final JobVertexID vertexId = new JobVertexID();
    final ExecutionGraph graph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .addJobVertex(vertexId)
                    .build();
    final ExecutionAttemptID attemptId =
            graph.getJobVertex(vertexId)
                    .getTaskVertices()[0]
                    .getCurrentExecutionAttempt()
                    .getAttemptId();

    final StandaloneCheckpointIDCounter idCounter = new StandaloneCheckpointIDCounter();

    // max one checkpoint at a time => should not affect savepoints
    final CheckpointCoordinatorConfiguration coordinatorConfig =
            new CheckpointCoordinatorConfiguration.CheckpointCoordinatorConfigurationBuilder()
                    .setMaxConcurrentCheckpoints(1)
                    .build();

    final CheckpointCoordinator coordinator =
            new CheckpointCoordinatorBuilder()
                    .setExecutionGraph(graph)
                    .setCheckpointCoordinatorConfiguration(coordinatorConfig)
                    .setCheckpointIDCounter(idCounter)
                    .setCompletedCheckpointStore(new StandaloneCompletedCheckpointStore(2))
                    .setTimer(manuallyTriggeredScheduledExecutor)
                    .build();

    final List<CompletableFuture<CompletedCheckpoint>> pendingSavepoints = new ArrayList<>();
    final String targetDirectory = tmpFolder.newFolder().getAbsolutePath();

    // Trigger savepoints
    for (int i = 0; i < savepointCount; i++) {
        pendingSavepoints.add(
                coordinator.triggerSavepoint(targetDirectory, SavepointFormatType.CANONICAL));
    }

    // After triggering multiple savepoints, all of them should still be in progress
    pendingSavepoints.forEach(future -> assertFalse(future.isDone()));

    manuallyTriggeredScheduledExecutor.triggerAll();

    // ACK all savepoints, walking the ids downwards from the counter's last value
    final long newestId = idCounter.getLast();
    for (int offset = 0; offset < savepointCount; offset++) {
        coordinator.receiveAcknowledgeMessage(
                new AcknowledgeCheckpoint(graph.getJobID(), attemptId, newestId - offset),
                TASK_MANAGER_LOCATION_INFO);
    }

    // After ACKs, all should be completed
    for (CompletableFuture<CompletedCheckpoint> future : pendingSavepoints) {
        assertNotNull(future.get());
    }
}
Aggregations