use of org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint in project flink by apache.
the class CheckpointCoordinatorRestoringTest method acknowledgeCheckpoint.
/**
 * Sends one acknowledge message for {@code checkpointId} to the coordinator per key-group
 * partition of {@code jobVertex}.
 *
 * <p>The partitions are derived from the vertex's max parallelism and parallelism. The n-th
 * partition is acknowledged with the execution attempt of the n-th task vertex and with the
 * state snapshot produced by {@code mockSubtaskState} for that partition.
 *
 * @param coordinator coordinator that receives the acknowledge messages
 * @param executionGraph graph whose job ID is put into every message
 * @param jobVertex vertex whose task vertices acknowledge the checkpoint
 * @param checkpointId ID of the checkpoint being acknowledged
 * @throws Exception if any of the invoked helpers or the coordinator throws
 */
private static void acknowledgeCheckpoint(CheckpointCoordinator coordinator, ExecutionGraph executionGraph, ExecutionJobVertex jobVertex, long checkpointId) throws Exception {
    final List<KeyGroupRange> keyGroupRanges =
            StateAssignmentOperation.createKeyGroupPartitions(
                    jobVertex.getMaxParallelism(), jobVertex.getParallelism());

    // The position of a range in the list doubles as the index of the task vertex.
    int subtaskIndex = 0;
    for (KeyGroupRange keyGroupRange : keyGroupRanges) {
        final TaskStateSnapshot snapshot =
                mockSubtaskState(jobVertex.getJobVertexId(), subtaskIndex, keyGroupRange);
        coordinator.receiveAcknowledgeMessage(
                new AcknowledgeCheckpoint(
                        executionGraph.getJobID(),
                        jobVertex.getTaskVertices()[subtaskIndex]
                                .getCurrentExecutionAttempt()
                                .getAttemptId(),
                        checkpointId,
                        new CheckpointMetrics(),
                        snapshot),
                TASK_MANAGER_LOCATION_INFO);
        subtaskIndex++;
    }
}
use of org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint in project flink by apache.
the class CheckpointCoordinatorRestoringTest method testJobGraphModificationsAreCheckedForSavepoint.
/**
 * Checks that restoring a savepoint invokes the finished-state validation of the
 * {@link VertexFinishedStateChecker}.
 *
 * <p>A savepoint is triggered and acknowledged for a job with a single vertex, then restored
 * through a second coordinator whose checker records that {@code
 * validateOperatorsFinishedState()} was called.
 */
@Test
public void testJobGraphModificationsAreCheckedForSavepoint() throws Exception {
    final JobVertexID vertexId = new JobVertexID();
    ExecutionGraph graph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .addJobVertex(vertexId, 1, 1)
                    .build();
    CheckpointCoordinator savepointCoordinator =
            new CheckpointCoordinatorBuilder()
                    .setExecutionGraph(graph)
                    .setTimer(manuallyTriggeredScheduledExecutor)
                    .build();

    // Take a savepoint into a fresh temporary directory.
    File savepointDir = tmpFolder.newFolder();
    String targetLocation = "file://" + savepointDir.getAbsolutePath();
    CompletableFuture<CompletedCheckpoint> pendingSavepoint =
            savepointCoordinator.triggerSavepoint(targetLocation, SavepointFormatType.CANONICAL);
    manuallyTriggeredScheduledExecutor.triggerAll();

    // Acknowledge the savepoint for the only task vertex of the job.
    long savepointId =
            savepointCoordinator.getPendingCheckpoints().keySet().stream().findFirst().get();
    AcknowledgeCheckpoint ack =
            new AcknowledgeCheckpoint(
                    graph.getJobID(),
                    graph.getJobVertex(vertexId)
                            .getTaskVertices()[0]
                            .getCurrentExecutionAttempt()
                            .getAttemptId(),
                    savepointId);
    savepointCoordinator.receiveAcknowledgeMessage(ack, "localhost");
    assertTrue(pendingSavepoint.isDone());

    // Restore through a coordinator whose checker only records that it was asked to validate.
    BooleanValue finishedStateChecked = new BooleanValue(false);
    CheckpointCoordinator restoreCoordinator =
            new CheckpointCoordinatorBuilder()
                    .setExecutionGraph(graph)
                    .setVertexFinishedStateCheckerFactory(
                            (vertices, states) ->
                                    new VertexFinishedStateChecker(vertices, states) {
                                        @Override
                                        public void validateOperatorsFinishedState() {
                                            finishedStateChecked.set(true);
                                        }
                                    })
                    .build();

    String savepointPointer = pendingSavepoint.get().getExternalPointer();
    restoreCoordinator.restoreSavepoint(
            SavepointRestoreSettings.forPath(savepointPointer),
            graph.getAllVertices(),
            getClass().getClassLoader());
    assertTrue(
            "The finished states should be checked when job is restored on startup",
            finishedStateChecked.get());
}
use of org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint in project flink by apache.
the class CheckpointCoordinatorRestoringTest method testRestoreLatestCheckpointedStateWithoutInFlightData.
/**
 * Verifies that restoring the latest checkpoint leaves out the in-flight data (input channel
 * and result subpartition state) when the coordinator is configured with that checkpoint's ID
 * as the "checkpoint ID of ignored in-flight data", while raw/managed operator state and
 * raw/managed keyed state are still handed to every subtask.
 *
 * @throws Exception if building the graph, completing the checkpoint, or restoring fails
 */
@Test
public void testRestoreLatestCheckpointedStateWithoutInFlightData() throws Exception {
    // given: Operator with not empty states.
    final JobVertexID jobVertexID = new JobVertexID();
    int parallelism1 = 3;
    int maxParallelism1 = 42;

    CompletedCheckpointStore completedCheckpointStore = new EmbeddedCompletedCheckpointStore();
    final ExecutionGraph graph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .addJobVertex(jobVertexID, parallelism1, maxParallelism1)
                    .build();
    final ExecutionJobVertex jobVertex = graph.getJobVertex(jobVertexID);

    // set up the coordinator; in-flight data of checkpoint 1 is configured to be ignored
    CheckpointCoordinator coord =
            new CheckpointCoordinatorBuilder()
                    .setExecutionGraph(graph)
                    .setCompletedCheckpointStore(completedCheckpointStore)
                    .setCheckpointCoordinatorConfiguration(
                            new CheckpointCoordinatorConfigurationBuilder()
                                    .setCheckpointIdOfIgnoredInFlightData(1)
                                    .build())
                    .setTimer(manuallyTriggeredScheduledExecutor)
                    .build();

    // trigger the checkpoint
    coord.triggerCheckpoint(false);
    manuallyTriggeredScheduledExecutor.triggerAll();

    assertEquals(1, coord.getPendingCheckpoints().size());
    long checkpointId = Iterables.getOnlyElement(coord.getPendingCheckpoints().keySet());

    List<KeyGroupRange> keyGroupPartitions1 =
            StateAssignmentOperation.createKeyGroupPartitions(maxParallelism1, parallelism1);

    Random random = new Random();
    // fill the states and complete the checkpoint.
    for (int index = 0; index < jobVertex.getParallelism(); index++) {
        OperatorSubtaskState operatorSubtaskState =
                OperatorSubtaskState.builder()
                        .setManagedOperatorState(
                                generatePartitionableStateHandle(jobVertexID, index, 2, 8, false))
                        .setRawOperatorState(
                                generatePartitionableStateHandle(jobVertexID, index, 2, 8, true))
                        .setManagedKeyedState(
                                generateKeyGroupState(
                                        jobVertexID, keyGroupPartitions1.get(index), false))
                        .setRawKeyedState(
                                generateKeyGroupState(
                                        jobVertexID, keyGroupPartitions1.get(index), true))
                        .setInputChannelState(
                                StateObjectCollection.singleton(
                                        createNewInputChannelStateHandle(3, random)))
                        .setResultSubpartitionState(
                                StateObjectCollection.singleton(
                                        createNewResultSubpartitionStateHandle(3, random)))
                        .build();
        TaskStateSnapshot taskOperatorSubtaskStates = new TaskStateSnapshot();
        taskOperatorSubtaskStates.putSubtaskStateByOperatorID(
                OperatorID.fromJobVertexID(jobVertexID), operatorSubtaskState);

        AcknowledgeCheckpoint acknowledgeCheckpoint =
                new AcknowledgeCheckpoint(
                        graph.getJobID(),
                        jobVertex.getTaskVertices()[index]
                                .getCurrentExecutionAttempt()
                                .getAttemptId(),
                        checkpointId,
                        new CheckpointMetrics(),
                        taskOperatorSubtaskStates);
        coord.receiveAcknowledgeMessage(acknowledgeCheckpoint, TASK_MANAGER_LOCATION_INFO);
    }

    assertEquals(1, coord.getSuccessfulCheckpoints().size());

    // when: Restore latest checkpoint without in-flight data.
    Set<ExecutionJobVertex> tasks = new HashSet<>();
    tasks.add(jobVertex);
    assertTrue(coord.restoreLatestCheckpointedStateToAll(tasks, false));

    // then: All states should be restored successfully except InputChannel and
    // ResultSubpartition which should be ignored.
    verifyStateRestore(jobVertexID, jobVertex, keyGroupPartitions1);
    for (int i = 0; i < jobVertex.getParallelism(); i++) {
        JobManagerTaskRestore taskRestore =
                jobVertex.getTaskVertices()[i].getCurrentExecutionAttempt().getTaskRestore();
        Assert.assertEquals(1L, taskRestore.getRestoreCheckpointId());
        TaskStateSnapshot stateSnapshot = taskRestore.getTaskStateSnapshot();

        OperatorSubtaskState operatorState =
                stateSnapshot.getSubtaskStateByOperatorID(
                        OperatorID.fromJobVertexID(jobVertexID));

        // in-flight data must have been dropped ...
        assertTrue(operatorState.getInputChannelState().isEmpty());
        assertTrue(operatorState.getResultSubpartitionState().isEmpty());

        // ... while each of the four regular state kinds set above must still be present
        assertFalse(operatorState.getRawOperatorState().isEmpty());
        assertFalse(operatorState.getManagedOperatorState().isEmpty());
        assertFalse(operatorState.getRawKeyedState().isEmpty());
        // this previously re-checked the managed operator state, so the managed keyed state
        // was never asserted in this loop
        assertFalse(operatorState.getManagedKeyedState().isEmpty());
    }
}
use of org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint in project flink by apache.
the class CheckpointCoordinatorTest method testStateCleanupForLateOrUnknownMessages.
/**
 * Tests that the state carried by late acknowledge messages is discarded, and that the same
 * happens for acknowledge messages from unknown execution attempts of the same job. Messages
 * that belong to other jobs must not be touched. A late acknowledge message is one that
 * arrives after the checkpoint has been declined.
 *
 * @throws Exception if setting up the coordinator or handling one of the messages fails
 */
@Test
public void testStateCleanupForLateOrUnknownMessages() throws Exception {
    JobVertexID firstVertexId = new JobVertexID();
    JobVertexID secondVertexId = new JobVertexID();

    CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway taskManagerGateway =
            new CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway();

    ExecutionGraph graph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .addJobVertex(firstVertexId)
                    .addJobVertex(secondVertexId, false)
                    .setTaskManagerGateway(taskManagerGateway)
                    .build();
    JobID jobId = graph.getJobID();

    ExecutionVertex firstVertex = graph.getJobVertex(firstVertexId).getTaskVertices()[0];
    ExecutionVertex secondVertex = graph.getJobVertex(secondVertexId).getTaskVertices()[0];
    ExecutionAttemptID firstAttemptId = firstVertex.getCurrentExecutionAttempt().getAttemptId();
    ExecutionAttemptID secondAttemptId =
            secondVertex.getCurrentExecutionAttempt().getAttemptId();

    CheckpointCoordinatorConfiguration coordinatorConfig =
            new CheckpointCoordinatorConfiguration.CheckpointCoordinatorConfigurationBuilder()
                    .setMaxConcurrentCheckpoints(1)
                    .build();
    CheckpointCoordinator coordinator =
            new CheckpointCoordinatorBuilder()
                    .setExecutionGraph(graph)
                    .setCheckpointCoordinatorConfiguration(coordinatorConfig)
                    .setTimer(manuallyTriggeredScheduledExecutor)
                    .build();

    final CompletableFuture<CompletedCheckpoint> triggerFuture =
            coordinator.triggerCheckpoint(false);
    manuallyTriggeredScheduledExecutor.triggerAll();
    FutureUtils.throwIfCompletedExceptionally(triggerFuture);

    assertEquals(1, coordinator.getNumberOfPendingCheckpoints());

    PendingCheckpoint pending = coordinator.getPendingCheckpoints().values().iterator().next();
    long pendingId = pending.getCheckpointId();

    OperatorID firstOperatorId =
            firstVertex.getJobVertex().getOperatorIDs().get(0).getGeneratedOperatorID();

    TaskStateSnapshot firstTaskState = spy(new TaskStateSnapshot());
    OperatorSubtaskState firstOperatorState = mock(OperatorSubtaskState.class);
    firstTaskState.putSubtaskStateByOperatorID(firstOperatorId, firstOperatorState);

    // --- while the checkpoint is still pending ---

    // regular acknowledge from the first vertex: its state has to be kept
    coordinator.receiveAcknowledgeMessage(
            new AcknowledgeCheckpoint(
                    jobId, firstAttemptId, pendingId, new CheckpointMetrics(), firstTaskState),
            TASK_MANAGER_LOCATION_INFO);
    verify(firstOperatorState, never()).discardState();

    // acknowledge from an execution attempt the job does not know: state is discarded
    TaskStateSnapshot unknownAttemptState = mock(TaskStateSnapshot.class);
    coordinator.receiveAcknowledgeMessage(
            new AcknowledgeCheckpoint(
                    jobId,
                    new ExecutionAttemptID(),
                    pendingId,
                    new CheckpointMetrics(),
                    unknownAttemptState),
            TASK_MANAGER_LOCATION_INFO);
    verify(unknownAttemptState, times(1)).discardState();

    // acknowledge that belongs to a different job: must be left alone
    TaskStateSnapshot foreignJobState = mock(TaskStateSnapshot.class);
    coordinator.receiveAcknowledgeMessage(
            new AcknowledgeCheckpoint(
                    new JobID(),
                    new ExecutionAttemptID(),
                    pendingId,
                    new CheckpointMetrics(),
                    foreignJobState),
            TASK_MANAGER_LOCATION_INFO);
    verify(foreignJobState, never()).discardState();

    // second acknowledge from the already acknowledged first vertex: no discard expected
    TaskStateSnapshot duplicateAckState = mock(TaskStateSnapshot.class);
    coordinator.receiveAcknowledgeMessage(
            new AcknowledgeCheckpoint(
                    jobId,
                    firstAttemptId,
                    pendingId,
                    new CheckpointMetrics(),
                    duplicateAckState),
            TASK_MANAGER_LOCATION_INFO);
    verify(duplicateAckState, never()).discardState();

    // --- decline the checkpoint via the first vertex ---

    reset(firstOperatorState);
    coordinator.receiveDeclineMessage(
            new DeclineCheckpoint(
                    jobId,
                    firstAttemptId,
                    pendingId,
                    new CheckpointException(CHECKPOINT_DECLINED)),
            TASK_MANAGER_LOCATION_INFO);
    assertTrue(pending.isDisposed());

    // the state that had been acknowledged before the decline is cleaned up
    verify(firstOperatorState, times(1)).discardState();

    // --- after the checkpoint has been disposed ---

    // late acknowledge from the second vertex: its state is cleaned up as well
    TaskStateSnapshot lateAckState = mock(TaskStateSnapshot.class);
    coordinator.receiveAcknowledgeMessage(
            new AcknowledgeCheckpoint(
                    jobId, secondAttemptId, pendingId, new CheckpointMetrics(), lateAckState),
            TASK_MANAGER_LOCATION_INFO);
    verify(lateAckState, times(1)).discardState();

    // a message of a different job is still not interfered with
    reset(foreignJobState);
    coordinator.receiveAcknowledgeMessage(
            new AcknowledgeCheckpoint(
                    new JobID(),
                    new ExecutionAttemptID(),
                    pendingId,
                    new CheckpointMetrics(),
                    foreignJobState),
            TASK_MANAGER_LOCATION_INFO);
    verify(foreignJobState, never()).discardState();

    // an unknown execution attempt of our own job still gets its state discarded
    TaskStateSnapshot unknownAttemptStateAfterDecline = mock(TaskStateSnapshot.class);
    coordinator.receiveAcknowledgeMessage(
            new AcknowledgeCheckpoint(
                    jobId,
                    new ExecutionAttemptID(),
                    pendingId,
                    new CheckpointMetrics(),
                    unknownAttemptStateAfterDecline),
            TASK_MANAGER_LOCATION_INFO);
    verify(unknownAttemptStateAfterDecline, times(1)).discardState();
}
use of org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint in project flink by apache.
the class CheckpointCoordinatorTest method testCompleteCheckpointFailureWithExternallyInducedSource.
/**
 * Verifies that a checkpoint completes exceptionally and is not recorded as successful when
 * creating the checkpoint metadata output stream throws, in a setup where both tasks are
 * acknowledged from inside a master hook, i.e. before the master state snapshot completes.
 */
@Test
public void testCompleteCheckpointFailureWithExternallyInducedSource() throws Exception {
JobVertexID jobVertexID1 = new JobVertexID();
JobVertexID jobVertexID2 = new JobVertexID();
ExecutionGraph graph = new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder().addJobVertex(jobVertexID1).addJobVertex(jobVertexID2).build();
ExecutionVertex vertex1 = graph.getJobVertex(jobVertexID1).getTaskVertices()[0];
ExecutionVertex vertex2 = graph.getJobVertex(jobVertexID2).getTaskVertices()[0];
ExecutionAttemptID attemptID1 = vertex1.getCurrentExecutionAttempt().getAttemptId();
ExecutionAttemptID attemptID2 = vertex2.getCurrentExecutionAttempt().getAttemptId();
OperatorID opID1 = vertex1.getJobVertex().getOperatorIDs().get(0).getGeneratedOperatorID();
OperatorID opID2 = vertex2.getJobVertex().getOperatorIDs().get(0).getGeneratedOperatorID();
// One (empty) operator subtask state per vertex; these are the snapshots acknowledged by the
// master hook below.
TaskStateSnapshot taskOperatorSubtaskStates1 = new TaskStateSnapshot();
TaskStateSnapshot taskOperatorSubtaskStates2 = new TaskStateSnapshot();
OperatorSubtaskState subtaskState1 = OperatorSubtaskState.builder().build();
OperatorSubtaskState subtaskState2 = OperatorSubtaskState.builder().build();
taskOperatorSubtaskStates1.putSubtaskStateByOperatorID(opID1, subtaskState1);
taskOperatorSubtaskStates2.putSubtaskStateByOperatorID(opID2, subtaskState2);
// Create a mock OperatorCoordinatorCheckpointContext which completes the checkpoint
// immediately.
AtomicBoolean coordCheckpointDone = new AtomicBoolean(false);
OperatorCoordinatorCheckpointContext coordinatorCheckpointContext = new CheckpointCoordinatorTestingUtils.MockOperatorCheckpointCoordinatorContextBuilder().setOnCallingCheckpointCoordinator((checkpointId, result) -> {
coordCheckpointDone.set(true);
result.complete(new byte[0]);
}).setOperatorID(opID1).build();
// set up the coordinator and validate the initial state
CheckpointCoordinator checkpointCoordinator = new CheckpointCoordinatorBuilder().setExecutionGraph(graph).setCheckpointCoordinatorConfiguration(CheckpointCoordinatorConfiguration.builder().setMaxConcurrentCheckpoints(Integer.MAX_VALUE).build()).setTimer(manuallyTriggeredScheduledExecutor).setCoordinatorsToCheckpoint(Collections.singleton(coordinatorCheckpointContext)).setCheckpointStorage(new JobManagerCheckpointStorage() {
private static final long serialVersionUID = 8134582566514272546L;
// Throw exception when finalizing the checkpoint.
@Override
public CheckpointStorageAccess createCheckpointStorage(JobID jobId) throws IOException {
return new MemoryBackendCheckpointStorageAccess(jobId, null, null, 100) {
@Override
public CheckpointStorageLocation initializeLocationForCheckpoint(long checkpointId) throws IOException {
return new NonPersistentMetadataCheckpointStorageLocation(1000) {
@Override
public CheckpointMetadataOutputStream createMetadataOutputStream() throws IOException {
throw new IOException("Artificial Exception");
}
};
}
};
}
}).build();
// NOTE(review): checkpointIdRef is written inside the master hook below but never read in
// this method; confirm whether it is still needed.
AtomicReference<Long> checkpointIdRef = new AtomicReference<>();
// Add a master hook which triggers and acks the task checkpoint immediately.
// In this case the task checkpoints would complete before the job master checkpoint
// completes.
checkpointCoordinator.addMasterHook(new MasterTriggerRestoreHook<Integer>() {
@Override
public String getIdentifier() {
return "anything";
}
@Override
@Nullable
public CompletableFuture<Integer> triggerCheckpoint(long checkpointId, long timestamp, Executor executor) throws Exception {
assertTrue("The coordinator checkpoint should have finished.", coordCheckpointDone.get());
// Acknowledge the checkpoint in the master hooks so the task snapshots
// complete before
// the master state snapshot completes.
checkpointIdRef.set(checkpointId);
AcknowledgeCheckpoint acknowledgeCheckpoint1 = new AcknowledgeCheckpoint(graph.getJobID(), attemptID1, checkpointId, new CheckpointMetrics(), taskOperatorSubtaskStates1);
AcknowledgeCheckpoint acknowledgeCheckpoint2 = new AcknowledgeCheckpoint(graph.getJobID(), attemptID2, checkpointId, new CheckpointMetrics(), taskOperatorSubtaskStates2);
checkpointCoordinator.receiveAcknowledgeMessage(acknowledgeCheckpoint1, TASK_MANAGER_LOCATION_INFO);
checkpointCoordinator.receiveAcknowledgeMessage(acknowledgeCheckpoint2, TASK_MANAGER_LOCATION_INFO);
// The hook itself contributes no master state future.
return null;
}
@Override
public void restoreCheckpoint(long checkpointId, Integer checkpointData) throws Exception {
}
@Override
public SimpleVersionedSerializer<Integer> createCheckpointDataSerializer() {
return new SimpleVersionedSerializer<Integer>() {
@Override
public int getVersion() {
return 0;
}
@Override
public byte[] serialize(Integer obj) throws IOException {
return new byte[0];
}
@Override
public Integer deserialize(int version, byte[] serialized) throws IOException {
return 1;
}
};
}
});
// trigger the checkpoint; it is expected to fail because the checkpoint storage above
// throws when the metadata output stream is created
final CompletableFuture<CompletedCheckpoint> checkpointFuture = checkpointCoordinator.triggerCheckpoint(false);
manuallyTriggeredScheduledExecutor.triggerAll();
// the failure must surface on the returned future and nothing may be counted as successful
assertTrue(checkpointFuture.isCompletedExceptionally());
assertTrue(checkpointCoordinator.getSuccessfulCheckpoints().isEmpty());
}
Aggregations