Use of org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder in project flink by apache.
From the class CheckpointCoordinatorTest, method testCheckpointTriggeredAfterSomeTasksFinishedIfAllowed.
@Test
public void testCheckpointTriggeredAfterSomeTasksFinishedIfAllowed() throws Exception {
    JobVertexID jobVertexID1 = new JobVertexID();
    JobVertexID jobVertexID2 = new JobVertexID();
    ExecutionGraph graph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .addJobVertex(jobVertexID1, 3, 256)
                    .addJobVertex(jobVertexID2, 3, 256)
                    .build();
    ExecutionJobVertex jobVertex1 = graph.getJobVertex(jobVertexID1);
    ExecutionJobVertex jobVertex2 = graph.getJobVertex(jobVertexID2);
    jobVertex1.getTaskVertices()[0].getCurrentExecutionAttempt().markFinished();
    jobVertex1.getTaskVertices()[1].getCurrentExecutionAttempt().markFinished();
    jobVertex2.getTaskVertices()[1].getCurrentExecutionAttempt().markFinished();
    CheckpointStatsTracker statsTracker =
            new CheckpointStatsTracker(Integer.MAX_VALUE, new UnregisteredMetricsGroup());
    CheckpointCoordinator checkpointCoordinator =
            new CheckpointCoordinatorBuilder()
                    .setExecutionGraph(graph)
                    .setTimer(manuallyTriggeredScheduledExecutor)
                    .setAllowCheckpointsAfterTasksFinished(true)
                    .setCheckpointStatsTracker(statsTracker)
                    .build();
    // Nothing should be happening yet.
    assertEquals(0, checkpointCoordinator.getNumberOfPendingCheckpoints());
    assertEquals(0, checkpointCoordinator.getNumberOfRetainedSuccessfulCheckpoints());
    // Trigger the first checkpoint. This will not fail because we allow
    // checkpointing even with finished tasks.
    final CompletableFuture<CompletedCheckpoint> checkpointFuture =
            checkpointCoordinator.triggerCheckpoint(false);
    manuallyTriggeredScheduledExecutor.triggerAll();
    assertFalse(checkpointFuture.isDone());
    assertFalse(checkpointFuture.isCompletedExceptionally());
    // Triggering should succeed.
    assertEquals(1, checkpointCoordinator.getNumberOfPendingCheckpoints());
    PendingCheckpoint pendingCheckpoint =
            checkpointCoordinator.getPendingCheckpoints().values().iterator().next();
    AbstractCheckpointStats checkpointStats =
            statsTracker
                    .createSnapshot()
                    .getHistory()
                    .getCheckpointById(pendingCheckpoint.getCheckpointID());
    assertEquals(3, checkpointStats.getNumberOfAcknowledgedSubtasks());
    for (ExecutionVertex task :
            Arrays.asList(
                    jobVertex1.getTaskVertices()[0],
                    jobVertex1.getTaskVertices()[1],
                    jobVertex2.getTaskVertices()[1])) {
        // Tasks that are already finished are automatically marked as acknowledged.
        assertNotNull(
                checkpointStats
                        .getTaskStateStats(task.getJobvertexId())
                        .getSubtaskStats()[task.getParallelSubtaskIndex()]);
    }
}
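The builder call setAllowCheckpointsAfterTasksFinished(true) is the test-harness counterpart of FLIP-147. As a hedged sketch of how a user job would opt in, assuming the ExecutionCheckpointingOptions.ENABLE_CHECKPOINTS_AFTER_TASKS_FINISH option that shipped with Flink 1.14 (verify the option name against your Flink version):

import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.environment.ExecutionCheckpointingOptions;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

// Sketch, not the test's code: opt in to checkpoints after some tasks finish.
Configuration conf = new Configuration();
conf.set(ExecutionCheckpointingOptions.ENABLE_CHECKPOINTS_AFTER_TASKS_FINISH, true);
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(conf);
// A finite interval is still required to enable periodic checkpointing at all.
env.enableCheckpointing(10_000L);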
Use of org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder in project flink by apache.
From the class CheckpointCoordinatorTest, method testBaseLocationsNotInitialized.
@Test
public void testBaseLocationsNotInitialized() throws Exception {
    File checkpointDir = tmpFolder.newFolder();
    JobVertexID jobVertexID = new JobVertexID();
    ExecutionGraph graph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .addJobVertex(jobVertexID)
                    .setTransitToRunning(false)
                    .build();
    CheckpointCoordinator checkpointCoordinator =
            new CheckpointCoordinatorBuilder()
                    .setExecutionGraph(graph)
                    .setCheckpointCoordinatorConfiguration(
                            CheckpointCoordinatorConfiguration.builder()
                                    .setCheckpointInterval(Long.MAX_VALUE)
                                    .build())
                    .setCheckpointStorage(new FsStateBackend(checkpointDir.toURI()))
                    .build();
    Path jobCheckpointPath = new Path(checkpointDir.getAbsolutePath(), graph.getJobID().toString());
    FileSystem fs = FileSystem.get(checkpointDir.toURI());
    // The directory will not be created because a checkpoint interval of
    // Long.MAX_VALUE disables periodic checkpointing.
    Assert.assertFalse(fs.exists(jobCheckpointPath));
}
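For contrast, a minimal sketch of the complementary case, reusing the variables from the test above and assuming that a finite checkpoint interval makes the coordinator initialize the base locations eagerly (the actual companion test in CheckpointCoordinatorTest may differ in details):

// Hedged sketch: with checkpointing enabled, the job's checkpoint directory
// should exist once the coordinator has been built.
CheckpointCoordinator enabledCoordinator =
        new CheckpointCoordinatorBuilder()
                .setExecutionGraph(graph)
                .setCheckpointCoordinatorConfiguration(
                        CheckpointCoordinatorConfiguration.builder()
                                .setCheckpointInterval(10L)
                                .build())
                .setCheckpointStorage(new FsStateBackend(checkpointDir.toURI()))
                .build();
Assert.assertTrue(fs.exists(jobCheckpointPath));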
Use of org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder in project flink by apache.
From the class CheckpointCoordinatorTest, method testSharedStateRegistrationOnRestore.
@Test
public void testSharedStateRegistrationOnRestore() throws Exception {
    JobVertexID jobVertexID1 = new JobVertexID();
    int parallelism1 = 2;
    int maxParallelism1 = 4;
    ExecutionGraph graph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .addJobVertex(jobVertexID1, parallelism1, maxParallelism1)
                    .build();
    ExecutionJobVertex jobVertex1 = graph.getJobVertex(jobVertexID1);
    List<CompletedCheckpoint> checkpoints = Collections.emptyList();
    SharedStateRegistry firstInstance =
            SharedStateRegistry.DEFAULT_FACTORY.create(
                    org.apache.flink.util.concurrent.Executors.directExecutor(), checkpoints);
    final EmbeddedCompletedCheckpointStore store =
            new EmbeddedCompletedCheckpointStore(10, checkpoints, firstInstance);
    // Set up the coordinator and validate the initial state.
    final CheckpointCoordinatorBuilder coordinatorBuilder =
            new CheckpointCoordinatorBuilder()
                    .setExecutionGraph(graph)
                    .setTimer(manuallyTriggeredScheduledExecutor);
    final CheckpointCoordinator coordinator =
            coordinatorBuilder.setCompletedCheckpointStore(store).build();
    final int numCheckpoints = 3;
    List<KeyGroupRange> keyGroupPartitions1 =
            StateAssignmentOperation.createKeyGroupPartitions(maxParallelism1, parallelism1);
    for (int i = 0; i < numCheckpoints; ++i) {
        performIncrementalCheckpoint(
                graph.getJobID(), coordinator, jobVertex1, keyGroupPartitions1, i);
    }
    List<CompletedCheckpoint> completedCheckpoints = coordinator.getSuccessfulCheckpoints();
    assertEquals(numCheckpoints, completedCheckpoints.size());
    int sharedHandleCount = 0;
    List<Map<StateHandleID, StreamStateHandle>> sharedHandlesByCheckpoint =
            new ArrayList<>(numCheckpoints);
    for (int i = 0; i < numCheckpoints; ++i) {
        sharedHandlesByCheckpoint.add(new HashMap<>(2));
    }
    int cp = 0;
    for (CompletedCheckpoint completedCheckpoint : completedCheckpoints) {
        for (OperatorState taskState : completedCheckpoint.getOperatorStates().values()) {
            for (OperatorSubtaskState subtaskState : taskState.getStates()) {
                for (KeyedStateHandle keyedStateHandle : subtaskState.getManagedKeyedState()) {
                    // Verify that each handle was registered exactly once with the
                    // current registry.
                    verify(keyedStateHandle, times(1))
                            .registerSharedStates(
                                    firstInstance, completedCheckpoint.getCheckpointID());
                    IncrementalRemoteKeyedStateHandle incrementalKeyedStateHandle =
                            (IncrementalRemoteKeyedStateHandle) keyedStateHandle;
                    sharedHandlesByCheckpoint
                            .get(cp)
                            .putAll(incrementalKeyedStateHandle.getSharedState());
                    for (StreamStateHandle streamStateHandle :
                            incrementalKeyedStateHandle.getSharedState().values()) {
                        assertFalse(streamStateHandle instanceof PlaceholderStreamStateHandle);
                        verify(streamStateHandle, never()).discardState();
                        ++sharedHandleCount;
                    }
                    for (StreamStateHandle streamStateHandle :
                            incrementalKeyedStateHandle.getPrivateState().values()) {
                        verify(streamStateHandle, never()).discardState();
                    }
                    verify(incrementalKeyedStateHandle.getMetaStateHandle(), never())
                            .discardState();
                }
                verify(subtaskState, never()).discardState();
            }
        }
        ++cp;
    }
    // 2 (parallelism) x (1 shared handle in CP0 + 2 in CP1 + 2 in CP2) = 10
    assertEquals(10, sharedHandleCount);
    // Discard CP0.
    store.removeOldestCheckpoint();
    // No shared state should have been discarded yet, because the state of CP0
    // is still referenced by CP1.
    for (Map<StateHandleID, StreamStateHandle> cpList : sharedHandlesByCheckpoint) {
        for (StreamStateHandle streamStateHandle : cpList.values()) {
            verify(streamStateHandle, never()).discardState();
        }
    }
    // Shut down the store.
    store.shutdown(JobStatus.SUSPENDED, new CheckpointsCleaner());
    // Restore from the store.
    Set<ExecutionJobVertex> tasks = new HashSet<>();
    tasks.add(jobVertex1);
    assertEquals(JobStatus.SUSPENDED, store.getShutdownStatus().orElse(null));
    SharedStateRegistry secondInstance =
            SharedStateRegistry.DEFAULT_FACTORY.create(
                    org.apache.flink.util.concurrent.Executors.directExecutor(),
                    store.getAllCheckpoints());
    final EmbeddedCompletedCheckpointStore secondStore =
            new EmbeddedCompletedCheckpointStore(10, store.getAllCheckpoints(), secondInstance);
    final CheckpointCoordinator secondCoordinator =
            coordinatorBuilder.setCompletedCheckpointStore(secondStore).build();
    assertTrue(secondCoordinator.restoreLatestCheckpointedStateToAll(tasks, false));
    // Validate that all shared states are registered again after the recovery.
    cp = 0;
    for (CompletedCheckpoint completedCheckpoint : completedCheckpoints) {
        for (OperatorState taskState : completedCheckpoint.getOperatorStates().values()) {
            for (OperatorSubtaskState subtaskState : taskState.getStates()) {
                for (KeyedStateHandle keyedStateHandle : subtaskState.getManagedKeyedState()) {
                    // CP0 was removed before the shutdown, so only the handles of
                    // CP1 and CP2 are expected to be registered with the new registry.
                    VerificationMode verificationMode = cp > 0 ? times(1) : never();
                    verify(keyedStateHandle, verificationMode)
                            .registerSharedStates(
                                    secondInstance, completedCheckpoint.getCheckpointID());
                }
            }
        }
        ++cp;
    }
    // Discard CP1.
    secondStore.removeOldestCheckpoint();
    // We expect that all shared state from CP0 is no longer referenced and thus
    // discarded. CP2 is still live and still references the state from CP1, so
    // that state must not be discarded.
    verifyDiscard(sharedHandlesByCheckpoint, cpId -> cpId == 0 ? times(1) : never());
    // Discard CP2.
    secondStore.removeOldestCheckpoint();
    // CP1's shared state must still not be discarded, because it may be used by
    // later checkpoints.
    verifyDiscard(sharedHandlesByCheckpoint, cpId -> cpId == 1 ? never() : atLeast(0));
}
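The verifyDiscard helper is defined elsewhere in CheckpointCoordinatorTest. A minimal sketch of the shape its call sites imply; the signature and parameter names here are assumptions, not the actual Flink source:

import java.util.List;
import java.util.Map;
import java.util.function.Function;
import static org.mockito.Mockito.verify;

// Assumed shape only: apply a per-checkpoint Mockito VerificationMode to every
// shared state handle recorded for that checkpoint.
private static void verifyDiscard(
        List<Map<StateHandleID, StreamStateHandle>> sharedHandlesByCheckpoint,
        Function<Integer, VerificationMode> checkpointVerify) throws Exception {
    for (int cp = 0; cp < sharedHandlesByCheckpoint.size(); cp++) {
        for (StreamStateHandle handle : sharedHandlesByCheckpoint.get(cp).values()) {
            verify(handle, checkpointVerify.apply(cp)).discardState();
        }
    }
}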
Use of org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder in project flink by apache.
From the class CheckpointCoordinatorTest, method testConcurrentSavepoints.
/**
 * Tests that savepoints can be triggered concurrently.
 */
@Test
public void testConcurrentSavepoints() throws Exception {
    int numSavepoints = 5;
    JobVertexID jobVertexID1 = new JobVertexID();
    ExecutionGraph graph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .addJobVertex(jobVertexID1)
                    .build();
    ExecutionVertex vertex1 = graph.getJobVertex(jobVertexID1).getTaskVertices()[0];
    ExecutionAttemptID attemptID1 = vertex1.getCurrentExecutionAttempt().getAttemptId();
    StandaloneCheckpointIDCounter checkpointIDCounter = new StandaloneCheckpointIDCounter();
    CheckpointCoordinatorConfiguration chkConfig =
            new CheckpointCoordinatorConfiguration.CheckpointCoordinatorConfigurationBuilder()
                    // max one checkpoint at a time => should not affect savepoints
                    .setMaxConcurrentCheckpoints(1)
                    .build();
    CheckpointCoordinator checkpointCoordinator =
            new CheckpointCoordinatorBuilder()
                    .setExecutionGraph(graph)
                    .setCheckpointCoordinatorConfiguration(chkConfig)
                    .setCheckpointIDCounter(checkpointIDCounter)
                    .setCompletedCheckpointStore(new StandaloneCompletedCheckpointStore(2))
                    .setTimer(manuallyTriggeredScheduledExecutor)
                    .build();
    List<CompletableFuture<CompletedCheckpoint>> savepointFutures = new ArrayList<>();
    String savepointDir = tmpFolder.newFolder().getAbsolutePath();
    // Trigger the savepoints.
    for (int i = 0; i < numSavepoints; i++) {
        savepointFutures.add(
                checkpointCoordinator.triggerSavepoint(savepointDir, SavepointFormatType.CANONICAL));
    }
    // After triggering multiple savepoints, all of them should be in progress.
    for (CompletableFuture<CompletedCheckpoint> savepointFuture : savepointFutures) {
        assertFalse(savepointFuture.isDone());
    }
    manuallyTriggeredScheduledExecutor.triggerAll();
    // ACK all savepoints, starting from the most recently assigned checkpoint ID.
    long checkpointId = checkpointIDCounter.getLast();
    for (int i = 0; i < numSavepoints; i++, checkpointId--) {
        checkpointCoordinator.receiveAcknowledgeMessage(
                new AcknowledgeCheckpoint(graph.getJobID(), attemptID1, checkpointId),
                TASK_MANAGER_LOCATION_INFO);
    }
    // After the ACKs, all savepoints should be completed.
    for (CompletableFuture<CompletedCheckpoint> savepointFuture : savepointFutures) {
        assertNotNull(savepointFuture.get());
    }
}
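Outside the test harness, savepoints are triggered through a client rather than directly on the coordinator. A hedged sketch, assuming the ClusterClient#triggerSavepoint overload with a SavepointFormatType parameter available in Flink 1.15+ (check the API of your Flink version):

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.CompletableFuture;
import org.apache.flink.api.common.JobID;
import org.apache.flink.client.program.ClusterClient;
import org.apache.flink.core.execution.SavepointFormatType;

// Sketch only: trigger several canonical savepoints for a running job; each
// future completes with the path of the written savepoint.
static List<CompletableFuture<String>> triggerConcurrentSavepoints(
        ClusterClient<?> client, JobID jobId, String savepointDir, int numSavepoints) {
    List<CompletableFuture<String>> futures = new ArrayList<>(numSavepoints);
    for (int i = 0; i < numSavepoints; i++) {
        futures.add(client.triggerSavepoint(jobId, savepointDir, SavepointFormatType.CANONICAL));
    }
    return futures;
}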
Use of org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder in project flink by apache.
From the class CheckpointCoordinatorTest, method testNotifyCheckpointAbortionInOperatorCoordinator.
@Test
public void testNotifyCheckpointAbortionInOperatorCoordinator() throws Exception {
    JobVertexID jobVertexID = new JobVertexID();
    ExecutionGraph graph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .addJobVertex(jobVertexID)
                    .build();
    ExecutionVertex executionVertex = graph.getJobVertex(jobVertexID).getTaskVertices()[0];
    ExecutionAttemptID attemptID = executionVertex.getCurrentExecutionAttempt().getAttemptId();
    CheckpointCoordinatorTestingUtils.MockOperatorCoordinatorCheckpointContext context =
            new CheckpointCoordinatorTestingUtils.MockOperatorCheckpointCoordinatorContextBuilder()
                    .setOperatorID(new OperatorID())
                    .setOnCallingCheckpointCoordinator(
                            (ignored, future) -> future.complete(new byte[0]))
                    .build();
    // Set up the coordinator and validate the initial state.
    CheckpointCoordinator checkpointCoordinator =
            new CheckpointCoordinatorBuilder()
                    .setExecutionGraph(graph)
                    .setCheckpointCoordinatorConfiguration(
                            CheckpointCoordinatorConfiguration.builder()
                                    .setMaxConcurrentCheckpoints(Integer.MAX_VALUE)
                                    .build())
                    .setTimer(manuallyTriggeredScheduledExecutor)
                    .setCoordinatorsToCheckpoint(Collections.singleton(context))
                    .build();
    try {
        // Trigger checkpoint 1.
        checkpointCoordinator.triggerCheckpoint(false);
        manuallyTriggeredScheduledExecutor.triggerAll();
        long checkpointId1 = Collections.max(checkpointCoordinator.getPendingCheckpoints().keySet());
        // Trigger checkpoint 2.
        checkpointCoordinator.triggerCheckpoint(false);
        manuallyTriggeredScheduledExecutor.triggerAll();
        // Acknowledge checkpoint 2. This should abort checkpoint 1.
        long checkpointId2 = Collections.max(checkpointCoordinator.getPendingCheckpoints().keySet());
        AcknowledgeCheckpoint acknowledgeCheckpoint2 =
                new AcknowledgeCheckpoint(
                        graph.getJobID(), attemptID, checkpointId2, new CheckpointMetrics(), null);
        checkpointCoordinator.receiveAcknowledgeMessage(acknowledgeCheckpoint2, "");
        // The OperatorCoordinator should have been notified of the abortion of
        // checkpoint 1 and the completion of checkpoint 2.
        assertEquals(Collections.singletonList(checkpointId1), context.getAbortedCheckpoints());
        assertEquals(Collections.singletonList(checkpointId2), context.getCompletedCheckpoints());
    } finally {
        checkpointCoordinator.shutdown();
    }
}
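The abort happens because completing a checkpoint subsumes every pending checkpoint with a smaller ID. A self-contained toy model of that rule (illustrative only, not Flink's implementation):

import java.util.ArrayList;
import java.util.List;
import java.util.NavigableMap;
import java.util.TreeMap;

// Toy model: completing checkpoint id aborts every pending checkpoint with a
// smaller id, mirroring the notification order asserted in the test above.
final class SubsumptionModel {
    private final NavigableMap<Long, String> pending = new TreeMap<>();
    private final List<Long> aborted = new ArrayList<>();
    private final List<Long> completed = new ArrayList<>();

    void trigger(long id) {
        pending.put(id, "pending-" + id);
    }

    void complete(long id) {
        pending.remove(id);
        completed.add(id);
        // Abort (subsume) everything strictly older than the completed checkpoint.
        for (Long older : new ArrayList<>(pending.headMap(id, false).keySet())) {
            pending.remove(older);
            aborted.add(older);
        }
    }

    public static void main(String[] args) {
        SubsumptionModel m = new SubsumptionModel();
        m.trigger(1L);
        m.trigger(2L);
        m.complete(2L);
        System.out.println("aborted=" + m.aborted + " completed=" + m.completed);
        // Prints: aborted=[1] completed=[2]
    }
}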