Use of org.apache.flink.runtime.executiongraph.ExecutionGraph in the Apache Flink project.
Class CheckpointCoordinatorTest, method testSavepointsAreNotSubsumed.
/**
* Verifies that savepoints take no part in checkpoint subsumption: a completing checkpoint does
* not dispose a pending savepoint, and a completing savepoint does not dispose a pending
* checkpoint.
*
* <p>Scenario: a savepoint and two checkpoints are triggered. The second checkpoint completes and
* subsumes the first checkpoint, but not the first savepoint. Then a third checkpoint and a
* second savepoint are triggered. The second savepoint completes, leaving both the third
* checkpoint and the first savepoint pending. Afterwards the first savepoint is acknowledged and
* a fourth checkpoint completes, which is expected to report the second checkpoint as subsumed.
*/
@Test
public void testSavepointsAreNotSubsumed() throws Exception {
// Two job vertices; the first task vertex of each one acknowledges the checkpoints below.
JobVertexID jobVertexID1 = new JobVertexID();
JobVertexID jobVertexID2 = new JobVertexID();
ExecutionGraph graph = new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder().addJobVertex(jobVertexID1).addJobVertex(jobVertexID2).build();
ExecutionVertex vertex1 = graph.getJobVertex(jobVertexID1).getTaskVertices()[0];
ExecutionVertex vertex2 = graph.getJobVertex(jobVertexID2).getTaskVertices()[0];
ExecutionAttemptID attemptID1 = vertex1.getCurrentExecutionAttempt().getAttemptId();
ExecutionAttemptID attemptID2 = vertex2.getCurrentExecutionAttempt().getAttemptId();
// The counter is kept in a local so the id of the most recently triggered checkpoint or
// savepoint can be read back through getLast().
StandaloneCheckpointIDCounter counter = new StandaloneCheckpointIDCounter();
// Set up the coordinator. It is wrapped in a Mockito spy so that the calls to
// sendAcknowledgeMessages(...) can be verified further down.
// NOTE(review): the store argument 1 is presumably the number of retained checkpoints - confirm.
CheckpointCoordinator checkpointCoordinator = spy(new CheckpointCoordinatorBuilder().setExecutionGraph(graph).setCheckpointCoordinatorConfiguration(CheckpointCoordinatorConfiguration.builder().setMaxConcurrentCheckpoints(Integer.MAX_VALUE).build()).setCheckpointIDCounter(counter).setCompletedCheckpointStore(new StandaloneCompletedCheckpointStore(1)).setTimer(manuallyTriggeredScheduledExecutor).build());
String savepointDir = tmpFolder.newFolder().getAbsolutePath();
// Trigger the first savepoint; it stays pending until it is acknowledged near the end.
CompletableFuture<CompletedCheckpoint> savepointFuture1 = checkpointCoordinator.triggerSavepoint(savepointDir, SavepointFormatType.CANONICAL);
manuallyTriggeredScheduledExecutor.triggerAll();
long savepointId1 = counter.getLast();
assertEquals(1, checkpointCoordinator.getNumberOfPendingCheckpoints());
// Trigger the first checkpoint (never acknowledged in this test).
CompletableFuture<CompletedCheckpoint> checkpointFuture1 = checkpointCoordinator.triggerCheckpoint(false);
manuallyTriggeredScheduledExecutor.triggerAll();
assertEquals(2, checkpointCoordinator.getNumberOfPendingCheckpoints());
FutureUtils.throwIfCompletedExceptionally(checkpointFuture1);
// Trigger the second checkpoint and remember its id for the acknowledgements below.
CompletableFuture<CompletedCheckpoint> checkpointFuture2 = checkpointCoordinator.triggerCheckpoint(false);
manuallyTriggeredScheduledExecutor.triggerAll();
FutureUtils.throwIfCompletedExceptionally(checkpointFuture2);
long checkpointId2 = counter.getLast();
assertEquals(3, checkpointCoordinator.getNumberOfPendingCheckpoints());
// 2nd checkpoint should subsume the 1st checkpoint, but not the savepoint
checkpointCoordinator.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(graph.getJobID(), attemptID1, checkpointId2), TASK_MANAGER_LOCATION_INFO);
checkpointCoordinator.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(graph.getJobID(), attemptID2, checkpointId2), TASK_MANAGER_LOCATION_INFO);
// No checkpoint completed before checkpointId2, hence INVALID_CHECKPOINT_ID as last argument.
verify(checkpointCoordinator, times(1)).sendAcknowledgeMessages(anyList(), eq(checkpointId2), anyLong(), eq(INVALID_CHECKPOINT_ID));
// Only the first savepoint is still pending: it was neither disposed nor completed.
assertEquals(1, checkpointCoordinator.getNumberOfPendingCheckpoints());
assertEquals(1, checkpointCoordinator.getNumberOfRetainedSuccessfulCheckpoints());
assertFalse(checkpointCoordinator.getPendingCheckpoints().get(savepointId1).isDisposed());
assertFalse(savepointFuture1.isDone());
// Trigger a third checkpoint, which stays pending for the rest of the test.
CompletableFuture<CompletedCheckpoint> checkpointFuture3 = checkpointCoordinator.triggerCheckpoint(false);
manuallyTriggeredScheduledExecutor.triggerAll();
FutureUtils.throwIfCompletedExceptionally(checkpointFuture3);
assertEquals(2, checkpointCoordinator.getNumberOfPendingCheckpoints());
// Trigger the second savepoint.
CompletableFuture<CompletedCheckpoint> savepointFuture2 = checkpointCoordinator.triggerSavepoint(savepointDir, SavepointFormatType.CANONICAL);
manuallyTriggeredScheduledExecutor.triggerAll();
long savepointId2 = counter.getLast();
FutureUtils.throwIfCompletedExceptionally(savepointFuture2);
assertEquals(3, checkpointCoordinator.getNumberOfPendingCheckpoints());
// savepoints should not subsume checkpoints
checkpointCoordinator.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(graph.getJobID(), attemptID1, savepointId2), TASK_MANAGER_LOCATION_INFO);
checkpointCoordinator.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(graph.getJobID(), attemptID2, savepointId2), TASK_MANAGER_LOCATION_INFO);
// we do not send notify checkpoint complete for savepoints
verify(checkpointCoordinator, times(0)).sendAcknowledgeMessages(anyList(), eq(savepointId2), anyLong(), anyLong());
// The third checkpoint and the first savepoint remain pending; the retained count is unchanged.
assertEquals(2, checkpointCoordinator.getNumberOfPendingCheckpoints());
assertEquals(1, checkpointCoordinator.getNumberOfRetainedSuccessfulCheckpoints());
assertFalse(checkpointCoordinator.getPendingCheckpoints().get(savepointId1).isDisposed());
assertFalse(savepointFuture1.isDone());
assertNotNull(savepointFuture2.get());
// Ack first savepoint
checkpointCoordinator.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(graph.getJobID(), attemptID1, savepointId1), TASK_MANAGER_LOCATION_INFO);
checkpointCoordinator.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(graph.getJobID(), attemptID2, savepointId1), TASK_MANAGER_LOCATION_INFO);
// we do not send notify checkpoint complete for savepoints
verify(checkpointCoordinator, times(0)).sendAcknowledgeMessages(anyList(), eq(savepointId1), anyLong(), anyLong());
// Only the third checkpoint is pending now.
assertEquals(1, checkpointCoordinator.getNumberOfPendingCheckpoints());
assertEquals(1, checkpointCoordinator.getNumberOfRetainedSuccessfulCheckpoints());
assertNotNull(savepointFuture1.get());
// Trigger and fully acknowledge a fourth checkpoint.
CompletableFuture<CompletedCheckpoint> checkpointFuture4 = checkpointCoordinator.triggerCheckpoint(false);
manuallyTriggeredScheduledExecutor.triggerAll();
FutureUtils.throwIfCompletedExceptionally(checkpointFuture4);
long checkpointId4 = counter.getLast();
checkpointCoordinator.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(graph.getJobID(), attemptID1, checkpointId4), TASK_MANAGER_LOCATION_INFO);
checkpointCoordinator.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(graph.getJobID(), attemptID2, checkpointId4), TASK_MANAGER_LOCATION_INFO);
// checkpoint2 would be subsumed, so its id is expected as the last argument
// (the savepoints that completed in between are not reported here).
verify(checkpointCoordinator, times(1)).sendAcknowledgeMessages(anyList(), eq(checkpointId4), anyLong(), eq(checkpointId2));
}
Use of org.apache.flink.runtime.executiongraph.ExecutionGraph in the Apache Flink project.
Class CheckpointCoordinatorTriggeringTest, method testTriggeringFullCheckpoints.
/**
 * Restores from a savepoint in NO_CLAIM mode, takes another savepoint before any checkpoint has
 * completed, and then checks that the next triggered checkpoint is recorded by the task manager
 * gateway as a {@code FULL_CHECKPOINT}.
 */
@Test
public void testTriggeringFullCheckpoints() throws Exception {
    final JobVertexID vertexId = new JobVertexID();
    final CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway recordingGateway =
            new CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway();
    final ExecutionGraph executionGraph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .addJobVertex(vertexId)
                    .setTaskManagerGateway(recordingGateway)
                    .build();
    final ExecutionVertex onlyVertex = executionGraph.getJobVertex(vertexId).getTaskVertices()[0];
    final ExecutionAttemptID attempt = onlyVertex.getCurrentExecutionAttempt().getAttemptId();

    // Produce a savepoint that the coordinator under test can be restored from.
    final CompletedCheckpoint initialSavepoint = takeSavepoint(executionGraph, attempt);

    // Build the coordinator under test and restore it from that savepoint in NO_CLAIM mode.
    final StandaloneCompletedCheckpointStore completedStore =
            new StandaloneCompletedCheckpointStore(1);
    final StandaloneCheckpointIDCounter idCounter = new StandaloneCheckpointIDCounter();
    final CheckpointCoordinator coordinator =
            createCheckpointCoordinator(executionGraph, completedStore, idCounter);
    final SavepointRestoreSettings restoreSettings =
            SavepointRestoreSettings.forPath(
                    initialSavepoint.getExternalPointer(), true, RestoreMode.NO_CLAIM);
    coordinator.restoreSavepoint(
            restoreSettings, executionGraph.getAllVertices(), getClass().getClassLoader());

    // A savepoint taken before any checkpoint completes must not change the outcome:
    // the checkpoint triggered afterwards is still expected to be a full one.
    takeSavepoint(executionGraph, attempt, coordinator, 2);
    coordinator.startCheckpointScheduler();
    // Drop what the gateway recorded so far so that only the next trigger is inspected.
    recordingGateway.resetCount();

    // Trigger the checkpoint under test and acknowledge it under id 3.
    final CompletableFuture<CompletedCheckpoint> fullCheckpoint =
            coordinator.triggerCheckpoint(true);
    manuallyTriggeredScheduledExecutor.triggerAll();
    coordinator.receiveAcknowledgeMessage(
            new AcknowledgeCheckpoint(executionGraph.getJobID(), attempt, 3),
            TASK_MANAGER_LOCATION_INFO);
    fullCheckpoint.get();

    assertThat(
            recordingGateway.getOnlyTriggeredCheckpoint(attempt).checkpointOptions.getCheckpointType(),
            is(CheckpointType.FULL_CHECKPOINT));
}
Use of org.apache.flink.runtime.executiongraph.ExecutionGraph in the Apache Flink project.
Class CheckpointCoordinatorTriggeringTest, method testTriggerCheckpointSnapshotMasterHookFailed.
/**
 * Checks that a checkpoint trigger fails with {@code TRIGGER_CHECKPOINT_FAILURE} when the master
 * hook's checkpoint future completes exceptionally, and that neither a task checkpoint nor a
 * queued trigger request is left behind.
 */
@Test
public void testTriggerCheckpointSnapshotMasterHookFailed() throws Exception {
    final CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway recordingGateway =
            new CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway();
    final JobVertexID vertexId = new JobVertexID();
    final ExecutionGraph executionGraph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .addJobVertex(vertexId)
                    .setTaskManagerGateway(recordingGateway)
                    .build();
    final ExecutionVertex onlyVertex = executionGraph.getJobVertex(vertexId).getTaskVertices()[0];
    final ExecutionAttemptID attempt = onlyVertex.getCurrentExecutionAttempt().getAttemptId();

    // NOTE(review): the coordinator comes from the no-arg helper and is never handed the graph
    // built above, so the gateway assertion at the end may be vacuous - confirm what
    // createCheckpointCoordinator() wires up.
    final CheckpointCoordinator coordinator = createCheckpointCoordinator();

    // Register a master hook whose checkpoint future is controlled by this test.
    final CompletableFuture<String> hookFuture = new CompletableFuture<>();
    coordinator.addMasterHook(new TestingMasterHook(hookFuture));
    coordinator.startCheckpointScheduler();

    final CompletableFuture<CompletedCheckpoint> triggerResult =
            triggerPeriodicCheckpoint(coordinator);

    // The trigger cannot finish while the master hook checkpoint is still outstanding.
    manuallyTriggeredScheduledExecutor.triggerAll();
    assertTrue(coordinator.isTriggering());

    // Fail the master hook and let the trigger run to its end.
    hookFuture.completeExceptionally(new Exception("by design"));
    manuallyTriggeredScheduledExecutor.triggerAll();
    assertFalse(coordinator.isTriggering());

    try {
        triggerResult.get();
        fail("Should not reach here");
    } catch (ExecutionException expected) {
        final Optional<CheckpointException> wrappedFailure =
                ExceptionUtils.findThrowable(expected, CheckpointException.class);
        assertTrue(wrappedFailure.isPresent());
        assertEquals(
                CheckpointFailureReason.TRIGGER_CHECKPOINT_FAILURE,
                wrappedFailure.get().getCheckpointFailureReason());
    }

    // No task manager was asked to checkpoint and nothing is left in the request queue.
    assertEquals(0, recordingGateway.getTriggeredCheckpoints(attempt).size());
    assertEquals(0, coordinator.getTriggerRequestQueue().size());
}
Use of org.apache.flink.runtime.executiongraph.ExecutionGraph in the Apache Flink project.
Class CheckpointCoordinatorTriggeringTest, method testTriggerCheckpointRequestQueuedWithFailure.
/**
 * Issues three checkpoint triggers back to back so that the last two are queued behind the first.
 * The first trigger fails by design; the test then checks that the two queued requests are still
 * processed and reach the task manager gateway.
 */
@Test
public void testTriggerCheckpointRequestQueuedWithFailure() throws Exception {
    final CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway recordingGateway =
            new CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway();
    final JobVertexID vertexId = new JobVertexID();
    final ExecutionGraph executionGraph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .addJobVertex(vertexId)
                    .setTaskManagerGateway(recordingGateway)
                    .build();
    final ExecutionVertex onlyVertex = executionGraph.getJobVertex(vertexId).getTaskVertices()[0];
    final ExecutionAttemptID attempt = onlyVertex.getCurrentExecutionAttempt().getAttemptId();

    // Coordinator backed by an id counter that is unstable on purpose.
    // NOTE(review): the predicate id -> id == 0 presumably selects the ids for which the
    // counter fails - confirm against UnstableCheckpointIDCounter.
    final CheckpointCoordinator coordinator =
            new CheckpointCoordinatorBuilder()
                    .setExecutionGraph(executionGraph)
                    .setCheckpointIDCounter(new UnstableCheckpointIDCounter(id -> id == 0))
                    .setTimer(manuallyTriggeredScheduledExecutor)
                    .build();
    coordinator.startCheckpointScheduler();

    // First trigger: starts right away, so nothing is queued yet.
    final CompletableFuture<CompletedCheckpoint> firstResult =
            triggerNonPeriodicCheckpoint(coordinator);
    assertTrue(coordinator.isTriggering());
    assertEquals(0, coordinator.getTriggerRequestQueue().size());

    // Second and third trigger arrive while the first is still in flight and get queued.
    final CompletableFuture<CompletedCheckpoint> secondResult =
            triggerNonPeriodicCheckpoint(coordinator);
    final CompletableFuture<CompletedCheckpoint> thirdResult =
            triggerNonPeriodicCheckpoint(coordinator);
    assertTrue(coordinator.isTriggering());
    assertEquals(2, coordinator.getTriggerRequestQueue().size());

    manuallyTriggeredScheduledExecutor.triggerAll();

    // Only the first triggered checkpoint fails (by design, through UnstableCheckpointIDCounter).
    assertTrue(firstResult.isCompletedExceptionally());
    assertFalse(secondResult.isCompletedExceptionally());
    assertFalse(thirdResult.isCompletedExceptionally());

    // The queue has been drained and both remaining requests were sent to the gateway.
    assertFalse(coordinator.isTriggering());
    assertEquals(0, coordinator.getTriggerRequestQueue().size());
    assertEquals(2, recordingGateway.getTriggeredCheckpoints(attempt).size());
}
Use of org.apache.flink.runtime.executiongraph.ExecutionGraph in the Apache Flink project.
Class DefaultCheckpointPlanCalculatorTest, method runWithNotRunningTask.
/**
 * For every {@link ExecutionState} other than {@code RUNNING}, builds a two-vertex graph with one
 * vertex in {@code RUNNING} and the other in that state, and expects the checkpoint plan
 * calculation to fail with {@code NOT_ALL_REQUIRED_TASKS_RUNNING}.
 *
 * @param isRunningVertexSource flag passed to the graph builder for the vertex that is running
 * @param isNotRunningVertexSource flag passed to the graph builder for the vertex that is not
 *     running
 * @throws Exception if building the graph or transitioning a vertex fails
 */
private void runWithNotRunningTask(boolean isRunningVertexSource, boolean isNotRunningVertexSource) throws Exception {
    for (ExecutionState otherState : complementOf(EnumSet.of(ExecutionState.RUNNING))) {
        final JobVertexID runningId = new JobVertexID();
        final JobVertexID otherId = new JobVertexID();

        // Vertices are not moved to RUNNING automatically; both states are set explicitly below.
        final ExecutionGraph executionGraph =
                new CheckpointExecutionGraphBuilder()
                        .addJobVertex(runningId, isRunningVertexSource)
                        .addJobVertex(otherId, isNotRunningVertexSource)
                        .setTransitToRunning(false)
                        .build();

        // First vertex: always RUNNING. Second vertex: the non-RUNNING state of this iteration.
        transitVertexToState(executionGraph, runningId, ExecutionState.RUNNING);
        transitVertexToState(executionGraph, otherId, otherState);

        final DefaultCheckpointPlanCalculator planCalculator =
                createCheckpointPlanCalculator(executionGraph);

        try {
            planCalculator.calculateCheckpointPlan().get();
            fail("The computation should fail since some tasks to trigger are in " + otherState + " state");
        } catch (ExecutionException expected) {
            // The plan future must fail with a CheckpointException carrying the expected reason.
            final Throwable rootFailure = expected.getCause();
            assertThat(rootFailure, instanceOf(CheckpointException.class));
            final CheckpointException checkpointFailure = (CheckpointException) rootFailure;
            assertEquals(
                    CheckpointFailureReason.NOT_ALL_REQUIRED_TASKS_RUNNING,
                    checkpointFailure.getCheckpointFailureReason());
        }
    }
}
Aggregations