Use of org.apache.flink.runtime.executiongraph.ExecutionGraph in the Apache Flink project.
From the class CheckpointCoordinatorRestoringTest, method testJobGraphModificationsAreCheckedForInitialCheckpoint.
/**
 * Verifies that restoring the initial checkpoint on startup invokes the finished-state
 * validation of the {@code VertexFinishedStateChecker} produced by the configured factory.
 */
@Test
public void testJobGraphModificationsAreCheckedForInitialCheckpoint() throws Exception {
    final JobVertexID vertexId = new JobVertexID();
    final ExecutionGraph executionGraph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .addJobVertex(vertexId, 1, 1)
                    .build();

    // Seed the store with one completed checkpoint (id 2, no operator states) to restore from.
    final CompletedCheckpointStore checkpointStore = new EmbeddedCompletedCheckpointStore();
    final CompletedCheckpoint storedCheckpoint =
            new CompletedCheckpoint(
                    executionGraph.getJobID(),
                    2,
                    System.currentTimeMillis(),
                    System.currentTimeMillis() + 3000,
                    Collections.emptyMap(),
                    Collections.emptyList(),
                    CheckpointProperties.forCheckpoint(
                            CheckpointRetentionPolicy.NEVER_RETAIN_AFTER_TERMINATION),
                    new TestCompletedCheckpointStorageLocation());
    checkpointStore.addCheckpointAndSubsumeOldestOne(
            storedCheckpoint, new CheckpointsCleaner(), () -> {});

    // Set to true by the overridden checker below as soon as validation is requested.
    final BooleanValue validationRequested = new BooleanValue(false);
    final CheckpointCoordinator coordinator =
            new CheckpointCoordinatorBuilder()
                    .setExecutionGraph(executionGraph)
                    .setCompletedCheckpointStore(checkpointStore)
                    .setVertexFinishedStateCheckerFactory(
                            (vertices, states) ->
                                    new VertexFinishedStateChecker(vertices, states) {
                                        @Override
                                        public void validateOperatorsFinishedState() {
                                            // Record the call instead of validating anything.
                                            validationRequested.set(true);
                                        }
                                    })
                    .build();

    coordinator.restoreInitialCheckpointIfPresent(
            new HashSet<>(executionGraph.getAllVertices().values()));

    assertTrue(
            "The finished states should be checked when job is restored on startup",
            validationRequested.get());
}
Use of org.apache.flink.runtime.executiongraph.ExecutionGraph in the Apache Flink project.
From the class CheckpointCoordinatorTest, method testMaxConcurrentAttemptsWithSubsumption.
/**
 * Checks that, with at most two concurrent checkpoints, acknowledging the second pending
 * checkpoint frees enough capacity for checkpoints 3 and 4 to become pending.
 */
@Test
public void testMaxConcurrentAttemptsWithSubsumption() throws Exception {
    final int concurrencyLimit = 2;

    final JobVertexID vertexId = new JobVertexID();
    final ExecutionGraph executionGraph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .addJobVertex(vertexId)
                    .build();
    final ExecutionVertex onlyTask = executionGraph.getJobVertex(vertexId).getTaskVertices()[0];
    final ExecutionAttemptID onlyAttempt = onlyTask.getCurrentExecutionAttempt().getAttemptId();

    // Periodic interval of 10 ms, a very long timeout (200 s), and no minimum pause.
    final CheckpointCoordinatorConfiguration coordinatorConfig =
            new CheckpointCoordinatorConfiguration.CheckpointCoordinatorConfigurationBuilder()
                    .setCheckpointInterval(10)
                    .setCheckpointTimeout(200000)
                    .setMinPauseBetweenCheckpoints(0L)
                    .setMaxConcurrentCheckpoints(concurrencyLimit)
                    .build();
    final CheckpointCoordinator coordinator =
            new CheckpointCoordinatorBuilder()
                    .setExecutionGraph(executionGraph)
                    .setCheckpointCoordinatorConfiguration(coordinatorConfig)
                    .setCompletedCheckpointStore(new StandaloneCompletedCheckpointStore(2))
                    .setTimer(manuallyTriggeredScheduledExecutor)
                    .build();
    coordinator.startCheckpointScheduler();

    // Drive the manual executor until the coordinator has reached the concurrency limit.
    do {
        manuallyTriggeredScheduledExecutor.triggerPeriodicScheduledTasks();
        manuallyTriggeredScheduledExecutor.triggerAll();
    } while (coordinator.getNumberOfPendingCheckpoints() < concurrencyLimit);

    // Checkpoints 1 and 2 must both be pending now.
    assertEquals(concurrencyLimit, coordinator.getNumberOfPendingCheckpoints());
    assertNotNull(coordinator.getPendingCheckpoints().get(1L));
    assertNotNull(coordinator.getPendingCheckpoints().get(2L));

    // Acknowledge checkpoint 2 from the only task; this is expected to subsume checkpoint 1
    // as well, leaving room for two new checkpoints.
    coordinator.receiveAcknowledgeMessage(
            new AcknowledgeCheckpoint(executionGraph.getJobID(), onlyAttempt, 2L),
            TASK_MANAGER_LOCATION_INFO);

    // Keep driving the executor until the limit is reached again.
    do {
        manuallyTriggeredScheduledExecutor.triggerPeriodicScheduledTasks();
        manuallyTriggeredScheduledExecutor.triggerAll();
    } while (coordinator.getNumberOfPendingCheckpoints() < concurrencyLimit);

    // The pending set must now consist of the two follow-up checkpoints.
    assertEquals(concurrencyLimit, coordinator.getNumberOfPendingCheckpoints());
    assertNotNull(coordinator.getPendingCheckpoints().get(3L));
    assertNotNull(coordinator.getPendingCheckpoints().get(4L));

    coordinator.shutdown();
}
Use of org.apache.flink.runtime.executiongraph.ExecutionGraph in the Apache Flink project.
From the class CheckpointCoordinatorTest, method testTasksFinishDuringTriggering.
/**
 * Covers a task that finishes while its checkpoint trigger call is in flight: the trigger call
 * to that task fails, the checkpoint future is expected to complete exceptionally, and the
 * other task is expected to be notified that the checkpoint was aborted.
 */
@Test
public void testTasksFinishDuringTriggering() throws Exception {
    final JobVertexID firstVertexId = new JobVertexID();
    final JobVertexID secondVertexId = new JobVertexID();
    final ExecutionGraph executionGraph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .setTransitToRunning(false)
                    .addJobVertex(firstVertexId, 1, 256)
                    .addJobVertex(secondVertexId, 1, 256)
                    .build();

    final ExecutionVertex finishingTask =
            executionGraph.getJobVertex(firstVertexId).getTaskVertices()[0];
    final ExecutionVertex survivingTask =
            executionGraph.getJobVertex(secondVertexId).getTaskVertices()[0];

    final AtomicBoolean abortNotified = new AtomicBoolean(false);

    // Gateway of the first task: marks the task finished when asked to trigger a checkpoint
    // and reports the trigger call itself as failed.
    final LogicalSlot finishingSlot =
            new TestingLogicalSlotBuilder()
                    .setTaskManagerGateway(
                            new SimpleAckingTaskManagerGateway() {
                                @Override
                                public CompletableFuture<Acknowledge> triggerCheckpoint(
                                        ExecutionAttemptID executionAttemptID,
                                        JobID jobId,
                                        long checkpointId,
                                        long timestamp,
                                        CheckpointOptions checkpointOptions) {
                                    finishingTask.getCurrentExecutionAttempt().markFinished();
                                    return FutureUtils.completedExceptionally(
                                            new RpcException(""));
                                }
                            })
                    .createTestingLogicalSlot();

    // Gateway of the second task: only records that an abort notification arrived.
    final LogicalSlot survivingSlot =
            new TestingLogicalSlotBuilder()
                    .setTaskManagerGateway(
                            new SimpleAckingTaskManagerGateway() {
                                @Override
                                public void notifyCheckpointAborted(
                                        ExecutionAttemptID executionAttemptID,
                                        JobID jobId,
                                        long checkpointId,
                                        long latestCompletedCheckpointId,
                                        long timestamp) {
                                    abortNotified.set(true);
                                }
                            })
                    .createTestingLogicalSlot();

    // The graph was built without transitioning to RUNNING, so assign slots and switch both
    // tasks to RUNNING by hand.
    ExecutionGraphTestUtils.setVertexResource(finishingTask, finishingSlot);
    finishingTask.getCurrentExecutionAttempt().transitionState(ExecutionState.RUNNING);
    ExecutionGraphTestUtils.setVertexResource(survivingTask, survivingSlot);
    survivingTask.getCurrentExecutionAttempt().transitionState(ExecutionState.RUNNING);

    final CheckpointCoordinator coordinator =
            new CheckpointCoordinatorBuilder()
                    .setExecutionGraph(executionGraph)
                    .setTimer(manuallyTriggeredScheduledExecutor)
                    .setAllowCheckpointsAfterTasksFinished(true)
                    .build();

    // Nothing has been triggered or retained yet.
    assertEquals(0, coordinator.getNumberOfPendingCheckpoints());
    assertEquals(0, coordinator.getNumberOfRetainedSuccessfulCheckpoints());

    // Checkpointing with finished tasks is allowed, so the trigger is attempted; the failing
    // trigger call of the first task then makes the checkpoint itself fail.
    final CompletableFuture<CompletedCheckpoint> triggerResult =
            coordinator.triggerCheckpoint(false);
    manuallyTriggeredScheduledExecutor.triggerAll();

    assertTrue(triggerResult.isCompletedExceptionally());
    assertTrue(abortNotified.get());
}
Use of org.apache.flink.runtime.executiongraph.ExecutionGraph in the Apache Flink project.
From the class CheckpointCoordinatorTest, method testMinCheckpointPause.
/**
 * Verifies that a second, already queued checkpoint request is not executed right after the
 * first checkpoint is acknowledged, but is still executed eventually. The coordinator is
 * configured with a minimum pause between checkpoints and at most one concurrent checkpoint.
 *
 * <p>This test is timing based: it uses a real single-threaded scheduled executor as the
 * coordinator's timer and relies on {@code Thread.sleep}.
 */
@Test
public void testMinCheckpointPause() throws Exception {
// will use a different thread to allow checkpoint triggering before exiting from
// receiveAcknowledgeMessage
ScheduledExecutorService executorService = Executors.newSingleThreadScheduledExecutor();
CheckpointCoordinator coordinator = null;
try {
// used as checkpoint interval and as minimum pause; presumably milliseconds (it is also
// the basis for the Thread.sleep below) -- TODO confirm against the configuration builder
int pause = 1000;
JobVertexID jobVertexId = new JobVertexID();
ExecutionGraph graph = new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder().addJobVertex(jobVertexId).setMainThreadExecutor(ComponentMainThreadExecutorServiceAdapter.forSingleThreadExecutor(new DirectScheduledExecutorService())).build();
ExecutionVertex vertex = graph.getJobVertex(jobVertexId).getTaskVertices()[0];
ExecutionAttemptID attemptId = vertex.getCurrentExecutionAttempt().getAttemptId();
// at most one concurrent checkpoint, effectively no timeout, interval == min pause == pause
coordinator = new CheckpointCoordinatorBuilder().setTimer(new ScheduledExecutorServiceAdapter(executorService)).setCheckpointCoordinatorConfiguration(CheckpointCoordinatorConfiguration.builder().setCheckpointInterval(pause).setCheckpointTimeout(Long.MAX_VALUE).setMaxConcurrentCheckpoints(1).setMinPauseBetweenCheckpoints(pause).build()).setExecutionGraph(graph).build();
coordinator.startCheckpointScheduler();
coordinator.triggerCheckpoint(// first request: trigger, execute, and later complete by receiveAcknowledgeMessage
true);
coordinator.triggerCheckpoint(// second request: enqueue, then check below that it is not executed right after the acknowledge
true);
// wait (without a local timeout) until the first request has become a pending checkpoint
while (coordinator.getNumberOfPendingCheckpoints() == 0) {
// wait for at least 1 request to be fully processed
Thread.sleep(10);
}
// acknowledge checkpoint 1 from the only task
coordinator.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(graph.getJobID(), attemptId, 1L), TASK_MANAGER_LOCATION_INFO);
// after half of the pause the queued second request must not have been started yet
Thread.sleep(pause / 2);
assertEquals(0, coordinator.getNumberOfPendingCheckpoints());
// make sure that the 2nd request is eventually processed
// (this loop has no local timeout; it only ends once a checkpoint becomes pending)
while (coordinator.getNumberOfPendingCheckpoints() == 0) {
Thread.sleep(1);
}
} finally {
// always release the coordinator (if it was created) and the real executor thread
if (coordinator != null) {
coordinator.shutdown();
}
executorService.shutdownNow();
}
}
Use of org.apache.flink.runtime.executiongraph.ExecutionGraph in the Apache Flink project.
From the class CheckpointCoordinatorTest, method testTriggerAndDeclineCheckpointSimple.
/**
 * Triggers a checkpoint over two tasks, acknowledges it from one task and then declines it
 * from the other. The declined checkpoint is expected to be disposed, its scheduled canceller
 * removed, and no pending checkpoint to remain. Further decline messages for the same
 * checkpoint must have no additional effect, and the fail-job callback must have been invoked
 * exactly once.
 *
 * @param checkpointFailureReason the failure reason carried by the decline messages
 */
private void testTriggerAndDeclineCheckpointSimple(CheckpointFailureReason checkpointFailureReason) throws Exception {
    final CheckpointException declineCause = new CheckpointException(checkpointFailureReason);

    final JobVertexID firstVertexId = new JobVertexID();
    final JobVertexID secondVertexId = new JobVertexID();

    final CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway recordingGateway =
            new CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway();

    final ExecutionGraph executionGraph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .addJobVertex(firstVertexId)
                    .addJobVertex(secondVertexId)
                    .setTaskManagerGateway(recordingGateway)
                    .build();

    final ExecutionVertex firstTask = executionGraph.getJobVertex(firstVertexId).getTaskVertices()[0];
    final ExecutionVertex secondTask = executionGraph.getJobVertex(secondVertexId).getTaskVertices()[0];
    final ExecutionAttemptID firstAttempt = firstTask.getCurrentExecutionAttempt().getAttemptId();
    final ExecutionAttemptID secondAttempt = secondTask.getCurrentExecutionAttempt().getAttemptId();

    final TestFailJobCallback jobFailureCallback = new TestFailJobCallback();

    // Build the coordinator; the failure manager is created with 0 as its first argument and
    // reports to the callback above.
    final CheckpointCoordinator coordinator =
            new CheckpointCoordinatorBuilder()
                    .setExecutionGraph(executionGraph)
                    .setCheckpointCoordinatorConfiguration(
                            CheckpointCoordinatorConfiguration.builder()
                                    .setAlignedCheckpointTimeout(Long.MAX_VALUE)
                                    .setMaxConcurrentCheckpoints(Integer.MAX_VALUE)
                                    .build())
                    .setTimer(manuallyTriggeredScheduledExecutor)
                    .setCheckpointFailureManager(new CheckpointFailureManager(0, jobFailureCallback))
                    .build();

    // Initial state: nothing pending, nothing retained.
    assertEquals(0, coordinator.getNumberOfPendingCheckpoints());
    assertEquals(0, coordinator.getNumberOfRetainedSuccessfulCheckpoints());

    // Trigger the first checkpoint; triggering itself must succeed.
    final CompletableFuture<CompletedCheckpoint> triggerResult = coordinator.triggerCheckpoint(false);
    manuallyTriggeredScheduledExecutor.triggerAll();
    FutureUtils.throwIfCompletedExceptionally(triggerResult);

    // Exactly one checkpoint is pending and none has completed.
    assertEquals(1, coordinator.getNumberOfPendingCheckpoints());
    assertEquals(0, coordinator.getNumberOfRetainedSuccessfulCheckpoints());

    // One scheduled task exists: the canceller that fires after the checkpoint timeout.
    assertEquals(1, manuallyTriggeredScheduledExecutor.getActiveScheduledTasks().size());

    final long pendingId = coordinator.getPendingCheckpoints().entrySet().iterator().next().getKey();
    final PendingCheckpoint pending = coordinator.getPendingCheckpoints().get(pendingId);

    // Inspect the freshly created pending checkpoint.
    assertNotNull(pending);
    assertEquals(pendingId, pending.getCheckpointId());
    assertEquals(executionGraph.getJobID(), pending.getJobId());
    assertEquals(2, pending.getNumberOfNonAcknowledgedTasks());
    assertEquals(0, pending.getNumberOfAcknowledgedTasks());
    assertEquals(0, pending.getOperatorStates().size());
    assertFalse(pending.isDisposed());
    assertFalse(pending.areTasksFullyAcknowledged());

    // Both tasks must have received exactly one trigger message for this checkpoint.
    for (ExecutionVertex task : Arrays.asList(firstTask, secondTask)) {
        final CheckpointCoordinatorTestingUtils.TriggeredCheckpoint triggered =
                recordingGateway.getOnlyTriggeredCheckpoint(
                        task.getCurrentExecutionAttempt().getAttemptId());
        assertEquals(pendingId, triggered.checkpointId);
        assertEquals(pending.getCheckpointTimestamp(), triggered.timestamp);
        assertEquals(CheckpointOptions.forCheckpointWithDefaultLocation(), triggered.checkpointOptions);
    }

    // First acknowledgement, from the second task.
    coordinator.receiveAcknowledgeMessage(
            new AcknowledgeCheckpoint(executionGraph.getJobID(), secondAttempt, pendingId),
            "Unknown location");
    assertEquals(1, pending.getNumberOfAcknowledgedTasks());
    assertEquals(1, pending.getNumberOfNonAcknowledgedTasks());
    assertFalse(pending.isDisposed());
    assertFalse(pending.areTasksFullyAcknowledged());

    // A duplicate acknowledgement from the same task must not change anything.
    coordinator.receiveAcknowledgeMessage(
            new AcknowledgeCheckpoint(executionGraph.getJobID(), secondAttempt, pendingId),
            "Unknown location");
    assertFalse(pending.isDisposed());
    assertFalse(pending.areTasksFullyAcknowledged());

    // The first task declines: the checkpoint must be cancelled and disposed.
    coordinator.receiveDeclineMessage(
            new DeclineCheckpoint(executionGraph.getJobID(), firstAttempt, pendingId, declineCause),
            TASK_MANAGER_LOCATION_INFO);
    assertTrue(pending.isDisposed());

    // The timeout canceller has been removed along with the checkpoint.
    assertEquals(0, manuallyTriggeredScheduledExecutor.getActiveScheduledTasks().size());

    // No checkpoint is pending afterwards and none has completed.
    assertEquals(0, coordinator.getNumberOfPendingCheckpoints());
    assertEquals(0, coordinator.getNumberOfRetainedSuccessfulCheckpoints());

    // Late declines, from the same task and from the other one, must be ignored.
    coordinator.receiveDeclineMessage(
            new DeclineCheckpoint(executionGraph.getJobID(), firstAttempt, pendingId, declineCause),
            TASK_MANAGER_LOCATION_INFO);
    coordinator.receiveDeclineMessage(
            new DeclineCheckpoint(executionGraph.getJobID(), secondAttempt, pendingId, declineCause),
            TASK_MANAGER_LOCATION_INFO);
    assertTrue(pending.isDisposed());

    // The fail-job callback was invoked exactly once over the whole sequence.
    assertEquals(1, jobFailureCallback.getInvokeCounter());

    coordinator.shutdown();
}
Aggregations