use of org.apache.flink.runtime.executiongraph.ExecutionAttemptID in project flink by apache.
the class CheckpointCoordinatorTest method testCheckpointTimeoutIsolated.
/**
 * Triggers a single checkpoint, acknowledges it for only the first vertex and then runs the
 * scheduled tasks of the manually triggered executor. Afterwards the pending checkpoint must be
 * disposed, nothing may be pending or retained, the state that was already acknowledged must
 * have been discarded exactly once, and no completion notification may have reached any task.
 */
@Test
public void testCheckpointTimeoutIsolated() throws Exception {
    final JobVertexID ackingVertexId = new JobVertexID();
    final JobVertexID otherVertexId = new JobVertexID();

    final CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway recorder =
            new CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway();
    final ExecutionGraph executionGraph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .addJobVertex(ackingVertexId)
                    .addJobVertex(otherVertexId, false)
                    .setTaskManagerGateway(recorder)
                    .build();

    final ExecutionVertex ackingVertex =
            executionGraph.getJobVertex(ackingVertexId).getTaskVertices()[0];
    final ExecutionVertex otherVertex =
            executionGraph.getJobVertex(otherVertexId).getTaskVertices()[0];
    final ExecutionAttemptID ackingAttempt =
            ackingVertex.getCurrentExecutionAttempt().getAttemptId();

    // coordinator driven by the manual executor, retaining at most two completed checkpoints
    final CheckpointCoordinator coordinator =
            new CheckpointCoordinatorBuilder()
                    .setExecutionGraph(executionGraph)
                    .setCompletedCheckpointStore(new StandaloneCompletedCheckpointStore(2))
                    .setTimer(manuallyTriggeredScheduledExecutor)
                    .build();

    // start one checkpoint; it stays pending because it will only be partially acknowledged
    final CompletableFuture<CompletedCheckpoint> triggerResult =
            coordinator.triggerCheckpoint(false);
    manuallyTriggeredScheduledExecutor.triggerAll();
    FutureUtils.throwIfCompletedExceptionally(triggerResult);
    assertEquals(1, coordinator.getNumberOfPendingCheckpoints());

    final PendingCheckpoint pending =
            coordinator.getPendingCheckpoints().values().iterator().next();
    assertFalse(pending.isDisposed());

    // acknowledge from the first vertex only, carrying a mocked subtask state we can verify on
    final OperatorID operatorId =
            ackingVertex.getJobVertex().getOperatorIDs().get(0).getGeneratedOperatorID();
    final TaskStateSnapshot snapshot = spy(new TaskStateSnapshot());
    final OperatorSubtaskState subtaskState = mock(OperatorSubtaskState.class);
    snapshot.putSubtaskStateByOperatorID(operatorId, subtaskState);
    final AcknowledgeCheckpoint partialAck =
            new AcknowledgeCheckpoint(
                    executionGraph.getJobID(),
                    ackingAttempt,
                    pending.getCheckpointId(),
                    new CheckpointMetrics(),
                    snapshot);
    coordinator.receiveAcknowledgeMessage(partialAck, TASK_MANAGER_LOCATION_INFO);

    // running the scheduled tasks is what cancels the still-incomplete checkpoint
    manuallyTriggeredScheduledExecutor.triggerScheduledTasks();
    assertTrue("Checkpoint was not canceled by the timeout", pending.isDisposed());
    assertEquals(0, coordinator.getNumberOfPendingCheckpoints());
    assertEquals(0, coordinator.getNumberOfRetainedSuccessfulCheckpoints());

    // the state handed in with the partial acknowledgement must have been thrown away
    verify(subtaskState, times(1)).discardState();

    // neither task may have been told that a checkpoint completed
    final ExecutionVertex[] allVertices = {ackingVertex, otherVertex};
    for (int i = 0; i < allVertices.length; i++) {
        final ExecutionAttemptID attempt =
                allVertices[i].getCurrentExecutionAttempt().getAttemptId();
        assertEquals(0, recorder.getNotifiedCompletedCheckpoints(attempt).size());
    }

    coordinator.shutdown();
}
use of org.apache.flink.runtime.executiongraph.ExecutionAttemptID in project flink by apache.
the class CheckpointCoordinatorTest method testMaxConcurrentAttempts.
/**
 * Checks that the periodic scheduler does not exceed the configured number of concurrent
 * checkpoints.
 *
 * <p>The scheduler is fired {@code maxConcurrentAttempts} times, after which exactly that many
 * trigger messages must have been recorded for the single task. Acknowledging checkpoint
 * {@code 1} must allow exactly one further trigger; another scheduler round without any
 * acknowledgement must not add a trigger.
 *
 * @param maxConcurrentAttempts the value passed to {@code setMaxConcurrentCheckpoints}; also
 *     the number of scheduler rounds fired before the first assertion
 * @throws AssertionError if an assertion fails, or if any checked or unchecked exception is
 *     thrown while running the scenario (the exception is attached as the cause)
 */
private void testMaxConcurrentAttempts(int maxConcurrentAttempts) {
    try {
        final JobVertexID jobVertexID1 = new JobVertexID();

        final CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway gateway =
                new CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway();
        final ExecutionGraph graph =
                new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                        .addJobVertex(jobVertexID1)
                        .setTaskManagerGateway(gateway)
                        .build();

        final ExecutionVertex vertex1 = graph.getJobVertex(jobVertexID1).getTaskVertices()[0];
        final ExecutionAttemptID attemptID1 =
                vertex1.getCurrentExecutionAttempt().getAttemptId();

        final CheckpointCoordinatorConfiguration chkConfig =
                new CheckpointCoordinatorConfiguration.CheckpointCoordinatorConfigurationBuilder()
                        // periodic interval is 10 ms
                        .setCheckpointInterval(10)
                        // timeout is very long (200 s)
                        .setCheckpointTimeout(200000)
                        // no extra delay
                        .setMinPauseBetweenCheckpoints(0L)
                        .setMaxConcurrentCheckpoints(maxConcurrentAttempts)
                        .build();

        final CheckpointCoordinator checkpointCoordinator =
                new CheckpointCoordinatorBuilder()
                        .setExecutionGraph(graph)
                        .setCheckpointCoordinatorConfiguration(chkConfig)
                        .setCompletedCheckpointStore(new StandaloneCompletedCheckpointStore(2))
                        .setTimer(manuallyTriggeredScheduledExecutor)
                        .build();

        checkpointCoordinator.startCheckpointScheduler();

        // fire the periodic trigger once per allowed concurrent checkpoint
        for (int i = 0; i < maxConcurrentAttempts; i++) {
            manuallyTriggeredScheduledExecutor.triggerPeriodicScheduledTasks();
            manuallyTriggeredScheduledExecutor.triggerAll();
        }

        assertEquals(maxConcurrentAttempts, gateway.getTriggeredCheckpoints(attemptID1).size());
        assertEquals(0, gateway.getNotifiedCompletedCheckpoints(attemptID1).size());

        // now, once we acknowledge one checkpoint, it should trigger the next one
        // NOTE(review): the literal 1L assumes the first checkpoint gets id 1 — confirm against
        // the checkpoint id counter the builder installs
        checkpointCoordinator.receiveAcknowledgeMessage(
                new AcknowledgeCheckpoint(graph.getJobID(), attemptID1, 1L),
                TASK_MANAGER_LOCATION_INFO);

        final Collection<ScheduledFuture<?>> periodicScheduledTasks =
                manuallyTriggeredScheduledExecutor.getActivePeriodicScheduledTask();
        assertEquals(1, periodicScheduledTasks.size());

        manuallyTriggeredScheduledExecutor.triggerPeriodicScheduledTasks();
        manuallyTriggeredScheduledExecutor.triggerAll();
        assertEquals(
                maxConcurrentAttempts + 1, gateway.getTriggeredCheckpoints(attemptID1).size());

        // no further checkpoints should happen
        manuallyTriggeredScheduledExecutor.triggerPeriodicScheduledTasks();
        manuallyTriggeredScheduledExecutor.triggerAll();
        assertEquals(
                maxConcurrentAttempts + 1, gateway.getTriggeredCheckpoints(attemptID1).size());

        checkpointCoordinator.shutdown();
    } catch (Exception e) {
        // Fail with the original exception attached as the cause: reporting only
        // e.getMessage() loses the stack trace and yields a null message for exceptions
        // created without one.
        throw new AssertionError(
                "Unexpected exception with maxConcurrentAttempts="
                        + maxConcurrentAttempts
                        + ": "
                        + e,
                e);
    }
}
use of org.apache.flink.runtime.executiongraph.ExecutionAttemptID in project flink by apache.
the class CheckpointCoordinatorTest method testHandleMessagesForNonExistingCheckpoints.
/**
 * Sends acknowledge messages that do not match the pending checkpoint (foreign job id, a
 * different checkpoint id, unknown execution attempt) and passes as long as none of the calls
 * throws.
 */
@Test
public void testHandleMessagesForNonExistingCheckpoints() throws Exception {
    // build a two-vertex graph and start one checkpoint on it
    final JobVertexID firstVertexId = new JobVertexID();
    final JobVertexID secondVertexId = new JobVertexID();

    final CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway recorder =
            new CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway();
    final ExecutionGraph executionGraph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .addJobVertex(firstVertexId)
                    .addJobVertex(secondVertexId, false)
                    .setTaskManagerGateway(recorder)
                    .build();

    final ExecutionVertex firstVertex =
            executionGraph.getJobVertex(firstVertexId).getTaskVertices()[0];
    final ExecutionAttemptID knownAttempt =
            firstVertex.getCurrentExecutionAttempt().getAttemptId();

    final CheckpointCoordinator coordinator =
            new CheckpointCoordinatorBuilder()
                    .setExecutionGraph(executionGraph)
                    .setCompletedCheckpointStore(new StandaloneCompletedCheckpointStore(2))
                    .setTimer(manuallyTriggeredScheduledExecutor)
                    .build();

    final CompletableFuture<CompletedCheckpoint> triggerResult =
            coordinator.triggerCheckpoint(false);
    manuallyTriggeredScheduledExecutor.triggerAll();
    FutureUtils.throwIfCompletedExceptionally(triggerResult);

    final long pendingId = coordinator.getPendingCheckpoints().keySet().iterator().next();

    // The messages below belong neither to this job nor to any vertex that still has to
    // acknowledge. None of them may cause an exception.

    // acknowledgement carrying a job id the coordinator does not know
    final AcknowledgeCheckpoint wrongJobAck =
            new AcknowledgeCheckpoint(new JobID(), knownAttempt, pendingId);
    coordinator.receiveAcknowledgeMessage(wrongJobAck, TASK_MANAGER_LOCATION_INFO);

    // acknowledgement for a checkpoint id that is expected to be unknown
    final AcknowledgeCheckpoint unknownCheckpointAck =
            new AcknowledgeCheckpoint(executionGraph.getJobID(), knownAttempt, 1L);
    coordinator.receiveAcknowledgeMessage(unknownCheckpointAck, TASK_MANAGER_LOCATION_INFO);

    // acknowledgement from an execution attempt that is not part of the graph
    final AcknowledgeCheckpoint unknownAttemptAck =
            new AcknowledgeCheckpoint(
                    executionGraph.getJobID(), new ExecutionAttemptID(), pendingId);
    coordinator.receiveAcknowledgeMessage(unknownAttemptAck, TASK_MANAGER_LOCATION_INFO);

    coordinator.shutdown();
}
use of org.apache.flink.runtime.executiongraph.ExecutionAttemptID in project flink by apache.
the class CheckpointCoordinatorTest method testMultipleConcurrentCheckpoints.
/**
 * Runs two checkpoints concurrently over three vertices, interleaves their acknowledgements
 * and checks that each checkpoint completes (and is notified to all three tasks) as soon as
 * its last acknowledgement arrives, ending with both checkpoints retained in trigger order.
 */
@Test
public void testMultipleConcurrentCheckpoints() throws Exception {
    final JobVertexID firstVertexId = new JobVertexID();
    final JobVertexID secondVertexId = new JobVertexID();
    final JobVertexID thirdVertexId = new JobVertexID();

    final CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway recorder =
            new CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway();
    final ExecutionGraph executionGraph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .addJobVertex(firstVertexId)
                    .addJobVertex(secondVertexId)
                    .addJobVertex(thirdVertexId, false)
                    .setTaskManagerGateway(recorder)
                    .build();

    final ExecutionVertex firstVertex =
            executionGraph.getJobVertex(firstVertexId).getTaskVertices()[0];
    final ExecutionVertex secondVertex =
            executionGraph.getJobVertex(secondVertexId).getTaskVertices()[0];
    final ExecutionVertex thirdVertex =
            executionGraph.getJobVertex(thirdVertexId).getTaskVertices()[0];

    final ExecutionAttemptID firstAttempt =
            firstVertex.getCurrentExecutionAttempt().getAttemptId();
    final ExecutionAttemptID secondAttempt =
            secondVertex.getCurrentExecutionAttempt().getAttemptId();
    final ExecutionAttemptID thirdAttempt =
            thirdVertex.getCurrentExecutionAttempt().getAttemptId();

    // coordinator without a practical limit on concurrent checkpoints; nothing exists yet
    final CheckpointCoordinator coordinator =
            new CheckpointCoordinatorBuilder()
                    .setExecutionGraph(executionGraph)
                    .setCheckpointCoordinatorConfiguration(
                            CheckpointCoordinatorConfiguration.builder()
                                    .setMaxConcurrentCheckpoints(Integer.MAX_VALUE)
                                    .build())
                    .setCompletedCheckpointStore(new StandaloneCompletedCheckpointStore(2))
                    .setTimer(manuallyTriggeredScheduledExecutor)
                    .build();
    assertEquals(0, coordinator.getNumberOfPendingCheckpoints());
    assertEquals(0, coordinator.getNumberOfRetainedSuccessfulCheckpoints());

    // ---- first checkpoint ----
    final CompletableFuture<CompletedCheckpoint> firstTrigger =
            coordinator.triggerCheckpoint(false);
    manuallyTriggeredScheduledExecutor.triggerAll();
    FutureUtils.throwIfCompletedExceptionally(firstTrigger);
    assertEquals(1, coordinator.getNumberOfPendingCheckpoints());
    assertEquals(0, coordinator.getNumberOfRetainedSuccessfulCheckpoints());

    final PendingCheckpoint firstPending =
            coordinator.getPendingCheckpoints().values().iterator().next();
    final long firstId = firstPending.getCheckpointId();

    // Only the first two vertices are checked for a trigger message.
    // NOTE(review): the third vertex was added with the extra `false` flag, presumably
    // meaning it is not triggered directly — confirm against CheckpointExecutionGraphBuilder.
    final List<ExecutionVertex> triggerCheckedVertices =
            Arrays.asList(firstVertex, secondVertex);
    for (ExecutionVertex current : triggerCheckedVertices) {
        final ExecutionAttemptID attempt = current.getCurrentExecutionAttempt().getAttemptId();
        assertEquals(firstId, recorder.getOnlyTriggeredCheckpoint(attempt).checkpointId);
    }

    // one of the three tasks acknowledges the first checkpoint
    coordinator.receiveAcknowledgeMessage(
            new AcknowledgeCheckpoint(executionGraph.getJobID(), secondAttempt, firstId),
            TASK_MANAGER_LOCATION_INFO);

    // ---- second checkpoint, started while the first one is still pending ----
    recorder.resetCount();
    final CompletableFuture<CompletedCheckpoint> secondTrigger =
            coordinator.triggerCheckpoint(false);
    manuallyTriggeredScheduledExecutor.triggerAll();
    FutureUtils.throwIfCompletedExceptionally(secondTrigger);
    assertEquals(2, coordinator.getNumberOfPendingCheckpoints());
    assertEquals(0, coordinator.getNumberOfRetainedSuccessfulCheckpoints());

    // of the two pending checkpoints, pick the one that is not the first
    final Iterator<PendingCheckpoint> pendingIterator =
            coordinator.getPendingCheckpoints().values().iterator();
    final PendingCheckpoint candidateA = pendingIterator.next();
    final PendingCheckpoint candidateB = pendingIterator.next();
    final PendingCheckpoint secondPending;
    if (firstPending == candidateA) {
        secondPending = candidateB;
    } else {
        secondPending = candidateA;
    }
    final long secondId = secondPending.getCheckpointId();

    // after the reset, the only recorded trigger must be the second checkpoint
    for (ExecutionVertex current : triggerCheckedVertices) {
        final ExecutionAttemptID attempt = current.getCurrentExecutionAttempt().getAttemptId();
        assertEquals(secondId, recorder.getOnlyTriggeredCheckpoint(attempt).checkpointId);
    }

    // interleave: the two outstanding acks of the first checkpoint and two acks of the second
    coordinator.receiveAcknowledgeMessage(
            new AcknowledgeCheckpoint(executionGraph.getJobID(), thirdAttempt, firstId),
            TASK_MANAGER_LOCATION_INFO);
    coordinator.receiveAcknowledgeMessage(
            new AcknowledgeCheckpoint(executionGraph.getJobID(), firstAttempt, secondId),
            TASK_MANAGER_LOCATION_INFO);
    coordinator.receiveAcknowledgeMessage(
            new AcknowledgeCheckpoint(executionGraph.getJobID(), firstAttempt, firstId),
            TASK_MANAGER_LOCATION_INFO);
    coordinator.receiveAcknowledgeMessage(
            new AcknowledgeCheckpoint(executionGraph.getJobID(), secondAttempt, secondId),
            TASK_MANAGER_LOCATION_INFO);

    // the first checkpoint is complete, the second is still waiting for one ack
    assertEquals(1, coordinator.getNumberOfPendingCheckpoints());
    assertEquals(1, coordinator.getNumberOfRetainedSuccessfulCheckpoints());
    assertTrue(firstPending.isDisposed());

    // every task received exactly the completion notification of the first checkpoint
    final List<ExecutionVertex> allVertices =
            Arrays.asList(firstVertex, secondVertex, thirdVertex);
    for (ExecutionVertex current : allVertices) {
        final ExecutionAttemptID attempt = current.getCurrentExecutionAttempt().getAttemptId();
        assertEquals(
                firstId, recorder.getOnlyNotifiedCompletedCheckpoint(attempt).checkpointId);
    }

    // deliver the missing acknowledgement of the second checkpoint
    recorder.resetCount();
    coordinator.receiveAcknowledgeMessage(
            new AcknowledgeCheckpoint(executionGraph.getJobID(), thirdAttempt, secondId),
            TASK_MANAGER_LOCATION_INFO);

    // both checkpoints are complete now
    assertEquals(0, coordinator.getNumberOfPendingCheckpoints());
    assertEquals(2, coordinator.getNumberOfRetainedSuccessfulCheckpoints());
    assertTrue(secondPending.isDisposed());

    // every task received exactly the completion notification of the second checkpoint
    for (ExecutionVertex current : allVertices) {
        final ExecutionAttemptID attempt = current.getCurrentExecutionAttempt().getAttemptId();
        assertEquals(
                secondId, recorder.getOnlyNotifiedCompletedCheckpoint(attempt).checkpointId);
    }

    // inspect the retained checkpoints: first triggered comes first
    final List<CompletedCheckpoint> retained = coordinator.getSuccessfulCheckpoints();

    final CompletedCheckpoint retainedFirst = retained.get(0);
    assertEquals(firstId, retainedFirst.getCheckpointID());
    assertEquals(executionGraph.getJobID(), retainedFirst.getJobId());
    assertEquals(3, retainedFirst.getOperatorStates().size());
    assertTrue(
            retainedFirst.getOperatorStates().values().stream()
                    .allMatch(operatorState -> hasNoSubState(operatorState)));

    final CompletedCheckpoint retainedSecond = retained.get(1);
    assertEquals(secondId, retainedSecond.getCheckpointID());
    assertEquals(executionGraph.getJobID(), retainedSecond.getJobId());
    assertEquals(3, retainedSecond.getOperatorStates().size());
    assertTrue(
            retainedSecond.getOperatorStates().values().stream()
                    .allMatch(operatorState -> hasNoSubState(operatorState)));

    coordinator.shutdown();
}
use of org.apache.flink.runtime.executiongraph.ExecutionAttemptID in project flink by apache.
the class CheckpointCoordinatorTest method testTriggerAndDeclineCheckpointComplex.
/**
 * Starts two checkpoints and then lets one task decline the first of them. The first checkpoint
 * must be discarded and an abort notification sent to both tasks, while the second checkpoint
 * stays pending untouched and no replacement checkpoint is started. Repeated decline messages
 * for the already discarded checkpoint must have no further effect.
 */
@Test
public void testTriggerAndDeclineCheckpointComplex() throws Exception {
    final JobVertexID firstVertexId = new JobVertexID();
    final JobVertexID secondVertexId = new JobVertexID();

    final CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway recorder =
            new CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway();
    final ExecutionGraph executionGraph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .addJobVertex(firstVertexId)
                    .addJobVertex(secondVertexId)
                    .setTaskManagerGateway(recorder)
                    .build();

    final ExecutionVertex firstVertex =
            executionGraph.getJobVertex(firstVertexId).getTaskVertices()[0];
    final ExecutionVertex secondVertex =
            executionGraph.getJobVertex(secondVertexId).getTaskVertices()[0];
    final ExecutionAttemptID firstAttempt =
            firstVertex.getCurrentExecutionAttempt().getAttemptId();
    final ExecutionAttemptID secondAttempt =
            secondVertex.getCurrentExecutionAttempt().getAttemptId();

    final CheckpointCoordinator coordinator = getCheckpointCoordinator(executionGraph);

    // nothing pending, retained or scheduled before the first trigger
    assertEquals(0, coordinator.getNumberOfPendingCheckpoints());
    assertEquals(0, coordinator.getNumberOfRetainedSuccessfulCheckpoints());
    assertEquals(0, manuallyTriggeredScheduledExecutor.getActiveScheduledTasks().size());

    // first checkpoint
    final CompletableFuture<CompletedCheckpoint> firstTrigger =
            coordinator.triggerCheckpoint(false);
    manuallyTriggeredScheduledExecutor.triggerAll();
    FutureUtils.throwIfCompletedExceptionally(firstTrigger);

    // second checkpoint, started while the first is still pending
    final CompletableFuture<CompletedCheckpoint> secondTrigger =
            coordinator.triggerCheckpoint(false);
    manuallyTriggeredScheduledExecutor.triggerAll();
    FutureUtils.throwIfCompletedExceptionally(secondTrigger);

    // both checkpoints are pending and each contributes one active scheduled task
    assertEquals(2, coordinator.getNumberOfPendingCheckpoints());
    assertEquals(0, coordinator.getNumberOfRetainedSuccessfulCheckpoints());
    assertEquals(2, manuallyTriggeredScheduledExecutor.getActiveScheduledTasks().size());

    final Iterator<Map.Entry<Long, PendingCheckpoint>> pendingEntries =
            coordinator.getPendingCheckpoints().entrySet().iterator();
    final long firstId = pendingEntries.next().getKey();
    final long secondId = pendingEntries.next().getKey();
    final PendingCheckpoint firstPending = coordinator.getPendingCheckpoints().get(firstId);
    final PendingCheckpoint secondPending = coordinator.getPendingCheckpoints().get(secondId);

    // first checkpoint: fresh, nothing acknowledged yet
    assertNotNull(firstPending);
    assertEquals(firstId, firstPending.getCheckpointId());
    assertEquals(executionGraph.getJobID(), firstPending.getJobId());
    assertEquals(2, firstPending.getNumberOfNonAcknowledgedTasks());
    assertEquals(0, firstPending.getNumberOfAcknowledgedTasks());
    assertEquals(0, firstPending.getOperatorStates().size());
    assertFalse(firstPending.isDisposed());
    assertFalse(firstPending.areTasksFullyAcknowledged());

    // second checkpoint: fresh, nothing acknowledged yet
    assertNotNull(secondPending);
    assertEquals(secondId, secondPending.getCheckpointId());
    assertEquals(executionGraph.getJobID(), secondPending.getJobId());
    assertEquals(2, secondPending.getNumberOfNonAcknowledgedTasks());
    assertEquals(0, secondPending.getNumberOfAcknowledgedTasks());
    assertEquals(0, secondPending.getOperatorStates().size());
    assertFalse(secondPending.isDisposed());
    assertFalse(secondPending.areTasksFullyAcknowledged());

    // each task saw both trigger messages, in trigger order
    final List<ExecutionVertex> bothVertices = Arrays.asList(firstVertex, secondVertex);
    for (ExecutionVertex current : bothVertices) {
        final ExecutionAttemptID attempt = current.getCurrentExecutionAttempt().getAttemptId();
        final List<CheckpointCoordinatorTestingUtils.TriggeredCheckpoint> triggers =
                recorder.getTriggeredCheckpoints(attempt);
        assertEquals(2, triggers.size());
        assertEquals(firstId, triggers.get(0).checkpointId);
        assertEquals(secondId, triggers.get(1).checkpointId);
    }

    // a single decline for the first checkpoint is enough to cancel it
    coordinator.receiveDeclineMessage(
            new DeclineCheckpoint(
                    executionGraph.getJobID(),
                    firstAttempt,
                    firstId,
                    new CheckpointException(CHECKPOINT_DECLINED)),
            TASK_MANAGER_LOCATION_INFO);
    for (ExecutionVertex current : bothVertices) {
        final ExecutionAttemptID attempt = current.getCurrentExecutionAttempt().getAttemptId();
        assertEquals(firstId, recorder.getOnlyNotifiedAbortedCheckpoint(attempt).checkpointId);
    }
    assertTrue(firstPending.isDisposed());

    // exactly one checkpoint, and one scheduled task, is left
    assertEquals(1, coordinator.getNumberOfPendingCheckpoints());
    assertEquals(0, coordinator.getNumberOfRetainedSuccessfulCheckpoints());
    assertEquals(1, manuallyTriggeredScheduledExecutor.getActiveScheduledTasks().size());

    // the survivor has to be the second checkpoint from before, still unacknowledged
    final long survivorId =
            coordinator.getPendingCheckpoints().entrySet().iterator().next().getKey();
    final PendingCheckpoint survivor = coordinator.getPendingCheckpoints().get(survivorId);
    assertEquals(secondId, survivorId);
    assertNotNull(survivor);
    assertEquals(survivorId, survivor.getCheckpointId());
    assertEquals(executionGraph.getJobID(), survivor.getJobId());
    assertEquals(2, survivor.getNumberOfNonAcknowledgedTasks());
    assertEquals(0, survivor.getNumberOfAcknowledgedTasks());
    assertEquals(0, survivor.getOperatorStates().size());
    assertFalse(survivor.isDisposed());
    assertFalse(survivor.areTasksFullyAcknowledged());
    assertNotEquals(firstPending.getCheckpointId(), survivor.getCheckpointId());

    // declining the discarded checkpoint again, from the same task and then from the other
    // task, must not change anything
    coordinator.receiveDeclineMessage(
            new DeclineCheckpoint(
                    executionGraph.getJobID(),
                    firstAttempt,
                    firstId,
                    new CheckpointException(CHECKPOINT_DECLINED)),
            TASK_MANAGER_LOCATION_INFO);
    coordinator.receiveDeclineMessage(
            new DeclineCheckpoint(
                    executionGraph.getJobID(),
                    secondAttempt,
                    firstId,
                    new CheckpointException(CHECKPOINT_DECLINED)),
            TASK_MANAGER_LOCATION_INFO);
    assertTrue(firstPending.isDisposed());

    // no additional abort notification was sent
    for (ExecutionVertex current : bothVertices) {
        final ExecutionAttemptID attempt = current.getCurrentExecutionAttempt().getAttemptId();
        assertEquals(1, recorder.getNotifiedAbortedCheckpoints(attempt).size());
    }

    coordinator.shutdown();
}
Aggregations