use of org.apache.flink.runtime.executiongraph.ExecutionAttemptID in project flink by apache.
the class CheckpointCoordinatorTest method testMinCheckpointPause.
/**
 * Verifies that the configured minimum pause between checkpoints is respected: a second,
 * queued checkpoint request must not become pending within half of the pause after the first
 * checkpoint has been acknowledged, but it must become pending eventually.
 */
@Test
public void testMinCheckpointPause() throws Exception {
    // The coordinator's timer runs on its own thread so that a queued request could be
    // triggered before the test thread returns from receiveAcknowledgeMessage.
    ScheduledExecutorService timerThread = Executors.newSingleThreadScheduledExecutor();
    CheckpointCoordinator coordinator = null;
    try {
        int minPauseMillis = 1000;
        JobVertexID vertexId = new JobVertexID();

        ExecutionGraph graph =
                new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                        .addJobVertex(vertexId)
                        .setMainThreadExecutor(
                                ComponentMainThreadExecutorServiceAdapter.forSingleThreadExecutor(
                                        new DirectScheduledExecutorService()))
                        .build();
        ExecutionVertex onlyVertex = graph.getJobVertex(vertexId).getTaskVertices()[0];
        ExecutionAttemptID attemptId = onlyVertex.getCurrentExecutionAttempt().getAttemptId();

        // Interval and minimum pause are both set to the same value; at most one checkpoint
        // may be in flight and checkpoints never time out.
        CheckpointCoordinatorConfiguration configuration =
                CheckpointCoordinatorConfiguration.builder()
                        .setCheckpointInterval(minPauseMillis)
                        .setCheckpointTimeout(Long.MAX_VALUE)
                        .setMaxConcurrentCheckpoints(1)
                        .setMinPauseBetweenCheckpoints(minPauseMillis)
                        .build();
        coordinator =
                new CheckpointCoordinatorBuilder()
                        .setTimer(new ScheduledExecutorServiceAdapter(timerThread))
                        .setCheckpointCoordinatorConfiguration(configuration)
                        .setExecutionGraph(graph)
                        .build();
        coordinator.startCheckpointScheduler();

        // 1st request: trigger, execute, and later complete by receiveAcknowledgeMessage
        coordinator.triggerCheckpoint(true);
        // 2nd request: enqueue and later see if it gets executed in the middle of
        // receiveAcknowledgeMessage
        coordinator.triggerCheckpoint(true);

        // wait for at least 1 request to be fully processed
        while (coordinator.getNumberOfPendingCheckpoints() == 0) {
            Thread.sleep(10);
        }

        AcknowledgeCheckpoint firstAck =
                new AcknowledgeCheckpoint(graph.getJobID(), attemptId, 1L);
        coordinator.receiveAcknowledgeMessage(firstAck, TASK_MANAGER_LOCATION_INFO);

        // Half-way through the pause nothing may be pending yet.
        Thread.sleep(minPauseMillis / 2);
        assertEquals(0, coordinator.getNumberOfPendingCheckpoints());

        // make sure that the 2nd request is eventually processed
        while (coordinator.getNumberOfPendingCheckpoints() == 0) {
            Thread.sleep(1);
        }
    } finally {
        if (coordinator != null) {
            coordinator.shutdown();
        }
        timerThread.shutdownNow();
    }
}
use of org.apache.flink.runtime.executiongraph.ExecutionAttemptID in project flink by apache.
the class CheckpointCoordinatorTest method testTriggerAndDeclineCheckpointSimple.
/**
 * Triggers a checkpoint, acknowledges it from one task and then declines it from the other
 * task.
 *
 * <p>The test asserts that the declined checkpoint is disposed, that its scheduled canceller is
 * removed from the timer, that no checkpoint is pending or retained afterwards, that further
 * decline messages for the same checkpoint change nothing, and that the fail-job callback's
 * invoke counter is exactly one at the end.
 *
 * @param checkpointFailureReason the reason wrapped into the {@link CheckpointException} that
 *     is carried by every decline message sent in this test
 */
private void testTriggerAndDeclineCheckpointSimple(
        CheckpointFailureReason checkpointFailureReason) throws Exception {
    final CheckpointException declineCause = new CheckpointException(checkpointFailureReason);

    final JobVertexID jobVertexID1 = new JobVertexID();
    final JobVertexID jobVertexID2 = new JobVertexID();

    final CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway
            recordingGateway =
                    new CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway();
    final ExecutionGraph graph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .addJobVertex(jobVertexID1)
                    .addJobVertex(jobVertexID2)
                    .setTaskManagerGateway(recordingGateway)
                    .build();

    final ExecutionVertex vertex1 = graph.getJobVertex(jobVertexID1).getTaskVertices()[0];
    final ExecutionVertex vertex2 = graph.getJobVertex(jobVertexID2).getTaskVertices()[0];
    final ExecutionAttemptID attemptID1 = vertex1.getCurrentExecutionAttempt().getAttemptId();
    final ExecutionAttemptID attemptID2 = vertex2.getCurrentExecutionAttempt().getAttemptId();

    final TestFailJobCallback failJobCallback = new TestFailJobCallback();

    // build the coordinator and check that it starts out empty
    final CheckpointCoordinatorConfiguration configuration =
            CheckpointCoordinatorConfiguration.builder()
                    .setAlignedCheckpointTimeout(Long.MAX_VALUE)
                    .setMaxConcurrentCheckpoints(Integer.MAX_VALUE)
                    .build();
    final CheckpointCoordinator coordinator =
            new CheckpointCoordinatorBuilder()
                    .setExecutionGraph(graph)
                    .setCheckpointCoordinatorConfiguration(configuration)
                    .setTimer(manuallyTriggeredScheduledExecutor)
                    .setCheckpointFailureManager(
                            new CheckpointFailureManager(0, failJobCallback))
                    .build();
    assertEquals(0, coordinator.getNumberOfPendingCheckpoints());
    assertEquals(0, coordinator.getNumberOfRetainedSuccessfulCheckpoints());

    // the first trigger must go through without an exceptional result
    final CompletableFuture<CompletedCheckpoint> checkpointFuture =
            coordinator.triggerCheckpoint(false);
    manuallyTriggeredScheduledExecutor.triggerAll();
    FutureUtils.throwIfCompletedExceptionally(checkpointFuture);

    // exactly one checkpoint is pending now, none is retained
    assertEquals(1, coordinator.getNumberOfPendingCheckpoints());
    assertEquals(0, coordinator.getNumberOfRetainedSuccessfulCheckpoints());

    // one task is scheduled on the timer: the canceller that fires after the timeout
    assertEquals(1, manuallyTriggeredScheduledExecutor.getActiveScheduledTasks().size());

    final long checkpointId = coordinator.getPendingCheckpoints().keySet().iterator().next();
    final PendingCheckpoint pending = coordinator.getPendingCheckpoints().get(checkpointId);
    assertNotNull(pending);
    assertEquals(checkpointId, pending.getCheckpointId());
    assertEquals(graph.getJobID(), pending.getJobId());
    assertEquals(2, pending.getNumberOfNonAcknowledgedTasks());
    assertEquals(0, pending.getNumberOfAcknowledgedTasks());
    assertEquals(0, pending.getOperatorStates().size());
    assertFalse(pending.isDisposed());
    assertFalse(pending.areTasksFullyAcknowledged());

    // both vertices must have been sent the trigger message for this checkpoint
    for (ExecutionVertex vertex : new ExecutionVertex[] {vertex1, vertex2}) {
        final ExecutionAttemptID currentAttempt =
                vertex.getCurrentExecutionAttempt().getAttemptId();
        final CheckpointCoordinatorTestingUtils.TriggeredCheckpoint triggered =
                recordingGateway.getOnlyTriggeredCheckpoint(currentAttempt);
        assertEquals(checkpointId, triggered.checkpointId);
        assertEquals(pending.getCheckpointTimestamp(), triggered.timestamp);
        assertEquals(
                CheckpointOptions.forCheckpointWithDefaultLocation(),
                triggered.checkpointOptions);
    }

    // task 2 acknowledges
    final AcknowledgeCheckpoint firstAck =
            new AcknowledgeCheckpoint(graph.getJobID(), attemptID2, checkpointId);
    coordinator.receiveAcknowledgeMessage(firstAck, "Unknown location");
    assertEquals(1, pending.getNumberOfAcknowledgedTasks());
    assertEquals(1, pending.getNumberOfNonAcknowledgedTasks());
    assertFalse(pending.isDisposed());
    assertFalse(pending.areTasksFullyAcknowledged());

    // task 2 acknowledges a second time (should not matter)
    final AcknowledgeCheckpoint repeatedAck =
            new AcknowledgeCheckpoint(graph.getJobID(), attemptID2, checkpointId);
    coordinator.receiveAcknowledgeMessage(repeatedAck, "Unknown location");
    assertFalse(pending.isDisposed());
    assertFalse(pending.areTasksFullyAcknowledged());

    // task 1 declines, which must dispose the pending checkpoint
    final DeclineCheckpoint firstDecline =
            new DeclineCheckpoint(graph.getJobID(), attemptID1, checkpointId, declineCause);
    coordinator.receiveDeclineMessage(firstDecline, TASK_MANAGER_LOCATION_INFO);
    assertTrue(pending.isDisposed());

    // the canceller is gone as well
    assertEquals(0, manuallyTriggeredScheduledExecutor.getActiveScheduledTasks().size());

    // nothing is pending and nothing was retained
    assertEquals(0, coordinator.getNumberOfPendingCheckpoints());
    assertEquals(0, coordinator.getNumberOfRetainedSuccessfulCheckpoints());

    // declining again from the same task: nothing should happen
    final DeclineCheckpoint repeatedDecline =
            new DeclineCheckpoint(graph.getJobID(), attemptID1, checkpointId, declineCause);
    coordinator.receiveDeclineMessage(repeatedDecline, TASK_MANAGER_LOCATION_INFO);
    // declining from the other task: nothing should happen either
    final DeclineCheckpoint otherTaskDecline =
            new DeclineCheckpoint(graph.getJobID(), attemptID2, checkpointId, declineCause);
    coordinator.receiveDeclineMessage(otherTaskDecline, TASK_MANAGER_LOCATION_INFO);
    assertTrue(pending.isDisposed());
    assertEquals(1, failJobCallback.getInvokeCounter());

    coordinator.shutdown();
}
use of org.apache.flink.runtime.executiongraph.ExecutionAttemptID in project flink by apache.
the class CheckpointCoordinatorTest method testTriggerAndConfirmSimpleSavepoint.
/**
 * Triggers a canonical-format savepoint and acknowledges it from both tasks.
 *
 * <p>The test asserts that the savepoint future only completes after the second task has
 * acknowledged, that the pending checkpoint is disposed afterwards, that the savepoint is not
 * counted as a retained successful checkpoint, and that no task was notified about a completed
 * checkpoint.
 */
@Test
public void testTriggerAndConfirmSimpleSavepoint() throws Exception {
    final JobVertexID jobVertexID1 = new JobVertexID();
    final JobVertexID jobVertexID2 = new JobVertexID();

    final CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway
            recordingGateway =
                    new CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway();
    final ExecutionGraph graph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .addJobVertex(jobVertexID1)
                    .addJobVertex(jobVertexID2)
                    .setTaskManagerGateway(recordingGateway)
                    .build();

    final ExecutionVertex vertex1 = graph.getJobVertex(jobVertexID1).getTaskVertices()[0];
    final ExecutionVertex vertex2 = graph.getJobVertex(jobVertexID2).getTaskVertices()[0];
    final ExecutionAttemptID attemptID1 = vertex1.getCurrentExecutionAttempt().getAttemptId();
    final ExecutionAttemptID attemptID2 = vertex2.getCurrentExecutionAttempt().getAttemptId();

    // a fresh coordinator has neither pending nor retained checkpoints
    final CheckpointCoordinator coordinator = getCheckpointCoordinator(graph);
    assertEquals(0, coordinator.getNumberOfPendingCheckpoints());
    assertEquals(0, coordinator.getNumberOfRetainedSuccessfulCheckpoints());

    // trigger the savepoint; it must stay incomplete until the tasks acknowledge
    final String savepointDir = tmpFolder.newFolder().getAbsolutePath();
    final CompletableFuture<CompletedCheckpoint> savepointFuture =
            coordinator.triggerSavepoint(savepointDir, SavepointFormatType.CANONICAL);
    manuallyTriggeredScheduledExecutor.triggerAll();
    assertFalse(savepointFuture.isDone());

    // the savepoint shows up as the single pending checkpoint
    assertEquals(1, coordinator.getNumberOfPendingCheckpoints());
    final long savepointId = coordinator.getPendingCheckpoints().keySet().iterator().next();
    final PendingCheckpoint pending = coordinator.getPendingCheckpoints().get(savepointId);
    assertNotNull(pending);
    assertEquals(savepointId, pending.getCheckpointId());
    assertEquals(graph.getJobID(), pending.getJobId());
    assertEquals(2, pending.getNumberOfNonAcknowledgedTasks());
    assertEquals(0, pending.getNumberOfAcknowledgedTasks());
    assertEquals(0, pending.getOperatorStates().size());
    assertFalse(pending.isDisposed());
    assertFalse(pending.areTasksFullyAcknowledged());
    assertFalse(pending.canBeSubsumed());

    // one mocked subtask state per operator, wrapped into one snapshot per task
    final OperatorID opID1 = OperatorID.fromJobVertexID(vertex1.getJobvertexId());
    final OperatorID opID2 = OperatorID.fromJobVertexID(vertex2.getJobvertexId());
    final OperatorSubtaskState mockedState1 = mock(OperatorSubtaskState.class);
    final OperatorSubtaskState mockedState2 = mock(OperatorSubtaskState.class);
    final TaskStateSnapshot snapshotOfTask1 =
            new TaskStateSnapshot(singletonMap(opID1, mockedState1));
    final TaskStateSnapshot snapshotOfTask2 =
            new TaskStateSnapshot(singletonMap(opID2, mockedState2));

    // task 2 acknowledges first
    final AcknowledgeCheckpoint ackFromTask2 =
            new AcknowledgeCheckpoint(
                    graph.getJobID(),
                    attemptID2,
                    savepointId,
                    new CheckpointMetrics(),
                    snapshotOfTask2);
    coordinator.receiveAcknowledgeMessage(ackFromTask2, TASK_MANAGER_LOCATION_INFO);
    assertEquals(1, pending.getNumberOfAcknowledgedTasks());
    assertEquals(1, pending.getNumberOfNonAcknowledgedTasks());
    assertFalse(pending.isDisposed());
    assertFalse(pending.areTasksFullyAcknowledged());
    assertFalse(savepointFuture.isDone());

    // the very same acknowledgement arrives again (should not matter)
    coordinator.receiveAcknowledgeMessage(ackFromTask2, TASK_MANAGER_LOCATION_INFO);
    assertFalse(pending.isDisposed());
    assertFalse(pending.areTasksFullyAcknowledged());
    assertFalse(savepointFuture.isDone());

    // task 1 acknowledges, which completes the savepoint
    final AcknowledgeCheckpoint ackFromTask1 =
            new AcknowledgeCheckpoint(
                    graph.getJobID(),
                    attemptID1,
                    savepointId,
                    new CheckpointMetrics(),
                    snapshotOfTask1);
    coordinator.receiveAcknowledgeMessage(ackFromTask1, TASK_MANAGER_LOCATION_INFO);

    // the pending object is disposed once it has been converted into a completed one
    assertTrue(pending.isDisposed());
    final CompletedCheckpoint completed = savepointFuture.get();
    assertNotNull(completed);

    // savepoints are not registered as retained checkpoints
    assertEquals(0, coordinator.getNumberOfRetainedSuccessfulCheckpoints());
    assertEquals(0, coordinator.getNumberOfPendingCheckpoints());

    // every task was triggered for this savepoint, none was notified about a completion
    for (ExecutionVertex vertex : new ExecutionVertex[] {vertex1, vertex2}) {
        final ExecutionAttemptID currentAttempt =
                vertex.getCurrentExecutionAttempt().getAttemptId();
        assertEquals(
                savepointId,
                recordingGateway.getOnlyTriggeredCheckpoint(currentAttempt).checkpointId);
        assertThat(recordingGateway.getNotifiedCompletedCheckpoints(currentAttempt)).isEmpty();
    }

    assertEquals(graph.getJobID(), completed.getJobId());
    assertEquals(pending.getCheckpointId(), completed.getCheckpointID());
    assertEquals(2, completed.getOperatorStates().size());

    coordinator.shutdown();
}
use of org.apache.flink.runtime.executiongraph.ExecutionAttemptID in project flink by apache.
the class CheckpointCoordinatorTest method jobFailsIfInFlightSynchronousSavepointIsDiscarded.
/**
 * Verifies that the job is failed when an in-flight synchronous savepoint is declined.
 *
 * <p>The test asserts that the pending savepoint is disposed, that the savepoint future
 * completes exceptionally with a {@link CheckpointException} whose cause chain carries the
 * original root-cause message, and that the fail-job callback is invoked exactly once with an
 * equivalent exception.
 */
@Test
public void jobFailsIfInFlightSynchronousSavepointIsDiscarded() throws Exception {
    // f0 counts failJob invocations, f1 remembers the last cause handed to failJob.
    final Tuple2<Integer, Throwable> invocationCounterAndException = Tuple2.of(0, null);
    final Throwable expectedRootCause = new IOException("Custom-Exception");

    JobVertexID jobVertexID1 = new JobVertexID();
    JobVertexID jobVertexID2 = new JobVertexID();
    ExecutionGraph graph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .addJobVertex(jobVertexID1)
                    .addJobVertex(jobVertexID2)
                    .build();

    // Only the first task's attempt id is needed; it is the one passed to the decline helper.
    ExecutionVertex vertex1 = graph.getJobVertex(jobVertexID1).getTaskVertices()[0];
    ExecutionAttemptID attemptID1 = vertex1.getCurrentExecutionAttempt().getAttemptId();

    // set up the coordinator with a callback that records every job failure
    final CheckpointCoordinator coordinator =
            getCheckpointCoordinator(
                    graph,
                    new CheckpointFailureManager(
                            0,
                            new CheckpointFailureManager.FailJobCallback() {
                                @Override
                                public void failJob(Throwable cause) {
                                    invocationCounterAndException.f0 += 1;
                                    invocationCounterAndException.f1 = cause;
                                }

                                @Override
                                public void failJobDueToTaskFailure(
                                        Throwable cause, ExecutionAttemptID failingTask) {
                                    throw new AssertionError(
                                            "This method should not be called for the test.");
                                }
                            }));

    final CompletableFuture<CompletedCheckpoint> savepointFuture =
            coordinator.triggerSynchronousSavepoint(
                    false, "test-dir", SavepointFormatType.CANONICAL);
    manuallyTriggeredScheduledExecutor.triggerAll();

    // declineSynchronousSavepoint is a helper defined elsewhere in this test class; it
    // returns the pending savepoint that was declined.
    final PendingCheckpoint syncSavepoint =
            declineSynchronousSavepoint(
                    graph.getJobID(), coordinator, attemptID1, expectedRootCause);
    assertTrue(syncSavepoint.isDisposed());

    try {
        savepointFuture.get();
        fail("Expected Exception not found.");
    } catch (ExecutionException e) {
        final Throwable cause = ExceptionUtils.stripExecutionException(e);
        assertTrue(cause instanceof CheckpointException);
        assertEquals(
                expectedRootCause.getMessage(), cause.getCause().getCause().getMessage());
    }

    assertEquals(1, invocationCounterAndException.f0.intValue());
    // Checked in two steps so that a failure reports which expectation was violated.
    final Throwable reportedFailure = invocationCounterAndException.f1;
    assertTrue(reportedFailure instanceof CheckpointException);
    assertEquals(
            expectedRootCause.getMessage(),
            reportedFailure.getCause().getCause().getMessage());

    coordinator.shutdown();
}
use of org.apache.flink.runtime.executiongraph.ExecutionAttemptID in project flink by apache.
the class CheckpointCoordinatorTest method testExternallyInducedSourceWithOperatorCoordinator.
/**
 * Tests that a checkpoint still behaves correctly when the task checkpoint is triggered by a
 * master hook and finishes before the master checkpoint. Also makes sure that the operator
 * coordinators are checkpointed before the task checkpoint starts.
 */
@Test
public void testExternallyInducedSourceWithOperatorCoordinator() throws Exception {
    JobVertexID jobVertexID1 = new JobVertexID();
    JobVertexID jobVertexID2 = new JobVertexID();

    CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway gateway =
            new CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway();

    ExecutionGraph graph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .addJobVertex(jobVertexID1)
                    .addJobVertex(jobVertexID2)
                    .setTaskManagerGateway(gateway)
                    .build();

    ExecutionVertex vertex1 = graph.getJobVertex(jobVertexID1).getTaskVertices()[0];
    ExecutionVertex vertex2 = graph.getJobVertex(jobVertexID2).getTaskVertices()[0];
    ExecutionAttemptID attemptID1 = vertex1.getCurrentExecutionAttempt().getAttemptId();
    ExecutionAttemptID attemptID2 = vertex2.getCurrentExecutionAttempt().getAttemptId();

    OperatorID opID1 = vertex1.getJobVertex().getOperatorIDs().get(0).getGeneratedOperatorID();
    OperatorID opID2 = vertex2.getJobVertex().getOperatorIDs().get(0).getGeneratedOperatorID();

    // One snapshot per task, each carrying the (empty) subtask state of that task's own
    // operator. The state of opID2 belongs to vertex2 and therefore goes into the second
    // snapshot, which is the one acknowledged for attemptID2 below.
    TaskStateSnapshot taskOperatorSubtaskStates1 = new TaskStateSnapshot();
    TaskStateSnapshot taskOperatorSubtaskStates2 = new TaskStateSnapshot();
    OperatorSubtaskState subtaskState1 = OperatorSubtaskState.builder().build();
    OperatorSubtaskState subtaskState2 = OperatorSubtaskState.builder().build();
    taskOperatorSubtaskStates1.putSubtaskStateByOperatorID(opID1, subtaskState1);
    taskOperatorSubtaskStates2.putSubtaskStateByOperatorID(opID2, subtaskState2);

    // Create a mock OperatorCoordinatorCheckpointContext which completes the checkpoint
    // immediately and records that it has been called.
    AtomicBoolean coordCheckpointDone = new AtomicBoolean(false);
    OperatorCoordinatorCheckpointContext coordinatorCheckpointContext =
            new CheckpointCoordinatorTestingUtils
                            .MockOperatorCheckpointCoordinatorContextBuilder()
                    .setOnCallingCheckpointCoordinator(
                            (checkpointId, result) -> {
                                coordCheckpointDone.set(true);
                                result.complete(new byte[0]);
                            })
                    .setOperatorID(opID1)
                    .build();

    // set up the coordinator
    CheckpointCoordinator checkpointCoordinator =
            new CheckpointCoordinatorBuilder()
                    .setExecutionGraph(graph)
                    .setCheckpointCoordinatorConfiguration(
                            CheckpointCoordinatorConfiguration.builder()
                                    .setMaxConcurrentCheckpoints(Integer.MAX_VALUE)
                                    .build())
                    .setTimer(manuallyTriggeredScheduledExecutor)
                    .setCoordinatorsToCheckpoint(
                            Collections.singleton(coordinatorCheckpointContext))
                    .build();

    // Remembers the checkpoint id the master hook was invoked with.
    AtomicReference<Long> checkpointIdRef = new AtomicReference<>();

    // Add a master hook which acknowledges the task checkpoints immediately. In this case
    // the task checkpoints complete before the job master checkpoint completes.
    checkpointCoordinator.addMasterHook(
            new MasterTriggerRestoreHook<Integer>() {
                @Override
                public String getIdentifier() {
                    return "anything";
                }

                @Override
                @Nullable
                public CompletableFuture<Integer> triggerCheckpoint(
                        long checkpointId, long timestamp, Executor executor)
                        throws Exception {
                    assertTrue(
                            "The coordinator checkpoint should have finished.",
                            coordCheckpointDone.get());
                    // Acknowledge the checkpoint in the master hook so the task snapshots
                    // complete before the master state snapshot completes.
                    checkpointIdRef.set(checkpointId);
                    AcknowledgeCheckpoint acknowledgeCheckpoint1 =
                            new AcknowledgeCheckpoint(
                                    graph.getJobID(),
                                    attemptID1,
                                    checkpointId,
                                    new CheckpointMetrics(),
                                    taskOperatorSubtaskStates1);
                    AcknowledgeCheckpoint acknowledgeCheckpoint2 =
                            new AcknowledgeCheckpoint(
                                    graph.getJobID(),
                                    attemptID2,
                                    checkpointId,
                                    new CheckpointMetrics(),
                                    taskOperatorSubtaskStates2);
                    checkpointCoordinator.receiveAcknowledgeMessage(
                            acknowledgeCheckpoint1, TASK_MANAGER_LOCATION_INFO);
                    checkpointCoordinator.receiveAcknowledgeMessage(
                            acknowledgeCheckpoint2, TASK_MANAGER_LOCATION_INFO);
                    // The hook itself contributes no master state.
                    return null;
                }

                @Override
                public void restoreCheckpoint(long checkpointId, Integer checkpointData)
                        throws Exception {}

                @Override
                public SimpleVersionedSerializer<Integer> createCheckpointDataSerializer() {
                    // Trivial serializer returning fixed values.
                    return new SimpleVersionedSerializer<Integer>() {
                        @Override
                        public int getVersion() {
                            return 0;
                        }

                        @Override
                        public byte[] serialize(Integer obj) throws IOException {
                            return new byte[0];
                        }

                        @Override
                        public Integer deserialize(int version, byte[] serialized)
                                throws IOException {
                            return 1;
                        }
                    };
                }
            });

    // Verify initial state.
    assertEquals(0, checkpointCoordinator.getNumberOfPendingCheckpoints());
    assertEquals(0, checkpointCoordinator.getNumberOfRetainedSuccessfulCheckpoints());
    assertEquals(0, manuallyTriggeredScheduledExecutor.getActiveScheduledTasks().size());

    // trigger the first checkpoint. this should succeed
    final CompletableFuture<CompletedCheckpoint> checkpointFuture =
            checkpointCoordinator.triggerCheckpoint(false);
    manuallyTriggeredScheduledExecutor.triggerAll();
    FutureUtils.throwIfCompletedExceptionally(checkpointFuture);

    // now we should have a completed checkpoint
    assertEquals(1, checkpointCoordinator.getNumberOfRetainedSuccessfulCheckpoints());
    assertEquals(0, checkpointCoordinator.getNumberOfPendingCheckpoints());

    // the canceler should be removed now
    assertEquals(0, manuallyTriggeredScheduledExecutor.getActiveScheduledTasks().size());

    // validate that each task was triggered exactly once, for the checkpoint id that the
    // master hook observed
    long checkpointId = checkpointIdRef.get();
    for (ExecutionVertex vertex : Arrays.asList(vertex1, vertex2)) {
        ExecutionAttemptID attemptId = vertex.getCurrentExecutionAttempt().getAttemptId();
        assertEquals(checkpointId, gateway.getOnlyTriggeredCheckpoint(attemptId).checkpointId);
    }

    CompletedCheckpoint success = checkpointCoordinator.getSuccessfulCheckpoints().get(0);
    assertEquals(graph.getJobID(), success.getJobId());
    assertEquals(2, success.getOperatorStates().size());

    checkpointCoordinator.shutdown();
}
Aggregations