use of org.apache.flink.runtime.executiongraph.ExecutionAttemptID in project flink by apache.
the class CheckpointCoordinatorTest method testSavepointsAreNotSubsumed.
/**
 * Verifies that pending savepoints are never subsumed by completed checkpoints, and that a
 * completed savepoint does not subsume other pending checkpoints or savepoints.
 *
 * <p>Scenario: a savepoint and two checkpoints are triggered. The second checkpoint completes and
 * subsumes the first checkpoint, but not the first savepoint. Then another checkpoint and a second
 * savepoint are triggered. The second savepoint completes while the third checkpoint and the first
 * savepoint both stay pending. Afterwards the first savepoint is acknowledged and completes.
 * Finally a fourth checkpoint completes, and the test checks that it is reported with the second
 * checkpoint as the last subsumed checkpoint.
 *
 * <p>The completed checkpoint store retains a single checkpoint, and the coordinator is a Mockito
 * spy so that calls to {@code sendAcknowledgeMessages} can be verified.
 */
@Test
public void testSavepointsAreNotSubsumed() throws Exception {
// two job vertices; the test acknowledges with the first task's attempt of each vertex
JobVertexID jobVertexID1 = new JobVertexID();
JobVertexID jobVertexID2 = new JobVertexID();
ExecutionGraph graph = new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder().addJobVertex(jobVertexID1).addJobVertex(jobVertexID2).build();
ExecutionVertex vertex1 = graph.getJobVertex(jobVertexID1).getTaskVertices()[0];
ExecutionVertex vertex2 = graph.getJobVertex(jobVertexID2).getTaskVertices()[0];
ExecutionAttemptID attemptID1 = vertex1.getCurrentExecutionAttempt().getAttemptId();
ExecutionAttemptID attemptID2 = vertex2.getCurrentExecutionAttempt().getAttemptId();
// the counter is read via getLast() after each trigger to learn the id that was just handed out
StandaloneCheckpointIDCounter counter = new StandaloneCheckpointIDCounter();
// set up the coordinator (as a spy, unlimited concurrent checkpoints, store retaining 1 checkpoint)
CheckpointCoordinator checkpointCoordinator = spy(new CheckpointCoordinatorBuilder().setExecutionGraph(graph).setCheckpointCoordinatorConfiguration(CheckpointCoordinatorConfiguration.builder().setMaxConcurrentCheckpoints(Integer.MAX_VALUE).build()).setCheckpointIDCounter(counter).setCompletedCheckpointStore(new StandaloneCompletedCheckpointStore(1)).setTimer(manuallyTriggeredScheduledExecutor).build());
String savepointDir = tmpFolder.newFolder().getAbsolutePath();
// Phase 1: trigger the first savepoint, then two checkpoints; all three stay pending
CompletableFuture<CompletedCheckpoint> savepointFuture1 = checkpointCoordinator.triggerSavepoint(savepointDir, SavepointFormatType.CANONICAL);
manuallyTriggeredScheduledExecutor.triggerAll();
long savepointId1 = counter.getLast();
assertEquals(1, checkpointCoordinator.getNumberOfPendingCheckpoints());
CompletableFuture<CompletedCheckpoint> checkpointFuture1 = checkpointCoordinator.triggerCheckpoint(false);
manuallyTriggeredScheduledExecutor.triggerAll();
assertEquals(2, checkpointCoordinator.getNumberOfPendingCheckpoints());
FutureUtils.throwIfCompletedExceptionally(checkpointFuture1);
CompletableFuture<CompletedCheckpoint> checkpointFuture2 = checkpointCoordinator.triggerCheckpoint(false);
manuallyTriggeredScheduledExecutor.triggerAll();
FutureUtils.throwIfCompletedExceptionally(checkpointFuture2);
long checkpointId2 = counter.getLast();
assertEquals(3, checkpointCoordinator.getNumberOfPendingCheckpoints());
// Phase 2: 2nd checkpoint should subsume the 1st checkpoint, but not the savepoint
checkpointCoordinator.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(graph.getJobID(), attemptID1, checkpointId2), TASK_MANAGER_LOCATION_INFO);
checkpointCoordinator.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(graph.getJobID(), attemptID2, checkpointId2), TASK_MANAGER_LOCATION_INFO);
// no completed checkpoint before checkpointId2, hence INVALID_CHECKPOINT_ID as last argument
verify(checkpointCoordinator, times(1)).sendAcknowledgeMessages(anyList(), eq(checkpointId2), anyLong(), eq(INVALID_CHECKPOINT_ID));
// only the first savepoint is still pending; it is neither disposed nor completed
assertEquals(1, checkpointCoordinator.getNumberOfPendingCheckpoints());
assertEquals(1, checkpointCoordinator.getNumberOfRetainedSuccessfulCheckpoints());
assertFalse(checkpointCoordinator.getPendingCheckpoints().get(savepointId1).isDisposed());
assertFalse(savepointFuture1.isDone());
// Phase 3: trigger a third checkpoint and a second savepoint; three are pending again
CompletableFuture<CompletedCheckpoint> checkpointFuture3 = checkpointCoordinator.triggerCheckpoint(false);
manuallyTriggeredScheduledExecutor.triggerAll();
FutureUtils.throwIfCompletedExceptionally(checkpointFuture3);
assertEquals(2, checkpointCoordinator.getNumberOfPendingCheckpoints());
CompletableFuture<CompletedCheckpoint> savepointFuture2 = checkpointCoordinator.triggerSavepoint(savepointDir, SavepointFormatType.CANONICAL);
manuallyTriggeredScheduledExecutor.triggerAll();
long savepointId2 = counter.getLast();
FutureUtils.throwIfCompletedExceptionally(savepointFuture2);
assertEquals(3, checkpointCoordinator.getNumberOfPendingCheckpoints());
// savepoints should not subsume checkpoints
checkpointCoordinator.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(graph.getJobID(), attemptID1, savepointId2), TASK_MANAGER_LOCATION_INFO);
checkpointCoordinator.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(graph.getJobID(), attemptID2, savepointId2), TASK_MANAGER_LOCATION_INFO);
// we do not send notify checkpoint complete for savepoints
verify(checkpointCoordinator, times(0)).sendAcknowledgeMessages(anyList(), eq(savepointId2), anyLong(), anyLong());
// two entries remain pending after the second savepoint completed, the first savepoint among them
assertEquals(2, checkpointCoordinator.getNumberOfPendingCheckpoints());
assertEquals(1, checkpointCoordinator.getNumberOfRetainedSuccessfulCheckpoints());
assertFalse(checkpointCoordinator.getPendingCheckpoints().get(savepointId1).isDisposed());
assertFalse(savepointFuture1.isDone());
assertNotNull(savepointFuture2.get());
// Phase 4: ack first savepoint; it completes and leaves one pending checkpoint
checkpointCoordinator.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(graph.getJobID(), attemptID1, savepointId1), TASK_MANAGER_LOCATION_INFO);
checkpointCoordinator.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(graph.getJobID(), attemptID2, savepointId1), TASK_MANAGER_LOCATION_INFO);
// we do not send notify checkpoint complete for savepoints
verify(checkpointCoordinator, times(0)).sendAcknowledgeMessages(anyList(), eq(savepointId1), anyLong(), anyLong());
assertEquals(1, checkpointCoordinator.getNumberOfPendingCheckpoints());
assertEquals(1, checkpointCoordinator.getNumberOfRetainedSuccessfulCheckpoints());
assertNotNull(savepointFuture1.get());
// Phase 5: a fourth checkpoint completes
CompletableFuture<CompletedCheckpoint> checkpointFuture4 = checkpointCoordinator.triggerCheckpoint(false);
manuallyTriggeredScheduledExecutor.triggerAll();
FutureUtils.throwIfCompletedExceptionally(checkpointFuture4);
long checkpointId4 = counter.getLast();
checkpointCoordinator.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(graph.getJobID(), attemptID1, checkpointId4), TASK_MANAGER_LOCATION_INFO);
checkpointCoordinator.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(graph.getJobID(), attemptID2, checkpointId4), TASK_MANAGER_LOCATION_INFO);
// checkpoint2 would be subsumed, so it is expected as the last argument (not one of the savepoints)
verify(checkpointCoordinator, times(1)).sendAcknowledgeMessages(anyList(), eq(checkpointId4), anyLong(), eq(checkpointId2));
}
use of org.apache.flink.runtime.executiongraph.ExecutionAttemptID in project flink by apache.
the class CheckpointCoordinatorTriggeringTest method testTriggeringFullCheckpoints.
/**
 * Restores a coordinator from a savepoint in {@code NO_CLAIM} mode, takes another savepoint
 * before any checkpoint has completed, and then checks that the next triggered checkpoint is sent
 * to the task as a {@code FULL_CHECKPOINT}.
 */
@Test
public void testTriggeringFullCheckpoints() throws Exception {
    final CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway recordingGateway =
            new CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway();
    final JobVertexID vertexId = new JobVertexID();
    final ExecutionGraph executionGraph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .addJobVertex(vertexId)
                    .setTaskManagerGateway(recordingGateway)
                    .build();
    final ExecutionAttemptID taskAttempt =
            executionGraph
                    .getJobVertex(vertexId)
                    .getTaskVertices()[0]
                    .getCurrentExecutionAttempt()
                    .getAttemptId();

    // this savepoint is the one the coordinator below is restored from
    final CompletedCheckpoint initialSavepoint = takeSavepoint(executionGraph, taskAttempt);

    // build a fresh coordinator and restore it from the savepoint in NO_CLAIM mode
    final StandaloneCompletedCheckpointStore completedStore =
            new StandaloneCompletedCheckpointStore(1);
    final StandaloneCheckpointIDCounter idCounter = new StandaloneCheckpointIDCounter();
    final CheckpointCoordinator coordinator =
            createCheckpointCoordinator(executionGraph, completedStore, idCounter);
    final SavepointRestoreSettings restoreSettings =
            SavepointRestoreSettings.forPath(
                    initialSavepoint.getExternalPointer(), true, RestoreMode.NO_CLAIM);
    coordinator.restoreSavepoint(
            restoreSettings, executionGraph.getAllVertices(), this.getClass().getClassLoader());

    // a savepoint taken before any checkpoint completes must not change the outcome:
    // the next triggered checkpoint should still be a full one
    takeSavepoint(executionGraph, taskAttempt, coordinator, 2);
    coordinator.startCheckpointScheduler();
    // forget what the gateway recorded so far, so only the next trigger is inspected
    recordingGateway.resetCount();

    final CompletableFuture<CompletedCheckpoint> fullCheckpoint =
            coordinator.triggerCheckpoint(true);
    manuallyTriggeredScheduledExecutor.triggerAll();
    // the acknowledgement uses the literal id 3 — presumably the id following the savepoint
    // taken with id 2 above; verify against takeSavepoint
    final AcknowledgeCheckpoint acknowledgement =
            new AcknowledgeCheckpoint(executionGraph.getJobID(), taskAttempt, 3);
    coordinator.receiveAcknowledgeMessage(acknowledgement, TASK_MANAGER_LOCATION_INFO);
    fullCheckpoint.get();

    // the checkpoint should be a FULL_CHECKPOINT
    assertThat(
            recordingGateway
                    .getOnlyTriggeredCheckpoint(taskAttempt)
                    .checkpointOptions
                    .getCheckpointType(),
            is(CheckpointType.FULL_CHECKPOINT));
}
use of org.apache.flink.runtime.executiongraph.ExecutionAttemptID in project flink by apache.
the class CheckpointCoordinatorTriggeringTest method testTriggerCheckpointSnapshotMasterHookFailed.
/**
 * Checks that a checkpoint trigger fails with {@code TRIGGER_CHECKPOINT_FAILURE} when the master
 * hook's future completes exceptionally, and that afterwards the coordinator is no longer
 * triggering and its trigger request queue is empty.
 */
@Test
public void testTriggerCheckpointSnapshotMasterHookFailed() throws Exception {
    final JobVertexID vertexId = new JobVertexID();
    final CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway recordingGateway =
            new CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway();
    final ExecutionGraph executionGraph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .addJobVertex(vertexId)
                    .setTaskManagerGateway(recordingGateway)
                    .build();
    final ExecutionAttemptID taskAttempt =
            executionGraph
                    .getJobVertex(vertexId)
                    .getTaskVertices()[0]
                    .getCurrentExecutionAttempt()
                    .getAttemptId();

    // NOTE(review): the coordinator comes from the no-arg createCheckpointCoordinator() and the
    // graph built above is not passed to it — confirm the recording gateway is actually wired to
    // this coordinator, otherwise the gateway assertion at the end is vacuous.
    final CheckpointCoordinator coordinator = createCheckpointCoordinator();

    // register a master hook whose result the test controls through this future
    final CompletableFuture<String> hookResult = new CompletableFuture<>();
    coordinator.addMasterHook(new TestingMasterHook(hookResult));
    coordinator.startCheckpointScheduler();
    final CompletableFuture<CompletedCheckpoint> triggerResult =
            triggerPeriodicCheckpoint(coordinator);

    // the trigger cannot finish while the master hook's future is still open
    manuallyTriggeredScheduledExecutor.triggerAll();
    assertTrue(coordinator.isTriggering());

    // fail the master hook and let the trigger continue
    hookResult.completeExceptionally(new Exception("by design"));
    manuallyTriggeredScheduledExecutor.triggerAll();
    assertFalse(coordinator.isTriggering());

    try {
        triggerResult.get();
        fail("Should not reach here");
    } catch (ExecutionException expected) {
        final Optional<CheckpointException> checkpointFailure =
                ExceptionUtils.findThrowable(expected, CheckpointException.class);
        assertTrue(checkpointFailure.isPresent());
        assertEquals(
                CheckpointFailureReason.TRIGGER_CHECKPOINT_FAILURE,
                checkpointFailure.get().getCheckpointFailureReason());
    }

    // no checkpoint was recorded for the task, and nothing is left queued
    assertEquals(0, recordingGateway.getTriggeredCheckpoints(taskAttempt).size());
    assertEquals(0, coordinator.getTriggerRequestQueue().size());
}
use of org.apache.flink.runtime.executiongraph.ExecutionAttemptID in project flink by apache.
the class CheckpointCoordinatorTriggeringTest method testTriggerCheckpointRequestQueuedWithFailure.
/**
 * Queues two trigger requests behind an in-flight one whose checkpoint id allocation fails, and
 * checks that the failure of the first request does not prevent the two queued requests from
 * being triggered on the task.
 */
@Test
public void testTriggerCheckpointRequestQueuedWithFailure() throws Exception {
    final JobVertexID vertexId = new JobVertexID();
    final CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway recordingGateway =
            new CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway();
    final ExecutionGraph executionGraph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .addJobVertex(vertexId)
                    .setTaskManagerGateway(recordingGateway)
                    .build();
    final ExecutionAttemptID taskAttempt =
            executionGraph
                    .getJobVertex(vertexId)
                    .getTaskVertices()[0]
                    .getCurrentExecutionAttempt()
                    .getAttemptId();

    // coordinator whose id counter is an UnstableCheckpointIDCounter configured with the
    // predicate (id == 0); the first trigger is expected to fail through it
    final CheckpointCoordinator coordinator =
            new CheckpointCoordinatorBuilder()
                    .setExecutionGraph(executionGraph)
                    .setCheckpointIDCounter(new UnstableCheckpointIDCounter(id -> id == 0))
                    .setTimer(manuallyTriggeredScheduledExecutor)
                    .build();
    coordinator.startCheckpointScheduler();

    // start a non-periodic checkpoint first; it is in flight, so nothing is queued yet
    final CompletableFuture<CompletedCheckpoint> firstResult =
            triggerNonPeriodicCheckpoint(coordinator);
    assertTrue(coordinator.isTriggering());
    assertEquals(0, coordinator.getTriggerRequestQueue().size());

    // two more triggers arrive before the first one has finished and are therefore queued
    final CompletableFuture<CompletedCheckpoint> secondResult =
            triggerNonPeriodicCheckpoint(coordinator);
    final CompletableFuture<CompletedCheckpoint> thirdResult =
            triggerNonPeriodicCheckpoint(coordinator);
    assertTrue(coordinator.isTriggering());
    assertEquals(2, coordinator.getTriggerRequestQueue().size());

    manuallyTriggeredScheduledExecutor.triggerAll();

    // the first triggered checkpoint fails by design through UnstableCheckpointIDCounter
    assertTrue(firstResult.isCompletedExceptionally());
    assertFalse(secondResult.isCompletedExceptionally());
    assertFalse(thirdResult.isCompletedExceptionally());

    // the queue is drained and both queued requests reached the task
    assertFalse(coordinator.isTriggering());
    assertEquals(0, coordinator.getTriggerRequestQueue().size());
    assertEquals(2, recordingGateway.getTriggeredCheckpoints(taskAttempt).size());
}
use of org.apache.flink.runtime.executiongraph.ExecutionAttemptID in project flink by apache.
the class FileCache method createTmpFile.
// ------------------------------------------------------------------------
/**
 * Returns the future of the local copy of a cached file, starting the copy if none has been
 * registered for this job and file name yet.
 *
 * <p>The given execution attempt is always recorded as a reference holder for the job. If a copy
 * task already exists under {@code name} for the job, that task's future is returned as is.
 * Otherwise a new copy task is submitted to the executor service: it copies from the blob service
 * when the entry carries a blob key, and via {@code CopyFromDFSProcess} when it does not. Target
 * directories are picked from the storage directories in rotation, with one sub-directory per
 * job id.
 *
 * @param name The name under which the file is registered for the job.
 * @param entry The cache entry descriptor (path, executable flag)
 * @param jobID The ID of the job for which the file is copied.
 * @param executionId The execution attempt that is registered as holding a reference to the
 *     job's files.
 * @return The handle to the task that copies the file.
 */
public Future<Path> createTmpFile(String name, DistributedCacheEntry entry, JobID jobID, ExecutionAttemptID executionId) throws Exception {
    synchronized (lock) {
        final Map<String, Future<Path>> filesOfJob =
                entries.computeIfAbsent(jobID, k -> new HashMap<>());

        // remember that this execution attempt references the job's files
        jobRefHolders.computeIfAbsent(jobID, id -> new HashSet<>()).add(executionId);

        final Future<Path> existingCopy = filesOfJob.get(name);
        if (existingCopy != null) {
            // a copy task was registered earlier; hand out the same future
            return existingCopy;
        }

        // choose the next storage directory in rotation and wrap around at the end
        final File jobDirectory =
                new File(storageDirectories[nextDirectory++], jobID.toString());
        if (nextDirectory >= storageDirectories.length) {
            nextDirectory = 0;
        }
        final Path targetPath = new Path(jobDirectory.getAbsolutePath());

        // the source of the copy depends on whether the entry has a blob key
        final Callable<Path> copyProcess;
        if (entry.blobKey != null) {
            copyProcess = new CopyFromBlobProcess(entry, jobID, blobService, targetPath);
        } else {
            copyProcess = new CopyFromDFSProcess(entry, targetPath);
        }

        // start the copy and register its future so later callers reuse it
        final FutureTask<Path> copyTask = new FutureTask<>(copyProcess);
        executorService.submit(copyTask);
        filesOfJob.put(name, copyTask);
        return copyTask;
    }
}
Aggregations