Use of org.apache.flink.runtime.checkpoint.StandaloneCheckpointIDCounter in project flink by apache.
Class JobManagerHARecoveryTest, method testJobRecoveryWhenLosingLeadership.
/**
* Tests that the persisted job is not removed from the SubmittedJobGraphStore if the JobManager
* loses its leadership. Furthermore, it tests that the JobManager can recover the job from
* the SubmittedJobGraphStore and that the checkpoint state is recovered as well.
*/
@Test
public void testJobRecoveryWhenLosingLeadership() throws Exception {
    FiniteDuration timeout = new FiniteDuration(30, TimeUnit.SECONDS);
    FiniteDuration jobRecoveryTimeout = new FiniteDuration(3, TimeUnit.SECONDS);
    Deadline deadline = new FiniteDuration(2, TimeUnit.MINUTES).fromNow();
    Configuration flinkConfiguration = new Configuration();
    UUID leaderSessionID = UUID.randomUUID();
    UUID newLeaderSessionID = UUID.randomUUID();
    int slots = 2;
    ActorRef archive = null;
    ActorRef jobManager = null;
    ActorRef taskManager = null;
    flinkConfiguration.setString(HighAvailabilityOptions.HA_MODE, "zookeeper");
    flinkConfiguration.setString(HighAvailabilityOptions.HA_STORAGE_PATH, temporaryFolder.newFolder().toString());
    flinkConfiguration.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, slots);
    try {
        Scheduler scheduler = new Scheduler(TestingUtils.defaultExecutionContext());
        MySubmittedJobGraphStore mySubmittedJobGraphStore = new MySubmittedJobGraphStore();
        MyCheckpointStore checkpointStore = new MyCheckpointStore();
        CheckpointIDCounter checkpointCounter = new StandaloneCheckpointIDCounter();
        CheckpointRecoveryFactory checkpointStateFactory = new MyCheckpointRecoveryFactory(checkpointStore, checkpointCounter);
        TestingLeaderElectionService myLeaderElectionService = new TestingLeaderElectionService();
        TestingLeaderRetrievalService myLeaderRetrievalService = new TestingLeaderRetrievalService();
        InstanceManager instanceManager = new InstanceManager();
        instanceManager.addInstanceListener(scheduler);
        archive = system.actorOf(Props.create(MemoryArchivist.class, 10));
        Props jobManagerProps = Props.create(
                TestingJobManager.class,
                flinkConfiguration,
                TestingUtils.defaultExecutor(),
                TestingUtils.defaultExecutor(),
                instanceManager,
                scheduler,
                new BlobLibraryCacheManager(new BlobServer(flinkConfiguration), 3600000),
                archive,
                new FixedDelayRestartStrategy.FixedDelayRestartStrategyFactory(Int.MaxValue(), 100),
                timeout,
                myLeaderElectionService,
                mySubmittedJobGraphStore,
                checkpointStateFactory,
                jobRecoveryTimeout,
                Option.apply(null));
        jobManager = system.actorOf(jobManagerProps);
        ActorGateway gateway = new AkkaActorGateway(jobManager, leaderSessionID);
        taskManager = TaskManager.startTaskManagerComponentsAndActor(
                flinkConfiguration,
                ResourceID.generate(),
                system,
                "localhost",
                Option.apply("taskmanager"),
                Option.apply((LeaderRetrievalService) myLeaderRetrievalService),
                true,
                TestingTaskManager.class);
        ActorGateway tmGateway = new AkkaActorGateway(taskManager, leaderSessionID);
        Future<Object> tmAlive = tmGateway.ask(TestingMessages.getAlive(), deadline.timeLeft());
        Await.ready(tmAlive, deadline.timeLeft());
        JobVertex sourceJobVertex = new JobVertex("Source");
        sourceJobVertex.setInvokableClass(BlockingStatefulInvokable.class);
        sourceJobVertex.setParallelism(slots);
        JobGraph jobGraph = new JobGraph("TestingJob", sourceJobVertex);
        List<JobVertexID> vertexId = Collections.singletonList(sourceJobVertex.getID());
        jobGraph.setSnapshotSettings(new JobSnapshottingSettings(
                vertexId,
                vertexId,
                vertexId,
                100,
                10 * 60 * 1000,
                0,
                1,
                ExternalizedCheckpointSettings.none(),
                null,
                true));
        BlockingStatefulInvokable.initializeStaticHelpers(slots);
        Future<Object> isLeader = gateway.ask(TestingJobManagerMessages.getNotifyWhenLeader(), deadline.timeLeft());
        Future<Object> isConnectedToJobManager = tmGateway.ask(new TestingTaskManagerMessages.NotifyWhenRegisteredAtJobManager(jobManager), deadline.timeLeft());
        // tell the JobManager that it is the leader
        myLeaderElectionService.isLeader(leaderSessionID);
        // tell the TaskManager who the leader is
        myLeaderRetrievalService.notifyListener(gateway.path(), leaderSessionID);
        Await.ready(isLeader, deadline.timeLeft());
        Await.ready(isConnectedToJobManager, deadline.timeLeft());
        // submit blocking job
        Future<Object> jobSubmitted = gateway.ask(new JobManagerMessages.SubmitJob(jobGraph, ListeningBehaviour.DETACHED), deadline.timeLeft());
        Await.ready(jobSubmitted, deadline.timeLeft());
        // wait for some checkpoints to complete
        BlockingStatefulInvokable.awaitCompletedCheckpoints();
        Future<Object> jobRemoved = gateway.ask(new TestingJobManagerMessages.NotifyWhenJobRemoved(jobGraph.getJobID()), deadline.timeLeft());
        // revoke leadership
        myLeaderElectionService.notLeader();
        // check that the job gets removed from the JobManager
        Await.ready(jobRemoved, deadline.timeLeft());
        // but stays in the submitted job graph store
        assertTrue(mySubmittedJobGraphStore.contains(jobGraph.getJobID()));
        Future<Object> jobRunning = gateway.ask(new TestingJobManagerMessages.NotifyWhenJobStatus(jobGraph.getJobID(), JobStatus.RUNNING), deadline.timeLeft());
        // make the JobManager the leader again
        myLeaderElectionService.isLeader(newLeaderSessionID);
        // tell the TaskManager about it
        myLeaderRetrievalService.notifyListener(gateway.path(), newLeaderSessionID);
        // wait until the job is recovered and reaches the state RUNNING
        Await.ready(jobRunning, deadline.timeLeft());
        Future<Object> jobFinished = gateway.ask(new TestingJobManagerMessages.NotifyWhenJobRemoved(jobGraph.getJobID()), deadline.timeLeft());
        BlockingInvokable.unblock();
        // wait until the job has finished
        Await.ready(jobFinished, deadline.timeLeft());
        // check that the job has been removed from the submitted job graph store
        assertFalse(mySubmittedJobGraphStore.contains(jobGraph.getJobID()));
        // check that the state has been recovered
        long[] recoveredStates = BlockingStatefulInvokable.getRecoveredStates();
        for (long state : recoveredStates) {
            boolean isExpected = state >= BlockingStatefulInvokable.NUM_CHECKPOINTS_TO_COMPLETE;
            assertTrue("Did not recover checkpoint state correctly, expecting >= " + BlockingStatefulInvokable.NUM_CHECKPOINTS_TO_COMPLETE + ", but state was " + state, isExpected);
        }
    } finally {
        if (archive != null) {
            archive.tell(PoisonPill.getInstance(), ActorRef.noSender());
        }
        if (jobManager != null) {
            jobManager.tell(PoisonPill.getInstance(), ActorRef.noSender());
        }
        if (taskManager != null) {
            taskManager.tell(PoisonPill.getInstance(), ActorRef.noSender());
        }
    }
}
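The counter created above with new StandaloneCheckpointIDCounter() is the in-memory, non-HA implementation of the CheckpointIDCounter contract that MyCheckpointRecoveryFactory hands to the JobManager. The following minimal sketch is not part of the test; the helper method name is made up, and the assumption that a fresh standalone counter starts handing out checkpoint ID 1 is noted in the comments.
void standaloneCheckpointIdCounterSketch() throws Exception {
    CheckpointIDCounter counter = new StandaloneCheckpointIDCounter();
    counter.start();                             // no-op for the in-memory implementation
    long firstId = counter.getAndIncrement();    // assumed to start at 1 for a fresh counter
    counter.setCount(42L);                       // re-seed the counter, e.g. after a restore
    long reseededId = counter.getAndIncrement(); // 42
    assertEquals(1L, firstId);
    assertEquals(42L, reseededId);
}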
Use of org.apache.flink.runtime.checkpoint.StandaloneCheckpointIDCounter in project flink by apache.
Class DefaultSchedulerTest, method doTestCheckpointCleanerIsClosedAfterCheckpointServices.
/**
* Visible for re-use in {@link
* org.apache.flink.runtime.scheduler.adaptive.AdaptiveSchedulerTest}.
*/
public static void doTestCheckpointCleanerIsClosedAfterCheckpointServices(
        BiFunction<CheckpointRecoveryFactory, CheckpointsCleaner, SchedulerNG> schedulerFactory,
        ScheduledExecutorService executorService) throws Exception {
    final CountDownLatch checkpointServicesShutdownBlocked = new CountDownLatch(1);
    final CountDownLatch cleanerClosed = new CountDownLatch(1);
    final CompletedCheckpointStore completedCheckpointStore = new StandaloneCompletedCheckpointStore(1) {

        @Override
        public void shutdown(JobStatus jobStatus, CheckpointsCleaner checkpointsCleaner) throws Exception {
            checkpointServicesShutdownBlocked.await();
            super.shutdown(jobStatus, checkpointsCleaner);
        }
    };
    final CheckpointIDCounter checkpointIDCounter = new StandaloneCheckpointIDCounter() {

        @Override
        public void shutdown(JobStatus jobStatus) throws Exception {
            checkpointServicesShutdownBlocked.await();
            super.shutdown(jobStatus);
        }
    };
    final CheckpointsCleaner checkpointsCleaner = new CheckpointsCleaner() {

        @Override
        public synchronized CompletableFuture<Void> closeAsync() {
            cleanerClosed.countDown();
            return super.closeAsync();
        }
    };
    final SchedulerNG scheduler = schedulerFactory.apply(
            new TestingCheckpointRecoveryFactory(completedCheckpointStore, checkpointIDCounter),
            checkpointsCleaner);
    final CompletableFuture<Void> schedulerClosed = new CompletableFuture<>();
    final CountDownLatch schedulerClosing = new CountDownLatch(1);
    executorService.submit(() -> {
        scheduler.closeAsync().thenRun(() -> schedulerClosed.complete(null));
        schedulerClosing.countDown();
    });
    // Wait for scheduler to start closing.
    schedulerClosing.await();
    assertFalse(
            "CheckpointCleaner should not close before checkpoint services.",
            cleanerClosed.await(10, TimeUnit.MILLISECONDS));
    checkpointServicesShutdownBlocked.countDown();
    cleanerClosed.await();
    schedulerClosed.get();
}
Use of org.apache.flink.runtime.checkpoint.StandaloneCheckpointIDCounter in project flink by apache.
Class AdaptiveSchedulerTest, method testExceptionHistoryWithTaskFailureFromStopWithSavepoint.
@Test
public void testExceptionHistoryWithTaskFailureFromStopWithSavepoint() throws Exception {
    final Exception expectedException = new Exception("Expected Local Exception");
    Consumer<JobGraph> setupJobGraph = jobGraph -> jobGraph.setSnapshotSettings(
            new JobCheckpointingSettings(CheckpointCoordinatorConfiguration.builder().build(), null));
    final CompletedCheckpointStore completedCheckpointStore = new StandaloneCompletedCheckpointStore(1);
    final CheckpointIDCounter checkpointIDCounter = new StandaloneCheckpointIDCounter();
    final CheckpointsCleaner checkpointCleaner = new CheckpointsCleaner();
    TestingCheckpointRecoveryFactory checkpointRecoveryFactory =
            new TestingCheckpointRecoveryFactory(completedCheckpointStore, checkpointIDCounter);
    Consumer<AdaptiveSchedulerBuilder> setupScheduler = builder ->
            builder.setCheckpointRecoveryFactory(checkpointRecoveryFactory).setCheckpointCleaner(checkpointCleaner);
    BiConsumer<AdaptiveScheduler, List<ExecutionAttemptID>> testLogic = (scheduler, attemptIds) -> {
        final ExecutionAttemptID attemptId = attemptIds.get(1);
        scheduler.stopWithSavepoint("file:///tmp/target", true, SavepointFormatType.CANONICAL);
        scheduler.updateTaskExecutionState(new TaskExecutionStateTransition(
                new TaskExecutionState(attemptId, ExecutionState.FAILED, expectedException)));
    };
    final Iterable<RootExceptionHistoryEntry> actualExceptionHistory =
            runExceptionHistoryTests(testLogic, setupScheduler, setupJobGraph);
    assertThat(actualExceptionHistory).hasSize(1);
    final RootExceptionHistoryEntry failure = actualExceptionHistory.iterator().next();
    assertThat(failure.getException().deserializeError(classLoader)).isEqualTo(expectedException);
}
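The scheduler tests on this page keep wiring up the same in-memory checkpoint services by hand: a StandaloneCompletedCheckpointStore, a StandaloneCheckpointIDCounter, and a TestingCheckpointRecoveryFactory wrapping both. As a hedged illustration only, a hypothetical helper (not part of Flink's test utilities; the two-argument TestingCheckpointRecoveryFactory constructor is the one used in the snippets above) could bundle that wiring:
// Hypothetical helper, not part of Flink: bundles the in-memory checkpoint
// services that the scheduler tests on this page construct inline.
static TestingCheckpointRecoveryFactory inMemoryCheckpointRecoveryFactory() {
    CompletedCheckpointStore store = new StandaloneCompletedCheckpointStore(1); // retain a single completed checkpoint
    CheckpointIDCounter counter = new StandaloneCheckpointIDCounter();
    return new TestingCheckpointRecoveryFactory(store, counter);
}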
Use of org.apache.flink.runtime.checkpoint.StandaloneCheckpointIDCounter in project flink by apache.
Class DefaultExecutionGraphFactoryTest, method testRestoringModifiedJobFromSavepointFails.
@Test
public void testRestoringModifiedJobFromSavepointFails() throws Exception {
    final JobGraph jobGraphWithNewOperator = createJobGraphWithSavepoint(false, 42L);
    final ExecutionGraphFactory executionGraphFactory = createExecutionGraphFactory();
    try {
        executionGraphFactory.createAndRestoreExecutionGraph(
                jobGraphWithNewOperator,
                new StandaloneCompletedCheckpointStore(1),
                new CheckpointsCleaner(),
                new StandaloneCheckpointIDCounter(),
                TaskDeploymentDescriptorFactory.PartitionLocationConstraint.CAN_BE_UNKNOWN,
                0L,
                new DefaultVertexAttemptNumberStore(),
                SchedulerBase.computeVertexParallelismStore(jobGraphWithNewOperator),
                (execution, previousState, newState) -> {
                },
                log);
        fail("Expected ExecutionGraph creation to fail because of non restored state.");
    } catch (Exception e) {
        assertThat(e, FlinkMatchers.containsMessage("Failed to rollback to checkpoint/savepoint"));
    }
}
Use of org.apache.flink.runtime.checkpoint.StandaloneCheckpointIDCounter in project flink by apache.
Class DefaultExecutionGraphFactoryTest, method testRestoringModifiedJobFromSavepointWithAllowNonRestoredStateSucceeds.
@Test
public void testRestoringModifiedJobFromSavepointWithAllowNonRestoredStateSucceeds() throws Exception {
    // create savepoint data
    final long savepointId = 42L;
    final JobGraph jobGraphWithNewOperator = createJobGraphWithSavepoint(true, savepointId);
    final ExecutionGraphFactory executionGraphFactory = createExecutionGraphFactory();
    final StandaloneCompletedCheckpointStore completedCheckpointStore = new StandaloneCompletedCheckpointStore(1);
    executionGraphFactory.createAndRestoreExecutionGraph(
            jobGraphWithNewOperator,
            completedCheckpointStore,
            new CheckpointsCleaner(),
            new StandaloneCheckpointIDCounter(),
            TaskDeploymentDescriptorFactory.PartitionLocationConstraint.CAN_BE_UNKNOWN,
            0L,
            new DefaultVertexAttemptNumberStore(),
            SchedulerBase.computeVertexParallelismStore(jobGraphWithNewOperator),
            (execution, previousState, newState) -> {
            },
            log);
    final CompletedCheckpoint savepoint = completedCheckpointStore.getLatestCheckpoint();
    MatcherAssert.assertThat(savepoint, notNullValue());
    MatcherAssert.assertThat(savepoint.getCheckpointID(), Matchers.is(savepointId));
}
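For context on the fresh StandaloneCheckpointIDCounter passed into createAndRestoreExecutionGraph above: after a savepoint is restored, the checkpoint ID counter is expected to be advanced past the restored ID so that newly triggered checkpoints receive strictly larger IDs. The sketch below is a hypothetical illustration of that seeding and is not part of the test; it only uses the setCount and getAndIncrement methods of the CheckpointIDCounter interface, and the assertion style follows the test above.
void counterSeedingSketch() throws Exception {
    long savepointId = 42L;
    CheckpointIDCounter counter = new StandaloneCheckpointIDCounter();
    // seed the counter past the restored savepoint ID, as the restore path is expected to do
    counter.setCount(savepointId + 1);
    MatcherAssert.assertThat(counter.getAndIncrement(), Matchers.is(savepointId + 1));
}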