use of org.apache.flink.runtime.leaderelection.TestingLeaderElectionService in project flink by apache.
the class JobManagerHARecoveryTest method testJobRecoveryWhenLosingLeadership.
/**
* Tests that the persisted job is not removed from the SubmittedJobGraphStore if the JobManager
* loses its leadership. Furthermore, it tests that the job manager can recover the job from
* the SubmittedJobGraphStore and checkpoint state is recovered as well.
*
* <p>Sequence exercised by the test body:
* <ol>
* <li>start an archive, a TestingJobManager and a TestingTaskManager actor,</li>
* <li>grant leadership and wait until the TaskManager has registered,</li>
* <li>submit a job made of a single BlockingStatefulInvokable vertex and wait for checkpoints,</li>
* <li>revoke leadership and assert the job left the JobManager but is still in the store,</li>
* <li>grant leadership again with a new session ID and wait for the job to be RUNNING,</li>
* <li>unblock the job, wait for its removal, and assert that the store no longer contains it
* and that every recovered state is at least NUM_CHECKPOINTS_TO_COMPLETE.</li>
* </ol>
*
* @throws Exception if any of the awaited futures times out or a setup step fails
*/
@Test
public void testJobRecoveryWhenLosingLeadership() throws Exception {
// timeout handed to the JobManager actor via its Props
FiniteDuration timeout = new FiniteDuration(30, TimeUnit.SECONDS);
// job recovery timeout handed to the JobManager actor via its Props
FiniteDuration jobRecoveryTimeout = new FiniteDuration(3, TimeUnit.SECONDS);
// overall time budget; every ask/await below uses whatever is left of it
Deadline deadline = new FiniteDuration(2, TimeUnit.MINUTES).fromNow();
Configuration flinkConfiguration = new Configuration();
// two distinct session IDs: one for the first leadership term, one for the second
UUID leaderSessionID = UUID.randomUUID();
UUID newLeaderSessionID = UUID.randomUUID();
int slots = 2;
// declared outside the try block so that the finally block can stop whichever actors were started
ActorRef archive = null;
ActorRef jobManager = null;
ActorRef taskManager = null;
flinkConfiguration.setString(HighAvailabilityOptions.HA_MODE, "zookeeper");
flinkConfiguration.setString(HighAvailabilityOptions.HA_STORAGE_PATH, temporaryFolder.newFolder().toString());
flinkConfiguration.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, slots);
try {
Scheduler scheduler = new Scheduler(TestingUtils.defaultExecutionContext());
// test doubles for the job graph store, the checkpoint store and the leader services
MySubmittedJobGraphStore mySubmittedJobGraphStore = new MySubmittedJobGraphStore();
MyCheckpointStore checkpointStore = new MyCheckpointStore();
CheckpointIDCounter checkpointCounter = new StandaloneCheckpointIDCounter();
CheckpointRecoveryFactory checkpointStateFactory = new MyCheckpointRecoveryFactory(checkpointStore, checkpointCounter);
TestingLeaderElectionService myLeaderElectionService = new TestingLeaderElectionService();
TestingLeaderRetrievalService myLeaderRetrievalService = new TestingLeaderRetrievalService();
InstanceManager instanceManager = new InstanceManager();
instanceManager.addInstanceListener(scheduler);
archive = system.actorOf(Props.create(MemoryArchivist.class, 10));
// NOTE(review): the BlobServer created inline here is never referenced again in this method,
// so it is not shut down by the finally block - confirm whether that is intended
Props jobManagerProps = Props.create(TestingJobManager.class, flinkConfiguration, TestingUtils.defaultExecutor(), TestingUtils.defaultExecutor(), instanceManager, scheduler, new BlobLibraryCacheManager(new BlobServer(flinkConfiguration), 3600000), archive, new FixedDelayRestartStrategy.FixedDelayRestartStrategyFactory(Int.MaxValue(), 100), timeout, myLeaderElectionService, mySubmittedJobGraphStore, checkpointStateFactory, jobRecoveryTimeout, Option.apply(null));
jobManager = system.actorOf(jobManagerProps);
// gateway bound to the first leader session ID; it is reused for all JobManager asks below
ActorGateway gateway = new AkkaActorGateway(jobManager, leaderSessionID);
taskManager = TaskManager.startTaskManagerComponentsAndActor(flinkConfiguration, ResourceID.generate(), system, "localhost", Option.apply("taskmanager"), Option.apply((LeaderRetrievalService) myLeaderRetrievalService), true, TestingTaskManager.class);
ActorGateway tmGateway = new AkkaActorGateway(taskManager, leaderSessionID);
// block until the TaskManager actor answers, i.e. it has been started
Future<Object> tmAlive = tmGateway.ask(TestingMessages.getAlive(), deadline.timeLeft());
Await.ready(tmAlive, deadline.timeLeft());
// build a job consisting of one vertex that uses all slots
JobVertex sourceJobVertex = new JobVertex("Source");
sourceJobVertex.setInvokableClass(BlockingStatefulInvokable.class);
sourceJobVertex.setParallelism(slots);
JobGraph jobGraph = new JobGraph("TestingJob", sourceJobVertex);
// the single vertex is passed for all three vertex lists of the snapshot settings
List<JobVertexID> vertexId = Collections.singletonList(sourceJobVertex.getID());
jobGraph.setSnapshotSettings(new JobSnapshottingSettings(vertexId, vertexId, vertexId, 100, 10 * 60 * 1000, 0, 1, ExternalizedCheckpointSettings.none(), null, true));
BlockingStatefulInvokable.initializeStaticHelpers(slots);
// both notifications are requested before leadership is granted
// (presumably so that neither event can be missed - confirm)
Future<Object> isLeader = gateway.ask(TestingJobManagerMessages.getNotifyWhenLeader(), deadline.timeLeft());
Future<Object> isConnectedToJobManager = tmGateway.ask(new TestingTaskManagerMessages.NotifyWhenRegisteredAtJobManager(jobManager), deadline.timeLeft());
// tell the JobManager that it is the leader
myLeaderElectionService.isLeader(leaderSessionID);
// tell the TaskManager who the leader is
myLeaderRetrievalService.notifyListener(gateway.path(), leaderSessionID);
Await.ready(isLeader, deadline.timeLeft());
Await.ready(isConnectedToJobManager, deadline.timeLeft());
// submit blocking job
Future<Object> jobSubmitted = gateway.ask(new JobManagerMessages.SubmitJob(jobGraph, ListeningBehaviour.DETACHED), deadline.timeLeft());
Await.ready(jobSubmitted, deadline.timeLeft());
// Wait for some checkpoints to complete
BlockingStatefulInvokable.awaitCompletedCheckpoints();
// request the removal notification before revoking leadership
Future<Object> jobRemoved = gateway.ask(new TestingJobManagerMessages.NotifyWhenJobRemoved(jobGraph.getJobID()), deadline.timeLeft());
// Revoke leadership
myLeaderElectionService.notLeader();
// check that the job gets removed from the JobManager
Await.ready(jobRemoved, deadline.timeLeft());
// but stays in the submitted job graph store
assertTrue(mySubmittedJobGraphStore.contains(jobGraph.getJobID()));
// request the RUNNING notification before leadership is granted again
Future<Object> jobRunning = gateway.ask(new TestingJobManagerMessages.NotifyWhenJobStatus(jobGraph.getJobID(), JobStatus.RUNNING), deadline.timeLeft());
// Make the JobManager the leader again, this time under the new session ID
myLeaderElectionService.isLeader(newLeaderSessionID);
// tell the TaskManager about it
myLeaderRetrievalService.notifyListener(gateway.path(), newLeaderSessionID);
// wait until the job is recovered and reaches state RUNNING
Await.ready(jobRunning, deadline.timeLeft());
Future<Object> jobFinished = gateway.ask(new TestingJobManagerMessages.NotifyWhenJobRemoved(jobGraph.getJobID()), deadline.timeLeft());
// release the blocked tasks
// (BlockingStatefulInvokable presumably builds on BlockingInvokable - confirm)
BlockingInvokable.unblock();
// wait until the job has finished
Await.ready(jobFinished, deadline.timeLeft());
// check that the job has been removed from the submitted job graph store
assertFalse(mySubmittedJobGraphStore.contains(jobGraph.getJobID()));
// Check that state has been recovered
long[] recoveredStates = BlockingStatefulInvokable.getRecoveredStates();
for (long state : recoveredStates) {
// every recovered value must be at least the number of checkpoints awaited above
boolean isExpected = state >= BlockingStatefulInvokable.NUM_CHECKPOINTS_TO_COMPLETE;
assertTrue("Did not recover checkpoint state correctly, expecting >= " + BlockingStatefulInvokable.NUM_CHECKPOINTS_TO_COMPLETE + ", but state was " + state, isExpected);
}
} finally {
// stop every actor that was started, regardless of how far the test got
if (archive != null) {
archive.tell(PoisonPill.getInstance(), ActorRef.noSender());
}
if (jobManager != null) {
jobManager.tell(PoisonPill.getInstance(), ActorRef.noSender());
}
if (taskManager != null) {
taskManager.tell(PoisonPill.getInstance(), ActorRef.noSender());
}
}
}
use of org.apache.flink.runtime.leaderelection.TestingLeaderElectionService in project flink by apache.
the class SlotProtocolTest method testSlotAvailableRequest.
/**
 * Tests whether
 * 1) a SlotRequest is routed to the SlotManager
 * 2) a SlotRequest is confirmed
 * 3) a SlotRequest leads to an allocation of a registered slot
 * 4) a SlotRequest is routed to the TaskExecutor
 *
 * <p>The test wires a StandaloneResourceManager (wrapped in a Mockito spy) to a mocked
 * JobMasterGateway and a mocked TaskExecutorGateway, grants the ResourceManager leadership,
 * registers the JobManager, registers one slot and then issues a single slot request.
 *
 * @throws Exception if setting up or driving the ResourceManager fails
 */
@Test
public void testSlotAvailableRequest() throws Exception {
    final String rmAddress = "/rm1";
    final String jmAddress = "/jm1";
    final String tmAddress = "/tm1";
    final JobID jobID = new JobID();

    testRpcService.registerGateway(jmAddress, mock(JobMasterGateway.class));

    final TestingHighAvailabilityServices testingHaServices = new TestingHighAvailabilityServices();
    final UUID rmLeaderID = UUID.randomUUID();
    final UUID jmLeaderID = UUID.randomUUID();
    TestingLeaderElectionService rmLeaderElectionService = configureHA(testingHaServices, jobID, rmAddress, rmLeaderID, jmAddress, jmLeaderID);

    // The mocked TaskExecutor answers every slot request with a future that is never completed here.
    TaskExecutorGateway taskExecutorGateway = mock(TaskExecutorGateway.class);
    Mockito.when(taskExecutorGateway.requestSlot(any(SlotID.class), any(JobID.class), any(AllocationID.class), any(String.class), any(UUID.class), any(Time.class))).thenReturn(new FlinkCompletableFuture<TMSlotRequestReply>());
    testRpcService.registerGateway(tmAddress, taskExecutorGateway);

    ResourceManagerConfiguration resourceManagerConfiguration = new ResourceManagerConfiguration(Time.seconds(5L), Time.seconds(5L), Time.minutes(5L));
    JobLeaderIdService jobLeaderIdService = new JobLeaderIdService(testingHaServices, testRpcService.getScheduledExecutor(), resourceManagerConfiguration.getJobTimeout());
    TestingSlotManagerFactory slotManagerFactory = new TestingSlotManagerFactory();
    ResourceManager<ResourceID> resourceManager = Mockito.spy(new StandaloneResourceManager(testRpcService, resourceManagerConfiguration, testingHaServices, slotManagerFactory, mock(MetricRegistry.class), jobLeaderIdService, mock(FatalErrorHandler.class)));
    resourceManager.start();
    rmLeaderElectionService.isLeader(rmLeaderID);

    // NOTE(review): fixed sleep, presumably to let the leadership grant take effect before
    // registering; kept as-is because no synchronization hook is visible from this method.
    Thread.sleep(1000);

    Future<RegistrationResponse> registrationFuture = resourceManager.registerJobManager(rmLeaderID, jmLeaderID, jmAddress, jobID);
    try {
        registrationFuture.get(5L, TimeUnit.SECONDS);
    } catch (Exception e) {
        // Fail with the same message as before, but keep the underlying cause
        // (timeout, execution failure, interruption) in the stack trace.
        throw new AssertionError("JobManager registration Future didn't become ready.", e);
    }

    final SlotManager slotManager = slotManagerFactory.slotManager;
    final ResourceID resourceID = ResourceID.generate();
    final AllocationID allocationID = new AllocationID();
    final ResourceProfile resourceProfile = new ResourceProfile(1.0, 100);
    final SlotID slotID = new SlotID(resourceID, 0);
    final SlotStatus slotStatus = new SlotStatus(slotID, resourceProfile);
    final SlotReport slotReport = new SlotReport(Collections.singletonList(slotStatus));

    // register slot at SlotManager
    slotManager.registerTaskExecutor(resourceID, new TaskExecutorRegistration(taskExecutorGateway), slotReport);

    SlotRequest slotRequest = new SlotRequest(jobID, allocationID, resourceProfile);
    RMSlotRequestReply slotRequestReply = resourceManager.requestSlot(jmLeaderID, rmLeaderID, slotRequest);

    // 1) a SlotRequest is routed to the SlotManager
    verify(slotManager).requestSlot(slotRequest);

    // 2) a SlotRequest is confirmed (expected value first, so a failure message reads correctly)
    Assert.assertEquals(allocationID, slotRequestReply.getAllocationID());

    // 3) a SlotRequest leads to an allocation of a registered slot
    Assert.assertTrue(slotManager.isAllocated(slotID));
    Assert.assertTrue(slotManager.isAllocated(allocationID));

    // 4) a SlotRequest is routed to the TaskExecutor
    verify(taskExecutorGateway, timeout(5000)).requestSlot(eq(slotID), eq(jobID), eq(allocationID), any(String.class), any(UUID.class), any(Time.class));
}
use of org.apache.flink.runtime.leaderelection.TestingLeaderElectionService in project flink by apache.
the class SlotProtocolTest method configureHA.
/**
 * Installs testing leader election and leader retrieval services for the ResourceManager
 * and for the JobMaster of the given job on the supplied high-availability services.
 *
 * @param testingHA the high-availability services to configure
 * @param jobID the job whose JobMaster services are registered
 * @param rmAddress address handed to the ResourceManager leader retrieval service
 * @param rmID leader ID handed to the ResourceManager leader retrieval service
 * @param jmAddress address handed to the JobMaster leader retrieval service
 * @param jmID leader ID handed to the JobMaster leader retrieval service
 * @return the leader election service that was registered for the ResourceManager
 */
private static TestingLeaderElectionService configureHA(TestingHighAvailabilityServices testingHA, JobID jobID, String rmAddress, UUID rmID, String jmAddress, UUID jmID) {
    // ResourceManager: election service is kept, because the caller gets it back
    final TestingLeaderElectionService resourceManagerElection = new TestingLeaderElectionService();
    testingHA.setResourceManagerLeaderElectionService(resourceManagerElection);
    testingHA.setResourceManagerLeaderRetriever(new TestingLeaderRetrievalService(rmAddress, rmID));

    // JobMaster: both services are registered per job and not needed afterwards
    testingHA.setJobMasterLeaderElectionService(jobID, new TestingLeaderElectionService());
    testingHA.setJobMasterLeaderRetriever(jobID, new TestingLeaderRetrievalService(jmAddress, jmID));

    return resourceManagerElection;
}
use of org.apache.flink.runtime.leaderelection.TestingLeaderElectionService in project flink by apache.
the class ResourceManagerJobMasterTest method testRegisterJobMasterWithUnmatchedLeaderSessionId1.
/**
 * Tests that a job master registration carrying a leader session ID that does not match
 * the one granted to the resource manager is answered with a
 * {@link RegistrationResponse.Decline}.
 */
@Test
public void testRegisterJobMasterWithUnmatchedLeaderSessionId1() throws Exception {
    final String jmAddress = "/jobMasterAddress1";
    final JobID jobID = mockJobMaster(jmAddress);
    final UUID jmLeaderID = UUID.randomUUID();

    final TestingLeaderElectionService rmElectionService = new TestingLeaderElectionService();
    final TestingLeaderRetrievalService jmRetrievalService = new TestingLeaderRetrievalService(jmAddress, jmLeaderID);
    final TestingFatalErrorHandler fatalErrorHandler = new TestingFatalErrorHandler();

    final ResourceManager resourceManager = createAndStartResourceManager(rmElectionService, jobID, jmRetrievalService, fatalErrorHandler);

    // Leadership must be granted before registering; the granted session ID itself is not needed here.
    grantResourceManagerLeadership(rmElectionService);

    // Register with a freshly generated session ID, which differs from the granted one.
    final UUID mismatchedSessionId = UUID.randomUUID();
    Future<RegistrationResponse> registration = resourceManager.registerJobManager(mismatchedSessionId, jmLeaderID, jmAddress, jobID);
    final RegistrationResponse response = registration.get(5, TimeUnit.SECONDS);
    assertTrue(response instanceof RegistrationResponse.Decline);

    // Surface any error that was reported to the fatal error handler meanwhile.
    if (fatalErrorHandler.hasExceptionOccurred()) {
        fatalErrorHandler.rethrowError();
    }
}
use of org.apache.flink.runtime.leaderelection.TestingLeaderElectionService in project flink by apache.
the class DispatcherTest method testJobStatusIsShownDuringTermination.
/**
 * Submits two jobs, closes the dispatcher while the termination of one of them is held
 * back, and checks that the status of the job identified by {@code jobId} can still be
 * queried and eventually becomes {@link JobStatus#SUSPENDED}.
 */
@Test
public void testJobStatusIsShownDuringTermination() throws Exception {
    final JobID blockedJobId = new JobID();
    haServices.setJobMasterLeaderElectionService(blockedJobId, new TestingLeaderElectionService());

    final JobManagerRunnerWithBlockingTerminationFactory blockingRunnerFactory = new JobManagerRunnerWithBlockingTerminationFactory(blockedJobId);
    dispatcher = createAndStartDispatcher(heartbeatServices, haServices, blockingRunnerFactory);
    final DispatcherGateway gateway = dispatcher.getSelfGateway(DispatcherGateway.class);

    final JobGraph graphOfBlockedJob = JobGraphTestUtils.singleNoOpJobGraph();
    graphOfBlockedJob.setJobID(blockedJobId);

    // Two submissions: the regular job graph first, then the one whose termination stays blocked
    gateway.submitJob(jobGraph, TIMEOUT).get();
    gateway.submitJob(graphOfBlockedJob, TIMEOUT).get();

    // Start shutting the dispatcher down without waiting for it to finish
    final CompletableFuture<Void> dispatcherTermination = dispatcher.closeAsync();

    try {
        // Poll until the job reports SUSPENDED
        CommonTestUtils.waitUntilCondition(
                () -> gateway.requestExecutionGraphInfo(jobId, TIMEOUT).get().getArchivedExecutionGraph().getState() == JobStatus.SUSPENDED,
                Deadline.fromNow(TimeUtils.toDuration(TIMEOUT)),
                5L);
    } finally {
        // Release the blocked job so that the dispatcher can complete its shutdown
        blockingRunnerFactory.unblockTermination();
        dispatcherTermination.get();
    }
}
Aggregations