Search in sources :

Example 1 with TestingLeaderElectionService

use of org.apache.flink.runtime.leaderelection.TestingLeaderElectionService in project flink by apache.

the class JobManagerHARecoveryTest method testJobRecoveryWhenLosingLeadership.

/**
 * Tests that the persisted job is not removed from the SubmittedJobGraphStore if the JobManager
 * loses its leadership. Furthermore, it tests that the job manager can recover the job from
 * the SubmittedJobGraphStore and that checkpoint state is recovered as well.
 *
 * <p>Steps performed by the test:
 * <ol>
 *   <li>start an archivist, a {@code TestingJobManager} and a {@code TestingTaskManager};</li>
 *   <li>grant leadership with {@code leaderSessionID} and wait for the TaskManager to register;</li>
 *   <li>submit a blocking, stateful job and wait for completed checkpoints;</li>
 *   <li>revoke leadership and assert that the job graph is still in the store;</li>
 *   <li>grant leadership again with {@code newLeaderSessionID} and wait for the job to be RUNNING;</li>
 *   <li>unblock the job, wait for its removal and assert that the store no longer contains it;</li>
 *   <li>assert that every recovered state value is at least
 *       {@code BlockingStatefulInvokable.NUM_CHECKPOINTS_TO_COMPLETE}.</li>
 * </ol>
 *
 * @throws Exception if any of the awaited futures fails or times out, or if setup fails
 */
@Test
public void testJobRecoveryWhenLosingLeadership() throws Exception {
    // Timeouts handed to the JobManager; the overall test budget is tracked by 'deadline'.
    FiniteDuration timeout = new FiniteDuration(30, TimeUnit.SECONDS);
    FiniteDuration jobRecoveryTimeout = new FiniteDuration(3, TimeUnit.SECONDS);
    Deadline deadline = new FiniteDuration(2, TimeUnit.MINUTES).fromNow();
    Configuration flinkConfiguration = new Configuration();
    // Two distinct session IDs: one for the first leadership term, one for the term after re-election.
    UUID leaderSessionID = UUID.randomUUID();
    UUID newLeaderSessionID = UUID.randomUUID();
    int slots = 2;
    // Actor references are kept outside the try block so the finally block can stop them.
    ActorRef archive = null;
    ActorRef jobManager = null;
    ActorRef taskManager = null;
    flinkConfiguration.setString(HighAvailabilityOptions.HA_MODE, "zookeeper");
    flinkConfiguration.setString(HighAvailabilityOptions.HA_STORAGE_PATH, temporaryFolder.newFolder().toString());
    flinkConfiguration.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, slots);
    try {
        Scheduler scheduler = new Scheduler(TestingUtils.defaultExecutionContext());
        // Test doubles for the HA services (job graph store, checkpoint store, leader election/retrieval).
        MySubmittedJobGraphStore mySubmittedJobGraphStore = new MySubmittedJobGraphStore();
        MyCheckpointStore checkpointStore = new MyCheckpointStore();
        CheckpointIDCounter checkpointCounter = new StandaloneCheckpointIDCounter();
        CheckpointRecoveryFactory checkpointStateFactory = new MyCheckpointRecoveryFactory(checkpointStore, checkpointCounter);
        TestingLeaderElectionService myLeaderElectionService = new TestingLeaderElectionService();
        TestingLeaderRetrievalService myLeaderRetrievalService = new TestingLeaderRetrievalService();
        InstanceManager instanceManager = new InstanceManager();
        instanceManager.addInstanceListener(scheduler);
        archive = system.actorOf(Props.create(MemoryArchivist.class, 10));
        // NOTE(review): the BlobServer created inline here is never referenced again in this method,
        // so it is not shut down by the test — confirm whether it is cleaned up elsewhere.
        Props jobManagerProps = Props.create(TestingJobManager.class, flinkConfiguration, TestingUtils.defaultExecutor(), TestingUtils.defaultExecutor(), instanceManager, scheduler, new BlobLibraryCacheManager(new BlobServer(flinkConfiguration), 3600000), archive, new FixedDelayRestartStrategy.FixedDelayRestartStrategyFactory(Int.MaxValue(), 100), timeout, myLeaderElectionService, mySubmittedJobGraphStore, checkpointStateFactory, jobRecoveryTimeout, Option.apply(null));
        jobManager = system.actorOf(jobManagerProps);
        ActorGateway gateway = new AkkaActorGateway(jobManager, leaderSessionID);
        taskManager = TaskManager.startTaskManagerComponentsAndActor(flinkConfiguration, ResourceID.generate(), system, "localhost", Option.apply("taskmanager"), Option.apply((LeaderRetrievalService) myLeaderRetrievalService), true, TestingTaskManager.class);
        ActorGateway tmGateway = new AkkaActorGateway(taskManager, leaderSessionID);
        // Make sure the TaskManager actor is up before continuing.
        Future<Object> tmAlive = tmGateway.ask(TestingMessages.getAlive(), deadline.timeLeft());
        Await.ready(tmAlive, deadline.timeLeft());
        // Build a single-vertex job that uses every slot with a blocking, stateful invokable.
        JobVertex sourceJobVertex = new JobVertex("Source");
        sourceJobVertex.setInvokableClass(BlockingStatefulInvokable.class);
        sourceJobVertex.setParallelism(slots);
        JobGraph jobGraph = new JobGraph("TestingJob", sourceJobVertex);
        List<JobVertexID> vertexId = Collections.singletonList(sourceJobVertex.getID());
        // The single vertex is passed for all three vertex lists of the snapshot settings.
        // NOTE(review): 100 and 10 * 60 * 1000 are presumably the checkpoint interval and timeout
        // in milliseconds — confirm against the JobSnapshottingSettings constructor.
        jobGraph.setSnapshotSettings(new JobSnapshottingSettings(vertexId, vertexId, vertexId, 100, 10 * 60 * 1000, 0, 1, ExternalizedCheckpointSettings.none(), null, true));
        BlockingStatefulInvokable.initializeStaticHelpers(slots);
        // The notification requests are sent before leadership is granted below;
        // presumably so that the notifications cannot be missed.
        Future<Object> isLeader = gateway.ask(TestingJobManagerMessages.getNotifyWhenLeader(), deadline.timeLeft());
        Future<Object> isConnectedToJobManager = tmGateway.ask(new TestingTaskManagerMessages.NotifyWhenRegisteredAtJobManager(jobManager), deadline.timeLeft());
        // tell the JobManager that it is the leader
        myLeaderElectionService.isLeader(leaderSessionID);
        // tell the TaskManager who the leader is
        myLeaderRetrievalService.notifyListener(gateway.path(), leaderSessionID);
        Await.ready(isLeader, deadline.timeLeft());
        Await.ready(isConnectedToJobManager, deadline.timeLeft());
        // submit blocking job
        Future<Object> jobSubmitted = gateway.ask(new JobManagerMessages.SubmitJob(jobGraph, ListeningBehaviour.DETACHED), deadline.timeLeft());
        Await.ready(jobSubmitted, deadline.timeLeft());
        // Wait for some checkpoints to complete
        BlockingStatefulInvokable.awaitCompletedCheckpoints();
        // Request the removal notification before leadership is revoked.
        Future<Object> jobRemoved = gateway.ask(new TestingJobManagerMessages.NotifyWhenJobRemoved(jobGraph.getJobID()), deadline.timeLeft());
        // Revoke leadership
        myLeaderElectionService.notLeader();
        // check that the job gets removed from the JobManager
        Await.ready(jobRemoved, deadline.timeLeft());
        // but stays in the submitted job graph store
        assertTrue(mySubmittedJobGraphStore.contains(jobGraph.getJobID()));
        // Request the RUNNING notification before leadership is granted again.
        Future<Object> jobRunning = gateway.ask(new TestingJobManagerMessages.NotifyWhenJobStatus(jobGraph.getJobID(), JobStatus.RUNNING), deadline.timeLeft());
        // Make the JobManager the leader again, this time with a new session ID
        myLeaderElectionService.isLeader(newLeaderSessionID);
        // tell the TaskManager about it
        myLeaderRetrievalService.notifyListener(gateway.path(), newLeaderSessionID);
        // wait until the job is recovered and reaches state RUNNING
        Await.ready(jobRunning, deadline.timeLeft());
        Future<Object> jobFinished = gateway.ask(new TestingJobManagerMessages.NotifyWhenJobRemoved(jobGraph.getJobID()), deadline.timeLeft());
        // Release the blocking tasks so that the recovered job can run to completion.
        BlockingInvokable.unblock();
        // wait until the job has finished
        Await.ready(jobFinished, deadline.timeLeft());
        // check that the job has been removed from the submitted job graph store
        assertFalse(mySubmittedJobGraphStore.contains(jobGraph.getJobID()));
        // Check that state has been recovered
        long[] recoveredStates = BlockingStatefulInvokable.getRecoveredStates();
        for (long state : recoveredStates) {
            // Every recovered state value must be at least NUM_CHECKPOINTS_TO_COMPLETE.
            boolean isExpected = state >= BlockingStatefulInvokable.NUM_CHECKPOINTS_TO_COMPLETE;
            assertTrue("Did not recover checkpoint state correctly, expecting >= " + BlockingStatefulInvokable.NUM_CHECKPOINTS_TO_COMPLETE + ", but state was " + state, isExpected);
        }
    } finally {
        // Stop every actor that was started, regardless of the test outcome.
        if (archive != null) {
            archive.tell(PoisonPill.getInstance(), ActorRef.noSender());
        }
        if (jobManager != null) {
            jobManager.tell(PoisonPill.getInstance(), ActorRef.noSender());
        }
        if (taskManager != null) {
            taskManager.tell(PoisonPill.getInstance(), ActorRef.noSender());
        }
    }
}
Also used : AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) BlobLibraryCacheManager(org.apache.flink.runtime.execution.librarycache.BlobLibraryCacheManager) Configuration(org.apache.flink.configuration.Configuration) TestingLeaderRetrievalService(org.apache.flink.runtime.leaderelection.TestingLeaderRetrievalService) FixedDelayRestartStrategy(org.apache.flink.runtime.executiongraph.restart.FixedDelayRestartStrategy) ActorRef(akka.actor.ActorRef) Scheduler(org.apache.flink.runtime.jobmanager.scheduler.Scheduler) InstanceManager(org.apache.flink.runtime.instance.InstanceManager) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) Props(akka.actor.Props) TestingJobManagerMessages(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) TestingTaskManager(org.apache.flink.runtime.testingUtils.TestingTaskManager) BlobServer(org.apache.flink.runtime.blob.BlobServer) CheckpointIDCounter(org.apache.flink.runtime.checkpoint.CheckpointIDCounter) StandaloneCheckpointIDCounter(org.apache.flink.runtime.checkpoint.StandaloneCheckpointIDCounter) UUID(java.util.UUID) TestingTaskManagerMessages(org.apache.flink.runtime.testingUtils.TestingTaskManagerMessages) TestingLeaderElectionService(org.apache.flink.runtime.leaderelection.TestingLeaderElectionService) Deadline(scala.concurrent.duration.Deadline) JobSnapshottingSettings(org.apache.flink.runtime.jobgraph.tasks.JobSnapshottingSettings) JobManagerMessages(org.apache.flink.runtime.messages.JobManagerMessages) TestingJobManagerMessages(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages) FiniteDuration(scala.concurrent.duration.FiniteDuration) CheckpointRecoveryFactory(org.apache.flink.runtime.checkpoint.CheckpointRecoveryFactory) CompletedCheckpoint(org.apache.flink.runtime.checkpoint.CompletedCheckpoint) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) JobVertex(org.apache.flink.runtime.jobgraph.JobVertex) StandaloneCheckpointIDCounter(org.apache.flink.runtime.checkpoint.StandaloneCheckpointIDCounter) Test(org.junit.Test)

Example 2 with TestingLeaderElectionService

use of org.apache.flink.runtime.leaderelection.TestingLeaderElectionService in project flink by apache.

the class SlotProtocolTest method testSlotAvailableRequest.

/**
 * Tests whether
 * 1) a SlotRequest is routed to the SlotManager
 * 2) a SlotRequest is confirmed
 * 3) a SlotRequest leads to an allocation of a registered slot
 * 4) a SlotRequest is routed to the TaskExecutor
 *
 * <p>The test starts a spied {@code StandaloneResourceManager}, grants it leadership, registers a
 * mocked JobMaster and a mocked TaskExecutor with one free slot, and then issues a slot request.
 *
 * @throws Exception if the ResourceManager cannot be started or the test thread is interrupted
 */
@Test
public void testSlotAvailableRequest() throws Exception {
    final String rmAddress = "/rm1";
    final String jmAddress = "/jm1";
    final String tmAddress = "/tm1";
    final JobID jobID = new JobID();
    testRpcService.registerGateway(jmAddress, mock(JobMasterGateway.class));
    final TestingHighAvailabilityServices testingHaServices = new TestingHighAvailabilityServices();
    final UUID rmLeaderID = UUID.randomUUID();
    final UUID jmLeaderID = UUID.randomUUID();
    TestingLeaderElectionService rmLeaderElectionService = configureHA(testingHaServices, jobID, rmAddress, rmLeaderID, jmAddress, jmLeaderID);
    // The mocked TaskExecutor answers every slot request with a future that is never completed here.
    TaskExecutorGateway taskExecutorGateway = mock(TaskExecutorGateway.class);
    Mockito.when(taskExecutorGateway.requestSlot(any(SlotID.class), any(JobID.class), any(AllocationID.class), any(String.class), any(UUID.class), any(Time.class))).thenReturn(new FlinkCompletableFuture<TMSlotRequestReply>());
    testRpcService.registerGateway(tmAddress, taskExecutorGateway);
    ResourceManagerConfiguration resourceManagerConfiguration = new ResourceManagerConfiguration(Time.seconds(5L), Time.seconds(5L), Time.minutes(5L));
    JobLeaderIdService jobLeaderIdService = new JobLeaderIdService(testingHaServices, testRpcService.getScheduledExecutor(), resourceManagerConfiguration.getJobTimeout());
    TestingSlotManagerFactory slotManagerFactory = new TestingSlotManagerFactory();
    ResourceManager<ResourceID> resourceManager = Mockito.spy(new StandaloneResourceManager(testRpcService, resourceManagerConfiguration, testingHaServices, slotManagerFactory, mock(MetricRegistry.class), jobLeaderIdService, mock(FatalErrorHandler.class)));
    resourceManager.start();
    rmLeaderElectionService.isLeader(rmLeaderID);
    // NOTE(review): fixed sleep, presumably to let the leadership grant take effect before the
    // registration below. It is timing dependent; replace it with a deterministic wait if the
    // leader election test service offers one.
    Thread.sleep(1000);
    Future<RegistrationResponse> registrationFuture = resourceManager.registerJobManager(rmLeaderID, jmLeaderID, jmAddress, jobID);
    try {
        registrationFuture.get(5L, TimeUnit.SECONDS);
    } catch (Exception e) {
        // Fail with the same message as before, but keep the cause so that the real reason
        // (timeout, execution failure, interruption) shows up in the test report.
        throw new AssertionError("JobManager registration Future didn't become ready.", e);
    }
    final SlotManager slotManager = slotManagerFactory.slotManager;
    final ResourceID resourceID = ResourceID.generate();
    final AllocationID allocationID = new AllocationID();
    final ResourceProfile resourceProfile = new ResourceProfile(1.0, 100);
    final SlotID slotID = new SlotID(resourceID, 0);
    final SlotStatus slotStatus = new SlotStatus(slotID, resourceProfile);
    final SlotReport slotReport = new SlotReport(Collections.singletonList(slotStatus));
    // register slot at SlotManager
    slotManager.registerTaskExecutor(resourceID, new TaskExecutorRegistration(taskExecutorGateway), slotReport);
    SlotRequest slotRequest = new SlotRequest(jobID, allocationID, resourceProfile);
    RMSlotRequestReply slotRequestReply = resourceManager.requestSlot(jmLeaderID, rmLeaderID, slotRequest);
    // 1) a SlotRequest is routed to the SlotManager
    verify(slotManager).requestSlot(slotRequest);
    // 2) a SlotRequest is confirmed (JUnit expects the arguments as: expected, actual)
    Assert.assertEquals(allocationID, slotRequestReply.getAllocationID());
    // 3) a SlotRequest leads to an allocation of a registered slot
    Assert.assertTrue(slotManager.isAllocated(slotID));
    Assert.assertTrue(slotManager.isAllocated(allocationID));
    // 4) a SlotRequest is routed to the TaskExecutor
    verify(taskExecutorGateway, timeout(5000)).requestSlot(eq(slotID), eq(jobID), eq(allocationID), any(String.class), any(UUID.class), any(Time.class));
}
Also used : TMSlotRequestReply(org.apache.flink.runtime.resourcemanager.messages.taskexecutor.TMSlotRequestReply) TaskExecutorRegistration(org.apache.flink.runtime.resourcemanager.registration.TaskExecutorRegistration) JobLeaderIdService(org.apache.flink.runtime.resourcemanager.JobLeaderIdService) Time(org.apache.flink.api.common.time.Time) StandaloneResourceManager(org.apache.flink.runtime.resourcemanager.StandaloneResourceManager) JobMasterGateway(org.apache.flink.runtime.jobmaster.JobMasterGateway) SlotRequest(org.apache.flink.runtime.resourcemanager.SlotRequest) TestingHighAvailabilityServices(org.apache.flink.runtime.highavailability.TestingHighAvailabilityServices) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) UUID(java.util.UUID) RegistrationResponse(org.apache.flink.runtime.registration.RegistrationResponse) ResourceProfile(org.apache.flink.runtime.clusterframework.types.ResourceProfile) TestingLeaderElectionService(org.apache.flink.runtime.leaderelection.TestingLeaderElectionService) SlotStatus(org.apache.flink.runtime.taskexecutor.SlotStatus) AllocationID(org.apache.flink.runtime.clusterframework.types.AllocationID) SlotReport(org.apache.flink.runtime.taskexecutor.SlotReport) RMSlotRequestReply(org.apache.flink.runtime.resourcemanager.messages.jobmanager.RMSlotRequestReply) TaskExecutorGateway(org.apache.flink.runtime.taskexecutor.TaskExecutorGateway) ResourceManagerConfiguration(org.apache.flink.runtime.resourcemanager.ResourceManagerConfiguration) SlotID(org.apache.flink.runtime.clusterframework.types.SlotID) TestingSlotManager(org.apache.flink.runtime.resourcemanager.TestingSlotManager) JobID(org.apache.flink.api.common.JobID) Test(org.junit.Test)

Example 3 with TestingLeaderElectionService

use of org.apache.flink.runtime.leaderelection.TestingLeaderElectionService in project flink by apache.

the class SlotProtocolTest method configureHA.

/**
 * Registers leader election and leader retrieval test services for the ResourceManager and for
 * the JobMaster of the given job on the supplied high-availability services.
 *
 * @param testingHA HA services to configure
 * @param jobID job whose JobMaster election/retrieval services are registered
 * @param rmAddress address passed to the ResourceManager leader retrieval service
 * @param rmID leader ID passed to the ResourceManager leader retrieval service
 * @param jmAddress address passed to the JobMaster leader retrieval service
 * @param jmID leader ID passed to the JobMaster leader retrieval service
 * @return the leader election service that was registered for the ResourceManager
 */
private static TestingLeaderElectionService configureHA(TestingHighAvailabilityServices testingHA, JobID jobID, String rmAddress, UUID rmID, String jmAddress, UUID jmID) {
    // ResourceManager: only the election service is needed by the caller, so it is the one kept in a local.
    final TestingLeaderElectionService resourceManagerElection = new TestingLeaderElectionService();
    testingHA.setResourceManagerLeaderElectionService(resourceManagerElection);
    testingHA.setResourceManagerLeaderRetriever(new TestingLeaderRetrievalService(rmAddress, rmID));

    // JobMaster: election and retrieval services are registered per job.
    testingHA.setJobMasterLeaderElectionService(jobID, new TestingLeaderElectionService());
    testingHA.setJobMasterLeaderRetriever(jobID, new TestingLeaderRetrievalService(jmAddress, jmID));

    return resourceManagerElection;
}
Also used : TestingLeaderElectionService(org.apache.flink.runtime.leaderelection.TestingLeaderElectionService) TestingLeaderRetrievalService(org.apache.flink.runtime.leaderelection.TestingLeaderRetrievalService)

Example 4 with TestingLeaderElectionService

use of org.apache.flink.runtime.leaderelection.TestingLeaderElectionService in project flink by apache.

the class ResourceManagerJobMasterTest method testRegisterJobMasterWithUnmatchedLeaderSessionId1.

/**
 * Tests that a job master registration carrying a leader session ID other than the one granted
 * to the resource manager is answered with a {@code RegistrationResponse.Decline}.
 */
@Test
public void testRegisterJobMasterWithUnmatchedLeaderSessionId1() throws Exception {
    final String jmAddress = "/jobMasterAddress1";
    final JobID jobId = mockJobMaster(jmAddress);
    final TestingLeaderElectionService rmElectionService = new TestingLeaderElectionService();
    final UUID jobMasterLeaderId = UUID.randomUUID();
    final TestingLeaderRetrievalService jmRetrievalService = new TestingLeaderRetrievalService(jmAddress, jobMasterLeaderId);
    final TestingFatalErrorHandler fatalErrorHandler = new TestingFatalErrorHandler();
    final ResourceManager resourceManager = createAndStartResourceManager(rmElectionService, jobId, jmRetrievalService, fatalErrorHandler);

    // Grant leadership; the granted session ID itself is not needed by this test.
    grantResourceManagerLeadership(rmElectionService);

    // Register using a freshly generated session ID instead of the granted one.
    final UUID mismatchedSessionId = UUID.randomUUID();
    final Future<RegistrationResponse> registration = resourceManager.registerJobManager(mismatchedSessionId, jobMasterLeaderId, jmAddress, jobId);
    final RegistrationResponse response = registration.get(5, TimeUnit.SECONDS);
    assertTrue(response instanceof RegistrationResponse.Decline);

    // Surface any error that was reported to the fatal error handler during the test.
    if (fatalErrorHandler.hasExceptionOccurred()) {
        fatalErrorHandler.rethrowError();
    }
}
Also used : TestingFatalErrorHandler(org.apache.flink.runtime.util.TestingFatalErrorHandler) TestingLeaderElectionService(org.apache.flink.runtime.leaderelection.TestingLeaderElectionService) TestingLeaderRetrievalService(org.apache.flink.runtime.leaderelection.TestingLeaderRetrievalService) UUID(java.util.UUID) RegistrationResponse(org.apache.flink.runtime.registration.RegistrationResponse) JobID(org.apache.flink.api.common.JobID) Test(org.junit.Test)

Example 5 with TestingLeaderElectionService

use of org.apache.flink.runtime.leaderelection.TestingLeaderElectionService in project flink by apache.

the class DispatcherTest method testJobStatusIsShownDuringTermination.

/**
 * Submits two jobs, closes the dispatcher asynchronously and polls until the job identified by
 * the fixture's {@code jobId} reports {@code JobStatus.SUSPENDED}. The second job uses a runner
 * factory whose termination is unblocked only in the {@code finally} block.
 */
@Test
public void testJobStatusIsShownDuringTermination() throws Exception {
    final JobID blockedJobId = new JobID();
    haServices.setJobMasterLeaderElectionService(blockedJobId, new TestingLeaderElectionService());
    final JobManagerRunnerWithBlockingTerminationFactory blockingRunnerFactory = new JobManagerRunnerWithBlockingTerminationFactory(blockedJobId);
    dispatcher = createAndStartDispatcher(heartbeatServices, haServices, blockingRunnerFactory);
    final DispatcherGateway gateway = dispatcher.getSelfGateway(DispatcherGateway.class);

    final JobGraph jobGraphWithBlockedTermination = JobGraphTestUtils.singleNoOpJobGraph();
    jobGraphWithBlockedTermination.setJobID(blockedJobId);

    // Both submissions are awaited before termination is triggered.
    gateway.submitJob(jobGraph, TIMEOUT).get();
    gateway.submitJob(jobGraphWithBlockedTermination, TIMEOUT).get();

    // Start shutting the dispatcher down without waiting for it to finish.
    final CompletableFuture<Void> dispatcherTermination = dispatcher.closeAsync();

    try {
        // Poll the status of the fixture's job until it is SUSPENDED or the deadline expires.
        CommonTestUtils.waitUntilCondition(
            () -> JobStatus.SUSPENDED == gateway.requestExecutionGraphInfo(jobId, TIMEOUT).get().getArchivedExecutionGraph().getState(),
            Deadline.fromNow(TimeUtils.toDuration(TIMEOUT)),
            5L);
    } finally {
        // Release the blocked runner termination and wait for the dispatcher to finish closing.
        blockingRunnerFactory.unblockTermination();
        dispatcherTermination.get();
    }
}
Also used : JobStatus(org.apache.flink.api.common.JobStatus) TestingLeaderElectionService(org.apache.flink.runtime.leaderelection.TestingLeaderElectionService) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) JobID(org.apache.flink.api.common.JobID) Test(org.junit.Test)

Aggregations

TestingLeaderElectionService (org.apache.flink.runtime.leaderelection.TestingLeaderElectionService)25 Test (org.junit.Test)13 UUID (java.util.UUID)10 JobID (org.apache.flink.api.common.JobID)9 TestingFatalErrorHandler (org.apache.flink.runtime.util.TestingFatalErrorHandler)8 Before (org.junit.Before)8 Configuration (org.apache.flink.configuration.Configuration)6 CompletableFuture (java.util.concurrent.CompletableFuture)5 Time (org.apache.flink.api.common.time.Time)5 TestingHighAvailabilityServices (org.apache.flink.runtime.highavailability.TestingHighAvailabilityServices)5 JobGraph (org.apache.flink.runtime.jobgraph.JobGraph)5 RegistrationResponse (org.apache.flink.runtime.registration.RegistrationResponse)5 ResourceID (org.apache.flink.runtime.clusterframework.types.ResourceID)4 EmbeddedJobResultStore (org.apache.flink.runtime.highavailability.nonha.embedded.EmbeddedJobResultStore)4 JobMasterGateway (org.apache.flink.runtime.jobmaster.JobMasterGateway)4 OneShotLatch (org.apache.flink.core.testutils.OneShotLatch)3 AllocationID (org.apache.flink.runtime.clusterframework.types.AllocationID)3 ResourceProfile (org.apache.flink.runtime.clusterframework.types.ResourceProfile)3 JobVertex (org.apache.flink.runtime.jobgraph.JobVertex)3 JobGraphStore (org.apache.flink.runtime.jobmanager.JobGraphStore)3