Search in sources :

Example 21 with LeaderRetrievalService

use of org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService in project flink by apache.

the class YarnIntraNonHaMasterServicesTest method testClosingReportsToLeader.

/**
 * Checks that closing the HA services while a contender holds leadership is reported to that
 * contender through {@code handleError}.
 */
@Test
public void testClosingReportsToLeader() throws Exception {
    try (YarnHighAvailabilityServices haServices = new YarnIntraNonHaMasterServices(new Configuration(), hadoopConfig)) {
        // obtain both ends of the ResourceManager leader election / retrieval pair
        final LeaderElectionService rmElection = haServices.getResourceManagerLeaderElectionService();
        final LeaderRetrievalService rmRetrieval = haServices.getResourceManagerLeaderRetriever();

        final LeaderContender leaderCandidate = mockContender(rmElection);
        final LeaderRetrievalListener addressListener = mock(LeaderRetrievalListener.class);

        rmElection.start(leaderCandidate);
        rmRetrieval.start(addressListener);

        // the listener is notified exactly once as soon as the candidate holds leadership
        verify(addressListener, timeout(1000L).times(1)).notifyLeaderAddress(anyString(), any(UUID.class));

        // shutting the services down explicitly must surface as an error on the candidate
        haServices.close();
        verify(leaderCandidate, timeout(1000L).times(1)).handleError(any(Exception.class));
    }
}
Also used : Configuration(org.apache.flink.configuration.Configuration) LeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService) LeaderContender(org.apache.flink.runtime.leaderelection.LeaderContender) LeaderElectionService(org.apache.flink.runtime.leaderelection.LeaderElectionService) LeaderRetrievalListener(org.apache.flink.runtime.leaderretrieval.LeaderRetrievalListener) UUID(java.util.UUID) Test(org.junit.Test)

Example 22 with LeaderRetrievalService

use of org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService in project flink by apache.

the class JobLeaderService method removeJob.

/**
	 * Remove the given job from being monitored by the job leader service.
	 *
	 * <p>If no entry is registered for the job, this method does nothing. Otherwise both the
	 * leader retrieval service and the leader listener of the job are stopped. The listener is
	 * stopped even if stopping the retrieval service fails, because the entry has already been
	 * removed from the internal map and could not be stopped later.
	 *
	 * @param jobId identifying the job to remove from monitoring
	 * @throws Exception if an error occurred while stopping the leader retrieval service and listener;
	 *                   when both fail, the first failure is thrown and the second is attached as suppressed
	 */
public void removeJob(JobID jobId) throws Exception {
    Preconditions.checkState(JobLeaderService.State.STARTED == state, "The service is currently not running.");
    Tuple2<LeaderRetrievalService, JobLeaderService.JobManagerLeaderListener> entry = jobLeaderServices.remove(jobId);
    if (entry != null) {
        LOG.info("Remove job {} from job leader monitoring.", jobId);
        LeaderRetrievalService leaderRetrievalService = entry.f0;
        JobLeaderService.JobManagerLeaderListener jobManagerLeaderListener = entry.f1;
        Exception stopFailure = null;
        try {
            leaderRetrievalService.stop();
        } catch (Exception e) {
            // remember the failure, but still stop the listener below
            stopFailure = e;
        }
        try {
            jobManagerLeaderListener.stop();
        } catch (Exception e) {
            if (stopFailure == null) {
                stopFailure = e;
            } else {
                stopFailure.addSuppressed(e);
            }
        }
        if (stopFailure != null) {
            throw stopFailure;
        }
    }
}
Also used : LeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService)

Example 23 with LeaderRetrievalService

use of org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService in project flink by apache.

the class JobLeaderService method addJob.

/**
	 * Add the given job to be monitored. This means that the service tries to detect leaders for
	 * this job and then tries to establish a connection to it.
	 *
	 * <p>If the job is already being monitored, the previously registered leader retrieval
	 * service and listener are stopped after the new pair has been started and registered, so
	 * that they are not left running without any reference to them.
	 *
	 * @param jobId identifying the job to monitor
	 * @param defaultTargetAddress of the job leader (NOTE(review): not used by this method body;
	 *                             confirm whether it should be passed to the leader retriever)
	 * @throws Exception if an error occurs while starting the leader retrieval service, or while
	 *                   stopping a previously registered service or listener for the same job
	 */
public void addJob(final JobID jobId, final String defaultTargetAddress) throws Exception {
    Preconditions.checkState(JobLeaderService.State.STARTED == state, "The service is currently not running.");
    LOG.info("Add job {} for job leader monitoring.", jobId);
    final LeaderRetrievalService leaderRetrievalService = highAvailabilityServices.getJobManagerLeaderRetriever(jobId);
    JobLeaderService.JobManagerLeaderListener jobManagerLeaderListener = new JobManagerLeaderListener(jobId);
    leaderRetrievalService.start(jobManagerLeaderListener);
    // put() hands back the entry it replaces; without stopping it, the old service and listener
    // would keep running and could no longer be reached through removeJob()
    final Tuple2<LeaderRetrievalService, JobLeaderService.JobManagerLeaderListener> previousEntry = jobLeaderServices.put(jobId, Tuple2.of(leaderRetrievalService, jobManagerLeaderListener));
    if (previousEntry != null) {
        Exception stopFailure = null;
        try {
            previousEntry.f0.stop();
        } catch (Exception e) {
            // remember the failure, but still stop the old listener below
            stopFailure = e;
        }
        try {
            previousEntry.f1.stop();
        } catch (Exception e) {
            if (stopFailure == null) {
                stopFailure = e;
            } else {
                stopFailure.addSuppressed(e);
            }
        }
        if (stopFailure != null) {
            throw stopFailure;
        }
    }
}
Also used : LeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService)

Example 24 with LeaderRetrievalService

use of org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService in project flink by apache.

the class JobManagerHACheckpointRecoveryITCase method testCheckpointRecoveryFailure.

/**
	 * Tests that the JobManager logs failures during recovery properly.
	 *
	 * <p>Outline: two JobManager processes are started with a ZooKeeper HA configuration, a
	 * blocking job is submitted to the current leader, the file state backend directory is
	 * deleted and the leading process is destroyed. The test then polls the output of the
	 * other JobManager process until it contains the expected recovery failure messages or
	 * the test deadline expires.
	 *
	 * @see <a href="https://issues.apache.org/jira/browse/FLINK-3185">FLINK-3185</a>
	 */
@Test
@RetryOnFailure(times = 1)
public void testCheckpointRecoveryFailure() throws Exception {
    // A single deadline bounds every wait in this test.
    final Deadline testDeadline = TestTimeOut.fromNow();
    final String zooKeeperQuorum = ZooKeeper.getConnectString();
    final String fileStateBackendPath = FileStateBackendBasePath.getAbsoluteFile().toString();
    Configuration config = ZooKeeperTestUtils.createZooKeeperHAConfig(zooKeeperQuorum, fileStateBackendPath);
    config.setInteger(ConfigConstants.LOCAL_NUMBER_JOB_MANAGER, 2);
    // Everything that needs cleanup is declared outside the try block so that the finally
    // block can release whatever was created before a failure.
    JobManagerProcess[] jobManagerProcess = new JobManagerProcess[2];
    LeaderRetrievalService leaderRetrievalService = null;
    ActorSystem taskManagerSystem = null;
    ActorSystem testActorSystem = null;
    try {
        // Test actor system
        testActorSystem = AkkaUtils.createActorSystem(new Configuration(), new Some<>(new Tuple2<String, Object>("localhost", 0)));
        // The job managers
        jobManagerProcess[0] = new JobManagerProcess(0, config);
        jobManagerProcess[1] = new JobManagerProcess(1, config);
        jobManagerProcess[0].startProcess();
        jobManagerProcess[1].startProcess();
        // Leader listener
        TestingListener leaderListener = new TestingListener();
        leaderRetrievalService = ZooKeeperUtils.createLeaderRetrievalService(config);
        leaderRetrievalService.start(leaderListener);
        // The task manager
        taskManagerSystem = AkkaUtils.createActorSystem(config, Option.apply(new Tuple2<String, Object>("localhost", 0)));
        TaskManager.startTaskManagerComponentsAndActor(config, ResourceID.generate(), taskManagerSystem, "localhost", Option.<String>empty(), Option.<LeaderRetrievalService>empty(), false, TaskManager.class);
        // Get the leader
        leaderListener.waitForNewLeader(testDeadline.timeLeft().toMillis());
        String leaderAddress = leaderListener.getAddress();
        UUID leaderId = leaderListener.getLeaderSessionID();
        // Get the leader ref
        ActorRef leaderRef = AkkaUtils.getActorRef(leaderAddress, testActorSystem, testDeadline.timeLeft());
        ActorGateway leader = new AkkaActorGateway(leaderRef, leaderId);
        // Who's the boss?
        // The process whose Akka URL equals the address reported by the listener is treated
        // as the leader; the other one is the process expected to attempt the recovery.
        JobManagerProcess leadingJobManagerProcess;
        JobManagerProcess nonLeadingJobManagerProcess;
        if (jobManagerProcess[0].getJobManagerAkkaURL(testDeadline.timeLeft()).equals(leaderListener.getAddress())) {
            leadingJobManagerProcess = jobManagerProcess[0];
            nonLeadingJobManagerProcess = jobManagerProcess[1];
        } else {
            leadingJobManagerProcess = jobManagerProcess[1];
            nonLeadingJobManagerProcess = jobManagerProcess[0];
        }
        // Blocking JobGraph
        JobVertex blockingVertex = new JobVertex("Blocking vertex");
        blockingVertex.setInvokableClass(BlockingNoOpInvokable.class);
        JobGraph jobGraph = new JobGraph(blockingVertex);
        // Submit the job in detached mode
        leader.tell(new SubmitJob(jobGraph, ListeningBehaviour.DETACHED));
        // Wait for the job to be running
        JobManagerActorTestUtils.waitForJobStatus(jobGraph.getJobID(), JobStatus.RUNNING, leader, testDeadline.timeLeft());
        // Remove all files
        // (this deletes the directory the HA configuration above was created with, which is
        // what the FileNotFoundException checked for below is expected to stem from)
        FileUtils.deleteDirectory(FileStateBackendBasePath);
        // Kill the leader
        leadingJobManagerProcess.destroy();
        // Verify that the job manager logs the failed recovery. We can not
        // do more at this point. :(
        boolean success = false;
        // Poll the remaining process' output every 500 ms until both expected strings show up
        // or the deadline runs out.
        while (testDeadline.hasTimeLeft()) {
            String output = nonLeadingJobManagerProcess.getProcessOutput();
            if (output != null) {
                if (output.contains("Failed to recover job") && output.contains("java.io.FileNotFoundException")) {
                    success = true;
                    break;
                }
            } else {
                log.warn("No process output available.");
            }
            Thread.sleep(500);
        }
        assertTrue("Did not find expected output in logs.", success);
    } catch (Throwable t) {
        // Print early (in some situations the process logs get too big
        // for Travis and the root problem is not shown)
        t.printStackTrace();
        // In case of an error, print the job manager process logs.
        if (jobManagerProcess[0] != null) {
            jobManagerProcess[0].printProcessLog();
        }
        if (jobManagerProcess[1] != null) {
            jobManagerProcess[1].printProcessLog();
        }
        // Rethrow so that the test still fails after the diagnostics have been printed.
        throw t;
    } finally {
        // Release everything that was started; each reference is null-checked because the
        // failure may have happened before it was assigned.
        // NOTE(review): an exception thrown by one cleanup call skips the remaining ones.
        if (jobManagerProcess[0] != null) {
            jobManagerProcess[0].destroy();
        }
        if (jobManagerProcess[1] != null) {
            jobManagerProcess[1].destroy();
        }
        if (leaderRetrievalService != null) {
            leaderRetrievalService.stop();
        }
        if (taskManagerSystem != null) {
            taskManagerSystem.shutdown();
        }
        if (testActorSystem != null) {
            testActorSystem.shutdown();
        }
    }
}
Also used : ActorSystem(akka.actor.ActorSystem) AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) Configuration(org.apache.flink.configuration.Configuration) ActorRef(akka.actor.ActorRef) Deadline(scala.concurrent.duration.Deadline) TestingListener(org.apache.flink.runtime.leaderelection.TestingListener) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) Some(scala.Some) JobVertex(org.apache.flink.runtime.jobgraph.JobVertex) LeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) JobManagerProcess(org.apache.flink.runtime.testutils.JobManagerProcess) UUID(java.util.UUID) SubmitJob(org.apache.flink.runtime.messages.JobManagerMessages.SubmitJob) Test(org.junit.Test) RetryOnFailure(org.apache.flink.testutils.junit.RetryOnFailure)

Example 25 with LeaderRetrievalService

use of org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService in project flink by apache.

the class JobManagerHAJobGraphRecoveryITCase method testClientNonDetachedListeningBehaviour.

/**
	 * Tests that clients receive updates after recovery by a new leader.
	 *
	 * <p>Outline: a job is submitted in non-detached mode to the current leader on behalf of a
	 * recording test client, the leading JobManager process is destroyed, and once a new leader
	 * has been announced and reports the job as running, the job is cancelled. The test then
	 * counts the {@code JobSubmitSuccess} messages recorded by the client.
	 */
@Test
public void testClientNonDetachedListeningBehaviour() throws Exception {
    Configuration config = ZooKeeperTestUtils.createZooKeeperHAConfig(ZooKeeper.getConnectString(), FileStateBackendBasePath.getPath());
    // Test actor system
    ActorSystem testSystem = null;
    // JobManager setup. Start the job managers as separate processes in order to not run the
    // actors postStop, which cleans up all running jobs.
    JobManagerProcess[] jobManagerProcess = new JobManagerProcess[2];
    LeaderRetrievalService leaderRetrievalService = null;
    ActorSystem taskManagerSystem = null;
    try {
        // A single deadline bounds every wait in this test.
        final Deadline deadline = TestTimeOut.fromNow();
        // Test actor system
        testSystem = AkkaUtils.createActorSystem(new Configuration(), new Some<>(new Tuple2<String, Object>("localhost", 0)));
        // The job managers
        jobManagerProcess[0] = new JobManagerProcess(0, config);
        jobManagerProcess[1] = new JobManagerProcess(1, config);
        jobManagerProcess[0].startProcess();
        jobManagerProcess[1].startProcess();
        // Leader listener
        TestingListener leaderListener = new TestingListener();
        leaderRetrievalService = ZooKeeperUtils.createLeaderRetrievalService(config);
        leaderRetrievalService.start(leaderListener);
        // The task manager
        taskManagerSystem = AkkaUtils.createActorSystem(AkkaUtils.getDefaultAkkaConfig());
        TaskManager.startTaskManagerComponentsAndActor(config, ResourceID.generate(), taskManagerSystem, "localhost", Option.<String>empty(), Option.<LeaderRetrievalService>empty(), false, TaskManager.class);
        // Client test actor
        TestActorRef<RecordingTestClient> clientRef = TestActorRef.create(testSystem, Props.create(RecordingTestClient.class));
        JobGraph jobGraph = createBlockingJobGraph();
        {
            // Initial submission
            leaderListener.waitForNewLeader(deadline.timeLeft().toMillis());
            String leaderAddress = leaderListener.getAddress();
            UUID leaderId = leaderListener.getLeaderSessionID();
            // The client
            AkkaActorGateway client = new AkkaActorGateway(clientRef, leaderId);
            // Get the leader ref
            ActorRef leaderRef = AkkaUtils.getActorRef(leaderAddress, testSystem, deadline.timeLeft());
            ActorGateway leader = new AkkaActorGateway(leaderRef, leaderId);
            // Keep asking the leader for its total slot count until it reports a non-zero
            // value; each request is bounded by the remaining deadline.
            int numSlots = 0;
            while (numSlots == 0) {
                Future<?> slotsFuture = leader.ask(JobManagerMessages.getRequestTotalNumberOfSlots(), deadline.timeLeft());
                numSlots = (Integer) Await.result(slotsFuture, deadline.timeLeft());
            }
            // Submit the job in non-detached mode
            leader.tell(new SubmitJob(jobGraph, ListeningBehaviour.EXECUTION_RESULT_AND_STATE_CHANGES), client);
            JobManagerActorTestUtils.waitForJobStatus(jobGraph.getJobID(), JobStatus.RUNNING, leader, deadline.timeLeft());
        }
        // Who's the boss?
        // The process whose Akka URL equals the address reported by the listener is treated
        // as the leader.
        JobManagerProcess leadingJobManagerProcess;
        if (jobManagerProcess[0].getJobManagerAkkaURL(deadline.timeLeft()).equals(leaderListener.getAddress())) {
            leadingJobManagerProcess = jobManagerProcess[0];
        } else {
            leadingJobManagerProcess = jobManagerProcess[1];
        }
        // Kill the leading job manager process
        leadingJobManagerProcess.destroy();
        {
            // Recovery by the standby JobManager
            leaderListener.waitForNewLeader(deadline.timeLeft().toMillis());
            String leaderAddress = leaderListener.getAddress();
            UUID leaderId = leaderListener.getLeaderSessionID();
            ActorRef leaderRef = AkkaUtils.getActorRef(leaderAddress, testSystem, deadline.timeLeft());
            ActorGateway leader = new AkkaActorGateway(leaderRef, leaderId);
            // The job was never re-submitted by this test, so reaching RUNNING on the new
            // leader shows that the job was picked up again after the leader change.
            JobManagerActorTestUtils.waitForJobStatus(jobGraph.getJobID(), JobStatus.RUNNING, leader, deadline.timeLeft());
            // Cancel the job
            leader.tell(new JobManagerMessages.CancelJob(jobGraph.getJobID()));
        }
        // Wait for the execution result
        clientRef.underlyingActor().awaitJobResult(deadline.timeLeft().toMillis());
        // Count how many submission acknowledgements the client recorded.
        int jobSubmitSuccessMessages = 0;
        for (Object msg : clientRef.underlyingActor().getMessages()) {
            if (msg instanceof JobManagerMessages.JobSubmitSuccess) {
                jobSubmitSuccessMessages++;
            }
        }
        // At least two submissions should be ack-ed (initial and recovery). This is quite
        // conservative, but it is still possible that these messages are overtaken by the
        // final message.
        // NOTE(review): the comment above says "at least two", but the assertion below
        // requires exactly two - confirm which one is intended.
        assertEquals(2, jobSubmitSuccessMessages);
    } catch (Throwable t) {
        // Print early (in some situations the process logs get too big
        // for Travis and the root problem is not shown)
        t.printStackTrace();
        // In case of an error, print the job manager process logs.
        if (jobManagerProcess[0] != null) {
            jobManagerProcess[0].printProcessLog();
        }
        if (jobManagerProcess[1] != null) {
            jobManagerProcess[1].printProcessLog();
        }
        // Rethrow so that the test still fails after the diagnostics have been printed.
        throw t;
    } finally {
        // Release everything that was started; each reference is null-checked because the
        // failure may have happened before it was assigned.
        // NOTE(review): an exception thrown by one cleanup call skips the remaining ones.
        if (jobManagerProcess[0] != null) {
            jobManagerProcess[0].destroy();
        }
        if (jobManagerProcess[1] != null) {
            jobManagerProcess[1].destroy();
        }
        if (leaderRetrievalService != null) {
            leaderRetrievalService.stop();
        }
        if (taskManagerSystem != null) {
            taskManagerSystem.shutdown();
        }
        if (testSystem != null) {
            testSystem.shutdown();
        }
    }
}
Also used : ActorSystem(akka.actor.ActorSystem) AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) Configuration(org.apache.flink.configuration.Configuration) ActorRef(akka.actor.ActorRef) TestActorRef(akka.testkit.TestActorRef) TestingListener(org.apache.flink.runtime.leaderelection.TestingListener) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) UUID(java.util.UUID) SubmitJob(org.apache.flink.runtime.messages.JobManagerMessages.SubmitJob) Deadline(scala.concurrent.duration.Deadline) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) SubmittedJobGraph(org.apache.flink.runtime.jobmanager.SubmittedJobGraph) Some(scala.Some) LeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService) JobManagerProcess(org.apache.flink.runtime.testutils.JobManagerProcess) Future(scala.concurrent.Future) Test(org.junit.Test)

Aggregations

LeaderRetrievalService (org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService)27 Configuration (org.apache.flink.configuration.Configuration)18 Test (org.junit.Test)16 ActorSystem (akka.actor.ActorSystem)11 ActorRef (akka.actor.ActorRef)10 UUID (java.util.UUID)9 Deadline (scala.concurrent.duration.Deadline)8 ActorGateway (org.apache.flink.runtime.instance.ActorGateway)7 AkkaActorGateway (org.apache.flink.runtime.instance.AkkaActorGateway)6 FiniteDuration (scala.concurrent.duration.FiniteDuration)6 File (java.io.File)5 JobGraph (org.apache.flink.runtime.jobgraph.JobGraph)5 TestingListener (org.apache.flink.runtime.leaderelection.TestingListener)5 Props (akka.actor.Props)4 ResourceID (org.apache.flink.runtime.clusterframework.types.ResourceID)4 SubmitJob (org.apache.flink.runtime.messages.JobManagerMessages.SubmitJob)4 JobManagerProcess (org.apache.flink.runtime.testutils.JobManagerProcess)4 Some (scala.Some)4 JavaTestKit (akka.testkit.JavaTestKit)3 IOException (java.io.IOException)3