Search in sources :

Example 1 with TestingListener

use of org.apache.flink.runtime.leaderelection.TestingListener in project flink by apache.

the class JobManagerHACheckpointRecoveryITCase method testCheckpointedStreamingSumProgram.

/**
 * Simple checkpointed streaming sum.
 *
 * <p>The sources (Parallelism) count until sequenceEnd. The sink (1) sums up all counts and
 * returns it to the main thread via a static variable. We wait until some checkpoints are
 * completed and sanity check that the sources recover with an updated state to make sure that
 * this test actually tests something.
 *
 * <p>Sequence of events: two job manager processes and one task manager are started, the job is
 * submitted in detached mode to the current leader, the leading job manager process is destroyed
 * once the checkpoint latch has been released, and the job is then expected to reach
 * {@code RUNNING} again on the newly elected leader and to produce the expected sum.
 *
 * @throws Exception if any step fails; the job manager process logs are printed before the
 *     failure is rethrown
 */
@Test
@RetryOnFailure(times = 1)
public void testCheckpointedStreamingSumProgram() throws Exception {
    // Config
    final int checkpointingInterval = 200;
    final int sequenceEnd = 5000;
    // Compute in long arithmetic: the int product would overflow silently for larger
    // Parallelism / sequenceEnd values before being widened to long.
    final long expectedSum = (long) Parallelism * sequenceEnd * (sequenceEnd + 1) / 2;
    final StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironment();
    env.setParallelism(Parallelism);
    env.enableCheckpointing(checkpointingInterval);
    env.addSource(new CheckpointedSequenceSource(sequenceEnd)).addSink(new CountingSink()).setParallelism(1);
    JobGraph jobGraph = env.getStreamGraph().getJobGraph();
    Configuration config = ZooKeeperTestUtils.createZooKeeperHAConfig(ZooKeeper.getConnectString(), FileStateBackendBasePath.getAbsoluteFile().toURI().toString());
    config.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, Parallelism);
    ActorSystem testSystem = null;
    final JobManagerProcess[] jobManagerProcess = new JobManagerProcess[2];
    LeaderRetrievalService leaderRetrievalService = null;
    ActorSystem taskManagerSystem = null;
    try {
        final Deadline deadline = TestTimeOut.fromNow();
        // Test actor system
        testSystem = AkkaUtils.createActorSystem(new Configuration(), new Some<>(new Tuple2<String, Object>("localhost", 0)));
        // The job managers
        jobManagerProcess[0] = new JobManagerProcess(0, config);
        jobManagerProcess[1] = new JobManagerProcess(1, config);
        jobManagerProcess[0].startProcess();
        jobManagerProcess[1].startProcess();
        // Leader listener
        TestingListener leaderListener = new TestingListener();
        leaderRetrievalService = ZooKeeperUtils.createLeaderRetrievalService(config);
        leaderRetrievalService.start(leaderListener);
        // The task manager
        taskManagerSystem = AkkaUtils.createActorSystem(config, Option.apply(new Tuple2<String, Object>("localhost", 0)));
        TaskManager.startTaskManagerComponentsAndActor(config, ResourceID.generate(), taskManagerSystem, "localhost", Option.<String>empty(), Option.<LeaderRetrievalService>empty(), false, TaskManager.class);
        {
            // Initial submission
            leaderListener.waitForNewLeader(deadline.timeLeft().toMillis());
            String leaderAddress = leaderListener.getAddress();
            UUID leaderId = leaderListener.getLeaderSessionID();
            // Get the leader ref
            ActorRef leaderRef = AkkaUtils.getActorRef(leaderAddress, testSystem, deadline.timeLeft());
            ActorGateway leader = new AkkaActorGateway(leaderRef, leaderId);
            // Submit the job in detached mode
            leader.tell(new SubmitJob(jobGraph, ListeningBehaviour.DETACHED));
            JobManagerActorTestUtils.waitForJobStatus(jobGraph.getJobID(), JobStatus.RUNNING, leader, deadline.timeLeft());
        }
        // Who's the boss?
        JobManagerProcess leadingJobManagerProcess;
        if (jobManagerProcess[0].getJobManagerAkkaURL(deadline.timeLeft()).equals(leaderListener.getAddress())) {
            leadingJobManagerProcess = jobManagerProcess[0];
        } else {
            leadingJobManagerProcess = jobManagerProcess[1];
        }
        // Bound the wait by the test deadline so that a job which never checkpoints fails the
        // test instead of blocking it forever.
        if (!CompletedCheckpointsLatch.await(deadline.timeLeft().toMillis(), java.util.concurrent.TimeUnit.MILLISECONDS)) {
            throw new AssertionError("Checkpoints did not complete before the test deadline.");
        }
        // Kill the leading job manager process
        leadingJobManagerProcess.destroy();
        {
            // Recovery by the standby JobManager
            leaderListener.waitForNewLeader(deadline.timeLeft().toMillis());
            String leaderAddress = leaderListener.getAddress();
            UUID leaderId = leaderListener.getLeaderSessionID();
            ActorRef leaderRef = AkkaUtils.getActorRef(leaderAddress, testSystem, deadline.timeLeft());
            ActorGateway leader = new AkkaActorGateway(leaderRef, leaderId);
            JobManagerActorTestUtils.waitForJobStatus(jobGraph.getJobID(), JobStatus.RUNNING, leader, deadline.timeLeft());
        }
        // Wait to finish (again bounded by the test deadline)
        if (!FinalCountLatch.await(deadline.timeLeft().toMillis(), java.util.concurrent.TimeUnit.MILLISECONDS)) {
            throw new AssertionError("The job did not report its final count before the test deadline.");
        }
        assertEquals(expectedSum, (long) FinalCount.get());
        // Every source must have been restored from a non-zero state, otherwise the recovery
        // path was not actually exercised.
        for (int i = 0; i < Parallelism; i++) {
            assertNotEquals(0, RecoveredStates.get(i));
        }
    } catch (Throwable t) {
        // Reset all static state for test retries
        CompletedCheckpointsLatch = new CountDownLatch(2);
        RecoveredStates = new AtomicLongArray(Parallelism);
        FinalCountLatch = new CountDownLatch(1);
        FinalCount = new AtomicReference<>();
        LastElement = -1;
        // Print early (in some situations the process logs get too big
        // for Travis and the root problem is not shown)
        t.printStackTrace();
        // In case of an error, print the job manager process logs.
        if (jobManagerProcess[0] != null) {
            jobManagerProcess[0].printProcessLog();
        }
        if (jobManagerProcess[1] != null) {
            jobManagerProcess[1].printProcessLog();
        }
        throw t;
    } finally {
        // Tear down everything that was started; each reference may still be null if the
        // failure happened before the corresponding component was created.
        if (jobManagerProcess[0] != null) {
            jobManagerProcess[0].destroy();
        }
        if (jobManagerProcess[1] != null) {
            jobManagerProcess[1].destroy();
        }
        if (leaderRetrievalService != null) {
            leaderRetrievalService.stop();
        }
        if (taskManagerSystem != null) {
            taskManagerSystem.shutdown();
        }
        if (testSystem != null) {
            testSystem.shutdown();
        }
    }
}
Also used : ActorSystem(akka.actor.ActorSystem) AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) Configuration(org.apache.flink.configuration.Configuration) ActorRef(akka.actor.ActorRef) TestingListener(org.apache.flink.runtime.leaderelection.TestingListener) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) UUID(java.util.UUID) SubmitJob(org.apache.flink.runtime.messages.JobManagerMessages.SubmitJob) Deadline(scala.concurrent.duration.Deadline) AtomicReference(java.util.concurrent.atomic.AtomicReference) CountDownLatch(java.util.concurrent.CountDownLatch) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) Some(scala.Some) LeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService) JobManagerProcess(org.apache.flink.runtime.testutils.JobManagerProcess) AtomicLongArray(java.util.concurrent.atomic.AtomicLongArray) StreamExecutionEnvironment(org.apache.flink.streaming.api.environment.StreamExecutionEnvironment) Test(org.junit.Test) RetryOnFailure(org.apache.flink.testutils.junit.RetryOnFailure)

Example 2 with TestingListener

use of org.apache.flink.runtime.leaderelection.TestingListener in project flink by apache.

the class KubernetesMultipleComponentLeaderElectionDriverTest method testPublishLeaderInformation.

/**
 * Verifies that leader information published through the leader election driver for a
 * component is reported to a listener registered with a leader retrieval service for the same
 * component.
 */
@Test
public void testPublishLeaderInformation() throws Exception {
    // The scenario runs inside the instance initializer of an anonymous TestFixture.
    new TestFixture() {

        {
            runTest(() -> {
                // Grant leadership and wait until the election listener has seen the event.
                leaderCallbackGrantLeadership();
                leaderElectionListener.await(LeaderElectionEvent.IsLeaderEvent.class);

                final LeaderInformation expectedLeader = LeaderInformation.known(UUID.randomUUID(), "localhost");
                final String component = "componentId";

                // Set up a retrieval service for the component and attach a listener to it.
                final KubernetesMultipleComponentLeaderRetrievalDriverFactory driverFactory = new KubernetesMultipleComponentLeaderRetrievalDriverFactory(getFlinkKubeClient(), getConfigMapSharedWatcher(), testExecutorExtension.getExecutor(), LEADER_CONFIGMAP_NAME, component);
                final DefaultLeaderRetrievalService retrievalService = new DefaultLeaderRetrievalService(driverFactory);
                final TestingListener retrievalListener = new TestingListener();
                retrievalService.start(retrievalListener);

                // Publish and then signal the config map modification to the retrieval watch.
                leaderElectionDriver.publishLeaderInformation(component, expectedLeader);
                notifyLeaderRetrievalWatchOnModifiedConfigMap();

                retrievalListener.waitForNewLeader(10_000L);
                assertThat(retrievalListener.getLeader()).isEqualTo(expectedLeader);
            });
        }
    };
}
Also used : TestingListener(org.apache.flink.runtime.leaderelection.TestingListener) LeaderElectionEvent(org.apache.flink.runtime.leaderelection.LeaderElectionEvent) DefaultLeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.DefaultLeaderRetrievalService) LeaderInformation(org.apache.flink.runtime.leaderelection.LeaderInformation) Test(org.junit.jupiter.api.Test)

Example 3 with TestingListener

use of org.apache.flink.runtime.leaderelection.TestingListener in project flink by apache.

the class JobManagerHAProcessFailureRecoveryITCase method testDispatcherProcessFailure.

/**
 * Starts a dispatcher process and task managers, triggers the test program in a separate
 * thread, destroys the first dispatcher process once all tasks have signalled readiness via
 * marker files, starts a second dispatcher process and then expects the program to finish
 * without an error.
 *
 * @throws Exception if any step fails; the dispatcher process logs are printed before the
 *     failure is rethrown
 */
@Test
public void testDispatcherProcessFailure() throws Exception {
    final Time timeout = Time.seconds(30L);
    final File zookeeperStoragePath = temporaryFolder.newFolder();
    // Config
    final int numberOfJobManagers = 2;
    final int numberOfTaskManagers = 2;
    final int numberOfSlotsPerTaskManager = 2;
    assertEquals(PARALLELISM, numberOfTaskManagers * numberOfSlotsPerTaskManager);
    // Job managers
    final DispatcherProcess[] dispatcherProcesses = new DispatcherProcess[numberOfJobManagers];
    // Task managers
    TaskManagerRunner[] taskManagerRunners = new TaskManagerRunner[numberOfTaskManagers];
    HighAvailabilityServices highAvailabilityServices = null;
    LeaderRetrievalService leaderRetrievalService = null;
    // Coordination between the processes goes through a directory
    File coordinateTempDir = null;
    // Cluster config
    Configuration config = ZooKeeperTestUtils.createZooKeeperHAConfig(zooKeeper.getConnectString(), zookeeperStoragePath.getPath());
    // Task manager configuration
    config.set(TaskManagerOptions.MANAGED_MEMORY_SIZE, MemorySize.parse("4m"));
    config.set(TaskManagerOptions.NETWORK_MEMORY_MIN, MemorySize.parse("3200k"));
    config.set(TaskManagerOptions.NETWORK_MEMORY_MAX, MemorySize.parse("3200k"));
    config.set(NettyShuffleEnvironmentOptions.NETWORK_SORT_SHUFFLE_MIN_BUFFERS, 16);
    // Use the declared constant so the configured slots stay in sync with the parallelism
    // sanity check above.
    config.setInteger(TaskManagerOptions.NUM_TASK_SLOTS, numberOfSlotsPerTaskManager);
    config.set(TaskManagerOptions.TASK_HEAP_MEMORY, MemorySize.parse("128m"));
    config.set(TaskManagerOptions.CPU_CORES, 1.0);
    TaskExecutorResourceUtils.adjustForLocalExecution(config);
    final RpcService rpcService = RpcSystem.load().remoteServiceBuilder(config, "localhost", "0").createAndStart();
    try {
        final Deadline deadline = Deadline.fromNow(TEST_TIMEOUT);
        // Coordination directory
        coordinateTempDir = temporaryFolder.newFolder();
        // Start first process
        dispatcherProcesses[0] = new DispatcherProcess(0, config);
        dispatcherProcesses[0].startProcess();
        highAvailabilityServices = HighAvailabilityServicesUtils.createAvailableOrEmbeddedServices(config, TestingUtils.defaultExecutor(), NoOpFatalErrorHandler.INSTANCE);
        final PluginManager pluginManager = PluginUtils.createPluginManagerFromRootFolder(config);
        // Start the task manager process
        for (int i = 0; i < numberOfTaskManagers; i++) {
            taskManagerRunners[i] = new TaskManagerRunner(config, pluginManager, TaskManagerRunner::createTaskExecutorService);
            taskManagerRunners[i].start();
        }
        // Leader listener
        TestingListener leaderListener = new TestingListener();
        leaderRetrievalService = highAvailabilityServices.getDispatcherLeaderRetriever();
        leaderRetrievalService.start(leaderListener);
        // Initial submission
        leaderListener.waitForNewLeader(deadline.timeLeft().toMillis());
        String leaderAddress = leaderListener.getAddress();
        UUID leaderId = leaderListener.getLeaderSessionID();
        final CompletableFuture<DispatcherGateway> dispatcherGatewayFuture = rpcService.connect(leaderAddress, DispatcherId.fromUuid(leaderId), DispatcherGateway.class);
        final DispatcherGateway dispatcherGateway = dispatcherGatewayFuture.get();
        // Wait for all task managers to connect to the leading job manager
        waitForTaskManagers(numberOfTaskManagers, dispatcherGateway, deadline.timeLeft());
        final File coordinateDirClosure = coordinateTempDir;
        // Single-element array so the trigger thread can hand its failure back to this thread.
        final Throwable[] errorRef = new Throwable[1];
        // we trigger program execution in a separate thread
        Thread programTrigger = new Thread("Program Trigger") {

            @Override
            public void run() {
                try {
                    testJobManagerFailure(zooKeeper.getConnectString(), coordinateDirClosure, zookeeperStoragePath);
                } catch (Throwable t) {
                    t.printStackTrace();
                    errorRef[0] = t;
                }
            }
        };
        // start the test program
        programTrigger.start();
        // wait until all marker files are in place, indicating that all tasks have started
        AbstractTaskManagerProcessFailureRecoveryTest.waitForMarkerFiles(coordinateTempDir, READY_MARKER_FILE_PREFIX, PARALLELISM, deadline.timeLeft().toMillis());
        // Kill one of the job managers and trigger recovery
        dispatcherProcesses[0].destroy();
        dispatcherProcesses[1] = new DispatcherProcess(1, config);
        dispatcherProcesses[1].startProcess();
        // we create the marker file which signals the program functions tasks that they can
        // complete
        AbstractTaskManagerProcessFailureRecoveryTest.touchFile(new File(coordinateTempDir, PROCEED_MARKER_FILE));
        programTrigger.join(deadline.timeLeft().toMillis());
        // We wait for the finish marker file. We don't wait for the program trigger, because
        // we submit in detached mode.
        AbstractTaskManagerProcessFailureRecoveryTest.waitForMarkerFiles(coordinateTempDir, FINISH_MARKER_FILE_PREFIX, 1, deadline.timeLeft().toMillis());
        // check that the program really finished
        assertFalse("The program did not finish in time", programTrigger.isAlive());
        // check whether the program encountered an error
        if (errorRef[0] != null) {
            Throwable error = errorRef[0];
            // Keep the original throwable as the cause so its stack trace is part of the test
            // failure (it is printed by the catch block below).
            throw new AssertionError("The program encountered a " + error.getClass().getSimpleName() + " : " + error.getMessage(), error);
        }
    } catch (Throwable t) {
        // Print early (in some situations the process logs get too big
        // for Travis and the root problem is not shown)
        t.printStackTrace();
        for (DispatcherProcess p : dispatcherProcesses) {
            if (p != null) {
                p.printProcessLog();
            }
        }
        throw t;
    } finally {
        // Tear down everything that was started; entries may still be null if the failure
        // happened before the corresponding component was created.
        for (int i = 0; i < numberOfTaskManagers; i++) {
            if (taskManagerRunners[i] != null) {
                taskManagerRunners[i].close();
            }
        }
        if (leaderRetrievalService != null) {
            leaderRetrievalService.stop();
        }
        for (DispatcherProcess dispatcherProcess : dispatcherProcesses) {
            if (dispatcherProcess != null) {
                dispatcherProcess.destroy();
            }
        }
        if (highAvailabilityServices != null) {
            highAvailabilityServices.closeAndCleanupAllData();
        }
        RpcUtils.terminateRpcService(rpcService, timeout);
        // Delete coordination directory
        if (coordinateTempDir != null) {
            try {
                FileUtils.deleteDirectory(coordinateTempDir);
            } catch (Throwable ignored) {
                // Best-effort cleanup: a failure to delete the directory must not fail the test.
            }
        }
    }
}
Also used : Configuration(org.apache.flink.configuration.Configuration) Deadline(org.apache.flink.api.common.time.Deadline) Time(org.apache.flink.api.common.time.Time) DispatcherProcess(org.apache.flink.runtime.testutils.DispatcherProcess) DispatcherGateway(org.apache.flink.runtime.dispatcher.DispatcherGateway) PluginManager(org.apache.flink.core.plugin.PluginManager) TestingListener(org.apache.flink.runtime.leaderelection.TestingListener) TaskManagerRunner(org.apache.flink.runtime.taskexecutor.TaskManagerRunner) HighAvailabilityServices(org.apache.flink.runtime.highavailability.HighAvailabilityServices) LeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService) RpcService(org.apache.flink.runtime.rpc.RpcService) UUID(java.util.UUID) File(java.io.File) Test(org.junit.Test)

Example 4 with TestingListener

use of org.apache.flink.runtime.leaderelection.TestingListener in project flink by apache.

the class WebRuntimeMonitorITCase method testRedirectToLeader.

/**
 * Tests that the monitor associated with the following job manager redirects to the leader.
 *
 * <p>Two job managers with one web monitor each are started against a ZooKeeper test server.
 * The leading monitor is expected to serve {@code index.html} directly while the following
 * monitor answers with a temporary redirect pointing at the leader's port. After the leading
 * actor system has been shut down, the former follower is expected to serve the requests itself.
 *
 * @throws Exception if setup, any request or any assertion fails
 */
@Test
public void testRedirectToLeader() throws Exception {
    final Deadline deadline = TestTimeout.fromNow();
    ActorSystem[] jobManagerSystem = new ActorSystem[2];
    WebRuntimeMonitor[] webMonitor = new WebRuntimeMonitor[2];
    List<LeaderRetrievalService> leaderRetrievalServices = new ArrayList<>();
    try (TestingServer zooKeeper = new TestingServer()) {
        final Configuration config = ZooKeeperTestUtils.createZooKeeperHAConfig(zooKeeper.getConnectString(), temporaryFolder.getRoot().getPath());
        File logDir = temporaryFolder.newFolder();
        Path logFile = Files.createFile(new File(logDir, "jobmanager.log").toPath());
        Files.createFile(new File(logDir, "jobmanager.out").toPath());
        config.setInteger(ConfigConstants.JOB_MANAGER_WEB_PORT_KEY, 0);
        config.setString(ConfigConstants.JOB_MANAGER_WEB_LOG_PATH_KEY, logFile.toString());
        for (int i = 0; i < jobManagerSystem.length; i++) {
            jobManagerSystem[i] = AkkaUtils.createActorSystem(new Configuration(), new Some<>(new Tuple2<String, Object>("localhost", 0)));
        }
        for (int i = 0; i < webMonitor.length; i++) {
            LeaderRetrievalService lrs = ZooKeeperUtils.createLeaderRetrievalService(config);
            leaderRetrievalServices.add(lrs);
            webMonitor[i] = new WebRuntimeMonitor(config, lrs, jobManagerSystem[i]);
        }
        ActorRef[] jobManager = new ActorRef[2];
        String[] jobManagerAddress = new String[2];
        for (int i = 0; i < jobManager.length; i++) {
            Configuration jmConfig = config.clone();
            // Point each job manager at the port its web monitor actually bound to.
            jmConfig.setInteger(ConfigConstants.JOB_MANAGER_WEB_PORT_KEY, webMonitor[i].getServerPort());
            jobManager[i] = JobManager.startJobManagerActors(jmConfig, jobManagerSystem[i], TestingUtils.defaultExecutor(), TestingUtils.defaultExecutor(), JobManager.class, MemoryArchivist.class)._1();
            jobManagerAddress[i] = AkkaUtils.getAkkaURL(jobManagerSystem[i], jobManager[i]);
            webMonitor[i].start(jobManagerAddress[i]);
        }
        LeaderRetrievalService lrs = ZooKeeperUtils.createLeaderRetrievalService(config);
        leaderRetrievalServices.add(lrs);
        TestingListener leaderListener = new TestingListener();
        lrs.start(leaderListener);
        leaderListener.waitForNewLeader(deadline.timeLeft().toMillis());
        String leaderAddress = leaderListener.getAddress();
        int leaderIndex = leaderAddress.equals(jobManagerAddress[0]) ? 0 : 1;
        int followerIndex = (leaderIndex + 1) % 2;
        ActorSystem leadingSystem = jobManagerSystem[leaderIndex];
        ActorSystem followerSystem = jobManagerSystem[followerIndex];
        WebMonitor leadingWebMonitor = webMonitor[leaderIndex];
        WebMonitor followerWebMonitor = webMonitor[followerIndex];
        // For test stability reason we have to wait until we are sure that both leader
        // listeners have been notified.
        JobManagerRetriever leadingRetriever = Whitebox.getInternalState(leadingWebMonitor, "retriever");
        JobManagerRetriever followerRetriever = Whitebox.getInternalState(followerWebMonitor, "retriever");
        // Wait for the initial notifications
        waitForLeaderNotification(leadingSystem, jobManager[leaderIndex], leadingRetriever, deadline);
        waitForLeaderNotification(leadingSystem, jobManager[leaderIndex], followerRetriever, deadline);
        try (HttpTestClient leaderClient = new HttpTestClient("localhost", leadingWebMonitor.getServerPort());
            HttpTestClient followingClient = new HttpTestClient("localhost", followerWebMonitor.getServerPort())) {
            // Read the whole reference file; the scanner is closed right away so the file
            // handle is not leaked.
            final String expected;
            try (Scanner scanner = new Scanner(new File(MAIN_RESOURCES_PATH + "/index.html"))) {
                expected = scanner.useDelimiter("\\A").next();
            }
            // Request the file from the leading web server
            leaderClient.sendGetRequest("index.html", deadline.timeLeft());
            HttpTestClient.SimpleHttpResponse response = leaderClient.getNextResponse(deadline.timeLeft());
            assertEquals(HttpResponseStatus.OK, response.getStatus());
            assertEquals(MimeTypes.getMimeTypeForExtension("html"), response.getType());
            assertEquals(expected, response.getContent());
            // Request the file from the following web server
            followingClient.sendGetRequest("index.html", deadline.timeLeft());
            response = followingClient.getNextResponse(deadline.timeLeft());
            assertEquals(HttpResponseStatus.TEMPORARY_REDIRECT, response.getStatus());
            assertTrue(response.getLocation().contains(String.valueOf(leadingWebMonitor.getServerPort())));
            // Kill the leader
            leadingSystem.shutdown();
            // Wait for the notification of the follower
            waitForLeaderNotification(followerSystem, jobManager[followerIndex], followerRetriever, deadline);
            // Same request to the new leader
            followingClient.sendGetRequest("index.html", deadline.timeLeft());
            response = followingClient.getNextResponse(deadline.timeLeft());
            assertEquals(HttpResponseStatus.OK, response.getStatus());
            assertEquals(MimeTypes.getMimeTypeForExtension("html"), response.getType());
            assertEquals(expected, response.getContent());
            // Simple overview request
            followingClient.sendGetRequest("/overview", deadline.timeLeft());
            response = followingClient.getNextResponse(deadline.timeLeft());
            assertEquals(HttpResponseStatus.OK, response.getStatus());
            assertEquals(MimeTypes.getMimeTypeForExtension("json"), response.getType());
            assertTrue(response.getContent().contains("\"taskmanagers\":1") || response.getContent().contains("\"taskmanagers\":0"));
        }
    } finally {
        for (ActorSystem system : jobManagerSystem) {
            if (system != null) {
                system.shutdown();
            }
        }
        for (WebMonitor monitor : webMonitor) {
            // Slots stay null if setup failed before the monitor was created; skipping them
            // avoids a NullPointerException that would mask the original failure.
            if (monitor != null) {
                monitor.stop();
            }
        }
        for (LeaderRetrievalService lrs : leaderRetrievalServices) {
            lrs.stop();
        }
    }
}
Also used : ActorSystem(akka.actor.ActorSystem) Scanner(java.util.Scanner) Configuration(org.apache.flink.configuration.Configuration) ActorRef(akka.actor.ActorRef) ArrayList(java.util.ArrayList) TestingListener(org.apache.flink.runtime.leaderelection.TestingListener) HttpTestClient(org.apache.flink.runtime.webmonitor.testutils.HttpTestClient) TestingServer(org.apache.curator.test.TestingServer) Path(java.nio.file.Path) Deadline(scala.concurrent.duration.Deadline) Some(scala.Some) LeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService) File(java.io.File) Test(org.junit.Test)

Example 5 with TestingListener

use of org.apache.flink.runtime.leaderelection.TestingListener in project flink by apache.

the class JobManagerHACheckpointRecoveryITCase method testCheckpointRecoveryFailure.

/**
 * Tests that the JobManager logs failures during recovery properly.
 *
 * <p>A blocking job is submitted to the leading JobManager, the file state backend directory is
 * deleted and the leader is killed. The output of the remaining JobManager process is then
 * polled until it reports the failed recovery or the test deadline has expired.
 *
 * @see <a href="https://issues.apache.org/jira/browse/FLINK-3185">FLINK-3185</a>
 */
@Test
@RetryOnFailure(times = 1)
public void testCheckpointRecoveryFailure() throws Exception {
    final Deadline timeBudget = TestTimeOut.fromNow();
    final String quorum = ZooKeeper.getConnectString();
    final String stateBackendPath = FileStateBackendBasePath.getAbsoluteFile().toString();
    Configuration haConfig = ZooKeeperTestUtils.createZooKeeperHAConfig(quorum, stateBackendPath);
    haConfig.setInteger(ConfigConstants.LOCAL_NUMBER_JOB_MANAGER, 2);
    JobManagerProcess[] jobManagers = new JobManagerProcess[2];
    LeaderRetrievalService retrievalService = null;
    ActorSystem taskManagerActors = null;
    ActorSystem testActors = null;
    try {
        // Actor system used by the test itself
        testActors = AkkaUtils.createActorSystem(new Configuration(), new Some<>(new Tuple2<String, Object>("localhost", 0)));
        // Create both job manager processes first, then start them
        jobManagers[0] = new JobManagerProcess(0, haConfig);
        jobManagers[1] = new JobManagerProcess(1, haConfig);
        jobManagers[0].startProcess();
        jobManagers[1].startProcess();
        // Listen for leader changes
        TestingListener listener = new TestingListener();
        retrievalService = ZooKeeperUtils.createLeaderRetrievalService(haConfig);
        retrievalService.start(listener);
        // Single task manager
        taskManagerActors = AkkaUtils.createActorSystem(haConfig, Option.apply(new Tuple2<String, Object>("localhost", 0)));
        TaskManager.startTaskManagerComponentsAndActor(haConfig, ResourceID.generate(), taskManagerActors, "localhost", Option.<String>empty(), Option.<LeaderRetrievalService>empty(), false, TaskManager.class);
        // Resolve the current leader and build a gateway to it
        listener.waitForNewLeader(timeBudget.timeLeft().toMillis());
        String currentLeaderAddress = listener.getAddress();
        UUID currentLeaderId = listener.getLeaderSessionID();
        ActorRef currentLeaderRef = AkkaUtils.getActorRef(currentLeaderAddress, testActors, timeBudget.timeLeft());
        ActorGateway leaderGateway = new AkkaActorGateway(currentLeaderRef, currentLeaderId);
        // Map the leader address back to the process that owns it
        final boolean firstIsLeader = jobManagers[0].getJobManagerAkkaURL(timeBudget.timeLeft()).equals(listener.getAddress());
        final JobManagerProcess leaderProcess = jobManagers[firstIsLeader ? 0 : 1];
        final JobManagerProcess standbyProcess = jobManagers[firstIsLeader ? 1 : 0];
        // Job graph with a single blocking vertex
        JobVertex blocker = new JobVertex("Blocking vertex");
        blocker.setInvokableClass(BlockingNoOpInvokable.class);
        JobGraph blockingJob = new JobGraph(blocker);
        // Detached submission, then wait until the job is running
        leaderGateway.tell(new SubmitJob(blockingJob, ListeningBehaviour.DETACHED));
        JobManagerActorTestUtils.waitForJobStatus(blockingJob.getJobID(), JobStatus.RUNNING, leaderGateway, timeBudget.timeLeft());
        // Wipe the state backend directory before taking the leader down
        FileUtils.deleteDirectory(FileStateBackendBasePath);
        leaderProcess.destroy();
        // All we can verify from here is that the other job manager logs the failed
        // recovery, so poll its output until the deadline runs out.
        boolean recoveryFailureLogged = false;
        while (timeBudget.hasTimeLeft()) {
            final String processOutput = standbyProcess.getProcessOutput();
            if (processOutput == null) {
                log.warn("No process output available.");
            } else if (processOutput.contains("Failed to recover job") && processOutput.contains("java.io.FileNotFoundException")) {
                recoveryFailureLogged = true;
                break;
            }
            Thread.sleep(500);
        }
        assertTrue("Did not find expected output in logs.", recoveryFailureLogged);
    } catch (Throwable t) {
        // Print the failure before the process logs: the logs can get too big for Travis,
        // in which case the root problem would not be shown.
        t.printStackTrace();
        // Dump the logs of every job manager process that was created.
        for (JobManagerProcess process : jobManagers) {
            if (process != null) {
                process.printProcessLog();
            }
        }
        throw t;
    } finally {
        // Tear down whatever was created; references may still be null after an early failure.
        for (JobManagerProcess process : jobManagers) {
            if (process != null) {
                process.destroy();
            }
        }
        if (retrievalService != null) {
            retrievalService.stop();
        }
        if (taskManagerActors != null) {
            taskManagerActors.shutdown();
        }
        if (testActors != null) {
            testActors.shutdown();
        }
    }
}
Also used : ActorSystem(akka.actor.ActorSystem) AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) Configuration(org.apache.flink.configuration.Configuration) ActorRef(akka.actor.ActorRef) Deadline(scala.concurrent.duration.Deadline) TestingListener(org.apache.flink.runtime.leaderelection.TestingListener) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) Some(scala.Some) JobVertex(org.apache.flink.runtime.jobgraph.JobVertex) LeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) JobManagerProcess(org.apache.flink.runtime.testutils.JobManagerProcess) UUID(java.util.UUID) SubmitJob(org.apache.flink.runtime.messages.JobManagerMessages.SubmitJob) Test(org.junit.Test) RetryOnFailure(org.apache.flink.testutils.junit.RetryOnFailure)

Aggregations

TestingListener (org.apache.flink.runtime.leaderelection.TestingListener)10 Test (org.junit.Test)9 Configuration (org.apache.flink.configuration.Configuration)7 LeaderRetrievalService (org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService)7 ActorSystem (akka.actor.ActorSystem)6 Deadline (scala.concurrent.duration.Deadline)6 ActorRef (akka.actor.ActorRef)5 UUID (java.util.UUID)5 JobManagerProcess (org.apache.flink.runtime.testutils.JobManagerProcess)5 ActorGateway (org.apache.flink.runtime.instance.ActorGateway)4 AkkaActorGateway (org.apache.flink.runtime.instance.AkkaActorGateway)4 JobGraph (org.apache.flink.runtime.jobgraph.JobGraph)4 Some (scala.Some)4 File (java.io.File)3 SubmitJob (org.apache.flink.runtime.messages.JobManagerMessages.SubmitJob)3 ArrayList (java.util.ArrayList)2 RetryOnFailure (org.apache.flink.testutils.junit.RetryOnFailure)2 TestActorRef (akka.testkit.TestActorRef)1 Path (java.nio.file.Path)1 Scanner (java.util.Scanner)1