Search in sources :

Example 26 with LeaderRetrievalService

use of org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService in project flink by apache.

the class JobManagerHAProcessFailureBatchRecoveryITCase method testJobManagerProcessFailure.

/**
 * Runs a program against a ZooKeeper HA setup and destroys the initially started JobManager
 * process while the program's tasks are running.
 *
 * <p>Sequence: start the first JobManager process and the TaskManager actor systems, wait for
 * a leader to be announced and for all TaskManagers to register, trigger the program in a
 * separate thread, wait for the "ready" marker files, destroy the first JobManager and start
 * a second one, create the "proceed" marker file and finally wait for the "finish" marker
 * file. The test fails if the triggering thread is still alive afterwards or if it recorded
 * an error.
 *
 * @throws Exception on any failure; the JobManager process logs are printed before the
 *         failure is rethrown
 */
@Test
public void testJobManagerProcessFailure() throws Exception {
    // Config
    final int numberOfJobManagers = 2;
    final int numberOfTaskManagers = 2;
    final int numberOfSlotsPerTaskManager = 2;
    assertEquals(PARALLELISM, numberOfTaskManagers * numberOfSlotsPerTaskManager);
    // Setup
    // Test actor system (initialised to null so that the finally block can shut it down)
    ActorSystem testActorSystem = null;
    // Job managers
    final JobManagerProcess[] jmProcess = new JobManagerProcess[numberOfJobManagers];
    // Task managers
    final ActorSystem[] tmActorSystem = new ActorSystem[numberOfTaskManagers];
    // Leader election service
    LeaderRetrievalService leaderRetrievalService = null;
    // Coordination between the processes goes through a directory
    File coordinateTempDir = null;
    try {
        final Deadline deadline = TestTimeOut.fromNow();
        // Coordination directory
        coordinateTempDir = createTempDirectory();
        // Job Managers
        Configuration config = ZooKeeperTestUtils.createZooKeeperHAConfig(ZooKeeper.getConnectString(), FileStateBackendBasePath.getPath());
        // Start first process
        jmProcess[0] = new JobManagerProcess(0, config);
        jmProcess[0].startProcess();
        // Task manager configuration
        config.setInteger(ConfigConstants.TASK_MANAGER_MEMORY_SIZE_KEY, 4);
        config.setInteger(ConfigConstants.TASK_MANAGER_NETWORK_NUM_BUFFERS_KEY, 100);
        // Use the same constant as the parallelism assertion above so that both stay in sync
        config.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, numberOfSlotsPerTaskManager);
        // Start the task managers, each in its own actor system
        for (int i = 0; i < numberOfTaskManagers; i++) {
            tmActorSystem[i] = AkkaUtils.createActorSystem(AkkaUtils.getDefaultAkkaConfig());
            TaskManager.startTaskManagerComponentsAndActor(config, ResourceID.generate(), tmActorSystem[i], "localhost", Option.<String>empty(), Option.<LeaderRetrievalService>empty(), false, TaskManager.class);
        }
        // Test actor system
        testActorSystem = AkkaUtils.createActorSystem(AkkaUtils.getDefaultAkkaConfig());
        jmProcess[0].getActorRef(testActorSystem, deadline.timeLeft());
        // Leader listener
        TestingListener leaderListener = new TestingListener();
        leaderRetrievalService = ZooKeeperUtils.createLeaderRetrievalService(config);
        leaderRetrievalService.start(leaderListener);
        // Initial submission
        leaderListener.waitForNewLeader(deadline.timeLeft().toMillis());
        String leaderAddress = leaderListener.getAddress();
        UUID leaderId = leaderListener.getLeaderSessionID();
        // Get the leader ref
        ActorRef leaderRef = AkkaUtils.getActorRef(leaderAddress, testActorSystem, deadline.timeLeft());
        ActorGateway leaderGateway = new AkkaActorGateway(leaderRef, leaderId);
        // Wait for all task managers to connect to the leading job manager
        JobManagerActorTestUtils.waitForTaskManagers(numberOfTaskManagers, leaderGateway, deadline.timeLeft());
        final File coordinateDirClosure = coordinateTempDir;
        // Single-element array so that the anonymous thread below can hand an error back
        final Throwable[] errorRef = new Throwable[1];
        // we trigger program execution in a separate thread
        Thread programTrigger = new Thread("Program Trigger") {

            @Override
            public void run() {
                try {
                    testJobManagerFailure(ZooKeeper.getConnectString(), coordinateDirClosure);
                } catch (Throwable t) {
                    t.printStackTrace();
                    errorRef[0] = t;
                }
            }
        };
        //start the test program
        programTrigger.start();
        // wait until all marker files are in place, indicating that all tasks have started
        AbstractTaskManagerProcessFailureRecoveryTest.waitForMarkerFiles(coordinateTempDir, READY_MARKER_FILE_PREFIX, PARALLELISM, deadline.timeLeft().toMillis());
        // Kill one of the job managers and trigger recovery
        jmProcess[0].destroy();
        jmProcess[1] = new JobManagerProcess(1, config);
        jmProcess[1].startProcess();
        jmProcess[1].getActorRef(testActorSystem, deadline.timeLeft());
        // we create the marker file which signals the program functions tasks that they can complete
        AbstractTaskManagerProcessFailureRecoveryTest.touchFile(new File(coordinateTempDir, PROCEED_MARKER_FILE));
        // Give the triggering thread until the deadline to return ...
        programTrigger.join(deadline.timeLeft().toMillis());
        // ... and additionally wait for the finish marker file
        AbstractTaskManagerProcessFailureRecoveryTest.waitForMarkerFiles(coordinateTempDir, FINISH_MARKER_FILE_PREFIX, 1, deadline.timeLeft().toMillis());
        // check that the program really finished
        assertFalse("The program did not finish in time", programTrigger.isAlive());
        // check whether the program encountered an error
        if (errorRef[0] != null) {
            Throwable error = errorRef[0];
            error.printStackTrace();
            fail("The program encountered a " + error.getClass().getSimpleName() + " : " + error.getMessage());
        }
    } catch (Throwable t) {
        // Print early (in some situations the process logs get too big
        // for Travis and the root problem is not shown)
        t.printStackTrace();
        for (JobManagerProcess p : jmProcess) {
            if (p != null) {
                p.printProcessLog();
            }
        }
        throw t;
    } finally {
        for (int i = 0; i < numberOfTaskManagers; i++) {
            if (tmActorSystem[i] != null) {
                tmActorSystem[i].shutdown();
            }
        }
        // Previously leaked: the test's own actor system was never shut down
        if (testActorSystem != null) {
            testActorSystem.shutdown();
        }
        if (leaderRetrievalService != null) {
            leaderRetrievalService.stop();
        }
        for (JobManagerProcess jmProces : jmProcess) {
            if (jmProces != null) {
                jmProces.destroy();
            }
        }
        // Delete coordination directory
        if (coordinateTempDir != null) {
            try {
                FileUtils.deleteDirectory(coordinateTempDir);
            } catch (Throwable ignored) {
                // best-effort cleanup; a failure to delete must not mask the test result
            }
        }
    }
}
Also used : ActorSystem(akka.actor.ActorSystem) AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) Configuration(org.apache.flink.configuration.Configuration) ActorRef(akka.actor.ActorRef) Deadline(scala.concurrent.duration.Deadline) TestingListener(org.apache.flink.runtime.leaderelection.TestingListener) LeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService) AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) JobManagerProcess(org.apache.flink.runtime.testutils.JobManagerProcess) UUID(java.util.UUID) File(java.io.File) Test(org.junit.Test)

Example 27 with LeaderRetrievalService

use of org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService in project flink by apache.

the class ChaosMonkeyITCase method testChaosMonkey.

/**
 * Submits a checkpointed job to a ZooKeeper HA setup of separate JobManager and TaskManager
 * processes and then repeatedly kills either the leading JobManager or a random TaskManager
 * while the job is running.
 *
 * <p>Every killed process is replaced by a freshly started one. After the configured number
 * of kills the job is signalled to proceed, and the test waits for the job to finish, to be
 * removed, and finally checks that the recovery state is clean. If the job is observed as
 * {@code FINISHED} during the kill phase, the test returns early.
 *
 * @throws Exception on any failure; the logs of all tracked processes are printed before the
 *         failure is rethrown
 */
@Test
public void testChaosMonkey() throws Exception {
    // Test config
    final int numberOfJobManagers = 3;
    final int numberOfTaskManagers = 3;
    final int numberOfSlotsPerTaskManager = 2;
    // The final count each source is counting to: 1...n
    final int n = 5000;
    // Parallelism for the program
    final int parallelism = numberOfTaskManagers * numberOfSlotsPerTaskManager;
    // The test should not run longer than this
    final FiniteDuration testDuration = new FiniteDuration(10, TimeUnit.MINUTES);
    // Every x seconds a random job or task manager is killed
    //
    // The job will be running for $killEvery seconds and then a random Job/TaskManager
    // will be killed. On recovery (which takes some time to bring up the new process etc.),
    // this test will wait for task managers to reconnect before starting the next count down.
    // Therefore the delay between retries is not important in this setup.
    final FiniteDuration killEvery = new FiniteDuration(5, TimeUnit.SECONDS);
    // Trigger a checkpoint every
    final int checkpointingIntervalMs = 1000;
    // Total number of kills
    final int totalNumberOfKills = 10;
    // -----------------------------------------------------------------------------------------
    // Setup
    Configuration config = ZooKeeperTestUtils.createZooKeeperHAConfig(ZooKeeper.getConnectString(), FileStateBackendBasePath.toURI().toString());
    // Akka and restart timeouts
    config.setString(ConfigConstants.AKKA_WATCH_HEARTBEAT_INTERVAL, "1000 ms");
    config.setString(ConfigConstants.AKKA_WATCH_HEARTBEAT_PAUSE, "6 s");
    config.setInteger(ConfigConstants.AKKA_WATCH_THRESHOLD, 9);
    // Sanity check of the constants above: a checkpoint must fit between two kills
    if (checkpointingIntervalMs >= killEvery.toMillis()) {
        throw new IllegalArgumentException("Relax! You want to kill processes every " + killEvery + ", but the checkpointing interval is " + checkpointingIntervalMs / 1000 + " seconds. Either decrease the interval or " + "increase the kill interval. Otherwise, the program will not complete any " + "checkpoint.");
    }
    // Task manager
    config.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, numberOfSlotsPerTaskManager);
    ActorSystem testActorSystem = null;
    LeaderRetrievalService leaderRetrievalService = null;
    // Every live process is tracked in these lists so that the catch/finally blocks can
    // print its log and destroy it
    List<JobManagerProcess> jobManagerProcesses = new ArrayList<>();
    List<TaskManagerProcess> taskManagerProcesses = new ArrayList<>();
    try {
        // Initial state
        for (int i = 0; i < numberOfJobManagers; i++) {
            jobManagerProcesses.add(createAndStartJobManagerProcess(config));
        }
        for (int i = 0; i < numberOfTaskManagers; i++) {
            taskManagerProcesses.add(createAndStartTaskManagerProcess(config));
        }
        testActorSystem = AkkaUtils.createDefaultActorSystem();
        // Leader listener
        leaderRetrievalService = ZooKeeperUtils.createLeaderRetrievalService(config);
        TestingListener leaderListener = new TestingListener();
        leaderRetrievalService.start(leaderListener);
        Deadline deadline = testDuration.fromNow();
        // Wait for the new leader
        int leaderIndex = waitForNewLeader(leaderListener, jobManagerProcesses, deadline.timeLeft());
        // Wait for the task managers to connect
        waitForTaskManagers(numberOfTaskManagers, jobManagerProcesses.get(leaderIndex), testActorSystem, deadline.timeLeft());
        // The job
        JobGraph jobGraph = createJobGraph(n, CheckpointCompletedCoordination.getPath(), ProceedCoordination.getPath(), parallelism, checkpointingIntervalMs);
        LOG.info("Submitting job {}", jobGraph.getJobID());
        submitJobGraph(jobGraph, jobManagerProcesses.get(leaderIndex), leaderListener, testActorSystem, deadline.timeLeft());
        LOG.info("Waiting for a checkpoint to complete before kicking off chaos");
        // Wait for a checkpoint to complete
        TestJvmProcess.waitForMarkerFiles(FileStateBackendBasePath, COMPLETED_PREFIX, parallelism, deadline.timeLeft().toMillis());
        LOG.info("Checkpoint completed... ready for chaos");
        int currentJobManagerKills = 0;
        int currentTaskManagerKills = 0;
        for (int i = 0; i < totalNumberOfKills; i++) {
            LOG.info("Waiting for {} before next kill ({}/{})", killEvery, i + 1, totalNumberOfKills);
            Thread.sleep(killEvery.toMillis());
            LOG.info("Checking job status...");
            JobStatus jobStatus = requestJobStatus(jobGraph.getJobID(), jobManagerProcesses.get(leaderIndex), testActorSystem, deadline.timeLeft());
            if (jobStatus != JobStatus.RUNNING && jobStatus != JobStatus.FINISHED) {
                // Wait for it to run
                LOG.info("Waiting for job status {}", JobStatus.RUNNING);
                waitForJobRunning(jobGraph.getJobID(), jobManagerProcesses.get(leaderIndex), testActorSystem, deadline.timeLeft());
            } else if (jobStatus == JobStatus.FINISHED) {
                // Early finish
                LOG.info("Job finished");
                return;
            } else {
                LOG.info("Job status is {}", jobStatus);
            }
            if (rand.nextBoolean()) {
                LOG.info("Killing the leading JobManager");
                JobManagerProcess newJobManager = createAndStartJobManagerProcess(config);
                // Track the replacement right away so that it is cleaned up even if a later
                // step fails. It is appended at the end, so leaderIndex still refers to the
                // current leader, and it is in the list before we look for a new leader.
                jobManagerProcesses.add(newJobManager);
                JobManagerProcess leader = jobManagerProcesses.remove(leaderIndex);
                leader.destroy();
                currentJobManagerKills++;
                LOG.info("Killed {}", leader);
                // Wait for the new leader
                leaderIndex = waitForNewLeader(leaderListener, jobManagerProcesses, deadline.timeLeft());
                // Wait for the task managers to connect
                waitForTaskManagers(numberOfTaskManagers, jobManagerProcesses.get(leaderIndex), testActorSystem, deadline.timeLeft());
            } else {
                LOG.info("Killing a random TaskManager");
                TaskManagerProcess newTaskManager = createAndStartTaskManagerProcess(config);
                // Track the replacement right away so that it is cleaned up even if a later
                // step fails. It sits at index numberOfTaskManagers, outside the range the
                // victim is drawn from below, so only a previously running process is killed.
                taskManagerProcesses.add(newTaskManager);
                // Wait for this new task manager to be connected
                waitForTaskManagers(numberOfTaskManagers + 1, jobManagerProcesses.get(leaderIndex), testActorSystem, deadline.timeLeft());
                // Now it's safe to kill a process
                int next = rand.nextInt(numberOfTaskManagers);
                TaskManagerProcess taskManager = taskManagerProcesses.remove(next);
                LOG.info("{} has been chosen. Killing process...", taskManager);
                taskManager.destroy();
                currentTaskManagerKills++;
            }
        }
        LOG.info("Chaos is over. Total kills: {} ({} job manager + {} task managers). " + "Checking job status...", totalNumberOfKills, currentJobManagerKills, currentTaskManagerKills);
        // Signal the job to speed up (if it is not done yet)
        TestJvmProcess.touchFile(ProceedCoordination);
        // Wait for the job to finish
        LOG.info("Waiting for job status {}", JobStatus.FINISHED);
        waitForJobFinished(jobGraph.getJobID(), jobManagerProcesses.get(leaderIndex), testActorSystem, deadline.timeLeft());
        LOG.info("Job finished");
        LOG.info("Waiting for job removal");
        waitForJobRemoved(jobGraph.getJobID(), jobManagerProcesses.get(leaderIndex), testActorSystem, deadline.timeLeft());
        LOG.info("Job removed");
        LOG.info("Checking clean recovery state...");
        checkCleanRecoveryState(config);
        LOG.info("Recovery state clean");
    } catch (Throwable t) {
        // Print early (in some situations the process logs get too big
        // for Travis and the root problem is not shown)
        t.printStackTrace();
        System.out.println("#################################################");
        System.out.println(" TASK MANAGERS");
        System.out.println("#################################################");
        for (TaskManagerProcess taskManagerProcess : taskManagerProcesses) {
            taskManagerProcess.printProcessLog();
        }
        System.out.println("#################################################");
        System.out.println(" JOB MANAGERS");
        System.out.println("#################################################");
        for (JobManagerProcess jobManagerProcess : jobManagerProcesses) {
            jobManagerProcess.printProcessLog();
        }
        throw t;
    } finally {
        for (JobManagerProcess jobManagerProcess : jobManagerProcesses) {
            if (jobManagerProcess != null) {
                jobManagerProcess.destroy();
            }
        }
        // Previously leaked: the remaining task manager processes were never destroyed
        for (TaskManagerProcess taskManagerProcess : taskManagerProcesses) {
            if (taskManagerProcess != null) {
                taskManagerProcess.destroy();
            }
        }
        if (leaderRetrievalService != null) {
            leaderRetrievalService.stop();
        }
        if (testActorSystem != null) {
            testActorSystem.shutdown();
        }
    }
}
Also used : ActorSystem(akka.actor.ActorSystem) TaskManagerProcess(org.apache.flink.runtime.testutils.TaskManagerProcess) Configuration(org.apache.flink.configuration.Configuration) Deadline(scala.concurrent.duration.Deadline) ArrayList(java.util.ArrayList) FiniteDuration(scala.concurrent.duration.FiniteDuration) JobStatus(org.apache.flink.runtime.jobgraph.JobStatus) TestingListener(org.apache.flink.runtime.leaderelection.TestingListener) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) LeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService) JobManagerProcess(org.apache.flink.runtime.testutils.JobManagerProcess) Test(org.junit.Test)

Aggregations

LeaderRetrievalService (org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService)27 Configuration (org.apache.flink.configuration.Configuration)18 Test (org.junit.Test)16 ActorSystem (akka.actor.ActorSystem)11 ActorRef (akka.actor.ActorRef)10 UUID (java.util.UUID)9 Deadline (scala.concurrent.duration.Deadline)8 ActorGateway (org.apache.flink.runtime.instance.ActorGateway)7 AkkaActorGateway (org.apache.flink.runtime.instance.AkkaActorGateway)6 FiniteDuration (scala.concurrent.duration.FiniteDuration)6 File (java.io.File)5 JobGraph (org.apache.flink.runtime.jobgraph.JobGraph)5 TestingListener (org.apache.flink.runtime.leaderelection.TestingListener)5 Props (akka.actor.Props)4 ResourceID (org.apache.flink.runtime.clusterframework.types.ResourceID)4 SubmitJob (org.apache.flink.runtime.messages.JobManagerMessages.SubmitJob)4 JobManagerProcess (org.apache.flink.runtime.testutils.JobManagerProcess)4 Some (scala.Some)4 JavaTestKit (akka.testkit.JavaTestKit)3 IOException (java.io.IOException)3