Search in sources :

Example 1 with DispatcherProcess

use of org.apache.flink.runtime.testutils.DispatcherProcess in project flink by apache.

the class JobManagerHAProcessFailureRecoveryITCase method testDispatcherProcessFailure.

@Test
public void testDispatcherProcessFailure() throws Exception {
    final Time timeout = Time.seconds(30L);
    final File zookeeperStoragePath = temporaryFolder.newFolder();
    // Config
    final int numberOfJobManagers = 2;
    final int numberOfTaskManagers = 2;
    final int numberOfSlotsPerTaskManager = 2;
    assertEquals(PARALLELISM, numberOfTaskManagers * numberOfSlotsPerTaskManager);
    // Job managers
    final DispatcherProcess[] dispatcherProcesses = new DispatcherProcess[numberOfJobManagers];
    // Task managers
    TaskManagerRunner[] taskManagerRunners = new TaskManagerRunner[numberOfTaskManagers];
    HighAvailabilityServices highAvailabilityServices = null;
    LeaderRetrievalService leaderRetrievalService = null;
    // Coordination between the processes goes through a directory
    File coordinateTempDir = null;
    // Cluster config
    Configuration config = ZooKeeperTestUtils.createZooKeeperHAConfig(zooKeeper.getConnectString(), zookeeperStoragePath.getPath());
    // Task manager configuration
    config.set(TaskManagerOptions.MANAGED_MEMORY_SIZE, MemorySize.parse("4m"));
    config.set(TaskManagerOptions.NETWORK_MEMORY_MIN, MemorySize.parse("3200k"));
    config.set(TaskManagerOptions.NETWORK_MEMORY_MAX, MemorySize.parse("3200k"));
    config.set(NettyShuffleEnvironmentOptions.NETWORK_SORT_SHUFFLE_MIN_BUFFERS, 16);
    config.setInteger(TaskManagerOptions.NUM_TASK_SLOTS, 2);
    config.set(TaskManagerOptions.TASK_HEAP_MEMORY, MemorySize.parse("128m"));
    config.set(TaskManagerOptions.CPU_CORES, 1.0);
    TaskExecutorResourceUtils.adjustForLocalExecution(config);
    final RpcService rpcService = RpcSystem.load().remoteServiceBuilder(config, "localhost", "0").createAndStart();
    try {
        final Deadline deadline = Deadline.fromNow(TEST_TIMEOUT);
        // Coordination directory
        coordinateTempDir = temporaryFolder.newFolder();
        // Start first process
        dispatcherProcesses[0] = new DispatcherProcess(0, config);
        dispatcherProcesses[0].startProcess();
        highAvailabilityServices = HighAvailabilityServicesUtils.createAvailableOrEmbeddedServices(config, TestingUtils.defaultExecutor(), NoOpFatalErrorHandler.INSTANCE);
        final PluginManager pluginManager = PluginUtils.createPluginManagerFromRootFolder(config);
        // Start the task manager process
        for (int i = 0; i < numberOfTaskManagers; i++) {
            taskManagerRunners[i] = new TaskManagerRunner(config, pluginManager, TaskManagerRunner::createTaskExecutorService);
            taskManagerRunners[i].start();
        }
        // Leader listener
        TestingListener leaderListener = new TestingListener();
        leaderRetrievalService = highAvailabilityServices.getDispatcherLeaderRetriever();
        leaderRetrievalService.start(leaderListener);
        // Initial submission
        leaderListener.waitForNewLeader(deadline.timeLeft().toMillis());
        String leaderAddress = leaderListener.getAddress();
        UUID leaderId = leaderListener.getLeaderSessionID();
        final CompletableFuture<DispatcherGateway> dispatcherGatewayFuture = rpcService.connect(leaderAddress, DispatcherId.fromUuid(leaderId), DispatcherGateway.class);
        final DispatcherGateway dispatcherGateway = dispatcherGatewayFuture.get();
        // Wait for all task managers to connect to the leading job manager
        waitForTaskManagers(numberOfTaskManagers, dispatcherGateway, deadline.timeLeft());
        final File coordinateDirClosure = coordinateTempDir;
        final Throwable[] errorRef = new Throwable[1];
        // we trigger program execution in a separate thread
        Thread programTrigger = new Thread("Program Trigger") {

            @Override
            public void run() {
                try {
                    testJobManagerFailure(zooKeeper.getConnectString(), coordinateDirClosure, zookeeperStoragePath);
                } catch (Throwable t) {
                    t.printStackTrace();
                    errorRef[0] = t;
                }
            }
        };
        // start the test program
        programTrigger.start();
        // wait until all marker files are in place, indicating that all tasks have started
        AbstractTaskManagerProcessFailureRecoveryTest.waitForMarkerFiles(coordinateTempDir, READY_MARKER_FILE_PREFIX, PARALLELISM, deadline.timeLeft().toMillis());
        // Kill one of the job managers and trigger recovery
        dispatcherProcesses[0].destroy();
        dispatcherProcesses[1] = new DispatcherProcess(1, config);
        dispatcherProcesses[1].startProcess();
        // we create the marker file which signals the program functions tasks that they can
        // complete
        AbstractTaskManagerProcessFailureRecoveryTest.touchFile(new File(coordinateTempDir, PROCEED_MARKER_FILE));
        programTrigger.join(deadline.timeLeft().toMillis());
        // We wait for the finish marker file. We don't wait for the program trigger, because
        // we submit in detached mode.
        AbstractTaskManagerProcessFailureRecoveryTest.waitForMarkerFiles(coordinateTempDir, FINISH_MARKER_FILE_PREFIX, 1, deadline.timeLeft().toMillis());
        // check that the program really finished
        assertFalse("The program did not finish in time", programTrigger.isAlive());
        // check whether the program encountered an error
        if (errorRef[0] != null) {
            Throwable error = errorRef[0];
            error.printStackTrace();
            fail("The program encountered a " + error.getClass().getSimpleName() + " : " + error.getMessage());
        }
    } catch (Throwable t) {
        // Print early (in some situations the process logs get too big
        // for Travis and the root problem is not shown)
        t.printStackTrace();
        for (DispatcherProcess p : dispatcherProcesses) {
            if (p != null) {
                p.printProcessLog();
            }
        }
        throw t;
    } finally {
        for (int i = 0; i < numberOfTaskManagers; i++) {
            if (taskManagerRunners[i] != null) {
                taskManagerRunners[i].close();
            }
        }
        if (leaderRetrievalService != null) {
            leaderRetrievalService.stop();
        }
        for (DispatcherProcess dispatcherProcess : dispatcherProcesses) {
            if (dispatcherProcess != null) {
                dispatcherProcess.destroy();
            }
        }
        if (highAvailabilityServices != null) {
            highAvailabilityServices.closeAndCleanupAllData();
        }
        RpcUtils.terminateRpcService(rpcService, timeout);
        // Delete coordination directory
        if (coordinateTempDir != null) {
            try {
                FileUtils.deleteDirectory(coordinateTempDir);
            } catch (Throwable ignored) {
            }
        }
    }
}
Also used : Configuration(org.apache.flink.configuration.Configuration) Deadline(org.apache.flink.api.common.time.Deadline) Time(org.apache.flink.api.common.time.Time) DispatcherProcess(org.apache.flink.runtime.testutils.DispatcherProcess) DispatcherGateway(org.apache.flink.runtime.dispatcher.DispatcherGateway) PluginManager(org.apache.flink.core.plugin.PluginManager) TestingListener(org.apache.flink.runtime.leaderelection.TestingListener) TaskManagerRunner(org.apache.flink.runtime.taskexecutor.TaskManagerRunner) HighAvailabilityServices(org.apache.flink.runtime.highavailability.HighAvailabilityServices) LeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService) RpcService(org.apache.flink.runtime.rpc.RpcService) UUID(java.util.UUID) File(java.io.File) Test(org.junit.Test)

Aggregations

File (java.io.File)1 UUID (java.util.UUID)1 Deadline (org.apache.flink.api.common.time.Deadline)1 Time (org.apache.flink.api.common.time.Time)1 Configuration (org.apache.flink.configuration.Configuration)1 PluginManager (org.apache.flink.core.plugin.PluginManager)1 DispatcherGateway (org.apache.flink.runtime.dispatcher.DispatcherGateway)1 HighAvailabilityServices (org.apache.flink.runtime.highavailability.HighAvailabilityServices)1 TestingListener (org.apache.flink.runtime.leaderelection.TestingListener)1 LeaderRetrievalService (org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService)1 RpcService (org.apache.flink.runtime.rpc.RpcService)1 TaskManagerRunner (org.apache.flink.runtime.taskexecutor.TaskManagerRunner)1 DispatcherProcess (org.apache.flink.runtime.testutils.DispatcherProcess)1 Test (org.junit.Test)1