Search in sources :

Example 1 with JobManager

use of org.apache.flink.runtime.jobmanager.JobManager in project flink by apache.

the class AbstractTaskManagerProcessFailureRecoveryTest method testTaskManagerProcessFailure.

/**
 * Runs a JobManager inside this JVM and TaskManagers in separate child processes, kills
 * one TaskManager process while the test program is running, and then checks that the
 * program thread finishes without having recorded an error.
 *
 * <p>Returns early (printing a "Skipping" notice) when no java executable is found.
 */
@Test
public void testTaskManagerProcessFailure() {
    // error-stream output of the three TaskManager processes; dumped by the catch blocks below
    final StringWriter processOutput1 = new StringWriter();
    final StringWriter processOutput2 = new StringWriter();
    final StringWriter processOutput3 = new StringWriter();
    // declared outside the try so that the finally block can clean them up
    ActorSystem jmActorSystem = null;
    Process taskManagerProcess1 = null;
    Process taskManagerProcess2 = null;
    Process taskManagerProcess3 = null;
    File coordinateTempDir = null;
    try {
        // check that we run this test only if the java command
        // is available on this machine
        String javaCommand = getJavaCommandPath();
        if (javaCommand == null) {
            System.out.println("---- Skipping Process Failure test : Could not find java executable ----");
            return;
        }
        // create a log4j configuration file that is handed to the child processes
        File tempLogFile = File.createTempFile(getClass().getSimpleName() + "-", "-log4j.properties");
        tempLogFile.deleteOnExit();
        CommonTestUtils.printLog4jDebugConfig(tempLogFile);
        // coordination between the processes goes through a directory
        coordinateTempDir = CommonTestUtils.createTempDirectory();
        // find a free port to start the JobManager
        final int jobManagerPort = NetUtils.getAvailablePort();
        // start a JobManager
        Tuple2<String, Object> localAddress = new Tuple2<String, Object>("localhost", jobManagerPort);
        Configuration jmConfig = new Configuration();
        // NOTE(review): these watch/restart/ask settings are presumably tuned for detecting the
        // killed TaskManager and restarting the job in reasonable time - confirm before changing
        jmConfig.setString(ConfigConstants.AKKA_WATCH_HEARTBEAT_INTERVAL, "1000 ms");
        jmConfig.setString(ConfigConstants.AKKA_WATCH_HEARTBEAT_PAUSE, "6 s");
        jmConfig.setInteger(ConfigConstants.AKKA_WATCH_THRESHOLD, 9);
        jmConfig.setString(ConfigConstants.RESTART_STRATEGY_FIXED_DELAY_DELAY, "10 s");
        jmConfig.setString(ConfigConstants.AKKA_ASK_TIMEOUT, "100 s");
        jmActorSystem = AkkaUtils.createActorSystem(jmConfig, new Some<>(localAddress));
        ActorRef jmActor = JobManager.startJobManagerActors(jmConfig, jmActorSystem, TestingUtils.defaultExecutor(), TestingUtils.defaultExecutor(), JobManager.class, MemoryArchivist.class)._1();
        // the TaskManager java command; the JobManager port is passed as the only program argument
        String[] command = new String[] { javaCommand, "-Dlog.level=DEBUG", "-Dlog4j.configuration=file:" + tempLogFile.getAbsolutePath(), "-Xms80m", "-Xmx80m", "-classpath", getCurrentClasspath(), TaskManagerProcessEntryPoint.class.getName(), String.valueOf(jobManagerPort) };
        // start the first two TaskManager processes and attach a forwarder from each
        // process's error stream to its StringWriter
        taskManagerProcess1 = new ProcessBuilder(command).start();
        new CommonTestUtils.PipeForwarder(taskManagerProcess1.getErrorStream(), processOutput1);
        taskManagerProcess2 = new ProcessBuilder(command).start();
        new CommonTestUtils.PipeForwarder(taskManagerProcess2.getErrorStream(), processOutput2);
        // we wait for the JobManager to have the two TaskManagers available
        // since some of the CI environments are very hostile, we need to give this a lot of time (2 minutes)
        waitUntilNumTaskManagersAreRegistered(jmActor, 2, 120000);
        // the program will set a marker file in each of its parallel tasks once they are ready, so that
        // this coordinating code is aware of this.
        // the program will very slowly consume elements until the marker file (later created by the
        // test driver code) is present
        final File coordinateDirClosure = coordinateTempDir;
        // holds the first Throwable thrown by the program thread, if any
        final AtomicReference<Throwable> errorRef = new AtomicReference<>();
        // we trigger program execution in a separate thread
        Thread programTrigger = new Thread("Program Trigger") {

            @Override
            public void run() {
                try {
                    testTaskManagerFailure(jobManagerPort, coordinateDirClosure);
                } catch (Throwable t) {
                    t.printStackTrace();
                    errorRef.set(t);
                }
            }
        };
        //start the test program
        programTrigger.start();
        // wait for the ready marker files; the timeout passed here is 120000 ms (2 minutes)
        if (!waitForMarkerFiles(coordinateTempDir, READY_MARKER_FILE_PREFIX, PARALLELISM, 120000)) {
            // check if the program failed for some reason
            if (errorRef.get() != null) {
                Throwable error = errorRef.get();
                error.printStackTrace();
                fail("The program encountered a " + error.getClass().getSimpleName() + " : " + error.getMessage());
            } else {
                // no error occurred, simply a timeout
                fail("The tasks were not started within time (" + 120000 + "msecs)");
            }
        }
        // start the third TaskManager
        taskManagerProcess3 = new ProcessBuilder(command).start();
        new CommonTestUtils.PipeForwarder(taskManagerProcess3.getErrorStream(), processOutput3);
        // we wait for the third TaskManager to register
        // since some of the CI environments are very hostile, we need to give this a lot of time (2 minutes)
        waitUntilNumTaskManagersAreRegistered(jmActor, 3, 120000);
        // kill one of the previous TaskManagers, triggering a failure and recovery;
        // the reference is nulled so the finally block does not destroy it a second time
        taskManagerProcess1.destroy();
        taskManagerProcess1 = null;
        // create the marker file that signals the program's tasks that they may complete
        touchFile(new File(coordinateTempDir, PROCEED_MARKER_FILE));
        // wait for at most 5 minutes (300000 ms) for the program to complete
        programTrigger.join(300000);
        // check that the program really finished
        assertFalse("The program did not finish in time", programTrigger.isAlive());
        // check whether the program encountered an error
        if (errorRef.get() != null) {
            Throwable error = errorRef.get();
            error.printStackTrace();
            fail("The program encountered a " + error.getClass().getSimpleName() + " : " + error.getMessage());
        }
    // reaching this point means the test passed
    } catch (Exception e) {
        // unexpected exception: dump the TaskManager logs and fail the test
        e.printStackTrace();
        printProcessLog("TaskManager 1", processOutput1.toString());
        printProcessLog("TaskManager 2", processOutput2.toString());
        printProcessLog("TaskManager 3", processOutput3.toString());
        fail(e.getMessage());
    } catch (Error e) {
        // Errors (presumably including the assertion failures raised by fail/assertFalse above)
        // are rethrown unchanged after dumping the TaskManager logs
        e.printStackTrace();
        printProcessLog("TaskManager 1", processOutput1.toString());
        printProcessLog("TaskManager 2", processOutput2.toString());
        printProcessLog("TaskManager 3", processOutput3.toString());
        throw e;
    } finally {
        // destroy whichever child processes are still referenced
        if (taskManagerProcess1 != null) {
            taskManagerProcess1.destroy();
        }
        if (taskManagerProcess2 != null) {
            taskManagerProcess2.destroy();
        }
        if (taskManagerProcess3 != null) {
            taskManagerProcess3.destroy();
        }
        if (jmActorSystem != null) {
            jmActorSystem.shutdown();
        }
        if (coordinateTempDir != null) {
            try {
                FileUtils.deleteDirectory(coordinateTempDir);
            } catch (Throwable t) {
            // best-effort cleanup of the coordination directory; a failure here is ignored
            }
        }
    }
}
Also used : ActorSystem(akka.actor.ActorSystem) MemoryArchivist(org.apache.flink.runtime.jobmanager.MemoryArchivist) Configuration(org.apache.flink.configuration.Configuration) ActorRef(akka.actor.ActorRef) AtomicReference(java.util.concurrent.atomic.AtomicReference) JobManager(org.apache.flink.runtime.jobmanager.JobManager) TimeoutException(java.util.concurrent.TimeoutException) IOException(java.io.IOException) Some(scala.Some) StringWriter(java.io.StringWriter) Tuple2(scala.Tuple2) File(java.io.File) Test(org.junit.Test)

Example 2 with JobManager

use of org.apache.flink.runtime.jobmanager.JobManager in project flink by apache.

the class TaskManagerProcessReapingTestBase method testReapProcessOnFailure.

/**
 * Launches a TaskManager in a child JVM, lets the subclass hook act on the running
 * TaskManager actor, and then expects the child process to exit with
 * {@code TaskManager.RUNTIME_FAILURE_RETURN_CODE()}.
 *
 * <p>Returns early (printing a "Skipping" notice) when no java executable is found.
 */
@Test
public void testReapProcessOnFailure() {
    final StringWriter tmStderr = new StringWriter();
    ActorSystem jobManagerSystem = null;
    Process tmProcess = null;
    try {
        // the child JVM can only be spawned if a java executable is available on this machine
        final String javaBinary = getJavaCommandPath();
        if (javaBinary == null) {
            System.out.println("---- Skipping TaskManagerProcessReapingTest : Could not find java executable ----");
            return;
        }

        // log4j configuration file handed to the child process
        final File log4jConfig = File.createTempFile("testlogconfig", "properties");
        log4jConfig.deleteOnExit();
        CommonTestUtils.printLog4jDebugConfig(log4jConfig);

        // JobManager on a free local port
        final int jobManagerPort = NetUtils.getAvailablePort();
        final Tuple2<String, Object> jmAddress = new Tuple2<String, Object>("localhost", jobManagerPort);
        jobManagerSystem = AkkaUtils.createActorSystem(new Configuration(), new Some<Tuple2<String, Object>>(jmAddress));
        final ActorRef jobManagerRef = JobManager.startJobManagerActors(
            new Configuration(),
            jobManagerSystem,
            TestingUtils.defaultExecutor(),
            TestingUtils.defaultExecutor(),
            JobManager.class,
            MemoryArchivist.class)._1;

        // ResourceManager, pointed at the JobManager through its akka URL
        final StandaloneLeaderRetrievalService leaderRetrieval =
            new StandaloneLeaderRetrievalService(AkkaUtils.getAkkaURL(jobManagerSystem, jobManagerRef));
        FlinkResourceManager.startResourceManagerActors(
            new Configuration(), jobManagerSystem, leaderRetrieval, StandaloneResourceManager.class);

        // TaskManager child process; JobManager port and TaskManager port are its two arguments
        final int taskManagerPort = NetUtils.getAvailablePort();
        final String[] tmCommand = {
            javaBinary,
            "-Dlog.level=DEBUG",
            "-Dlog4j.configuration=file:" + log4jConfig.getAbsolutePath(),
            "-Xms256m",
            "-Xmx256m",
            "-classpath",
            getCurrentClasspath(),
            TaskManagerTestEntryPoint.class.getName(),
            String.valueOf(jobManagerPort),
            String.valueOf(taskManagerPort)
        };
        tmProcess = new ProcessBuilder(tmCommand).start();
        new PipeForwarder(tmProcess.getErrorStream(), tmStderr);

        // look up the remote TaskManager actor: up to 40 attempts, pausing 500 ms after each
        // failed one, because the child process needs time to come up
        final String tmHostAndPort = "localhost:" + taskManagerPort;
        final String taskManagerActorName =
            String.format("akka.tcp://flink@%s/user/%s", tmHostAndPort, TaskManager.TASK_MANAGER_NAME());
        final FiniteDuration lookupTimeout = new FiniteDuration(25, TimeUnit.SECONDS);
        ActorRef taskManagerRef = null;
        Throwable lastLookupFailure = null;
        int attempt = 0;
        while (attempt < 40) {
            try {
                taskManagerRef = TaskManager.getTaskManagerRemoteReference(taskManagerActorName, jobManagerSystem, lookupTimeout);
                break;
            } catch (Throwable t) {
                // TaskManager probably not ready yet; remember the failure and retry
                lastLookupFailure = t;
            }
            Thread.sleep(500);
            attempt++;
        }

        assertTrue("TaskManager process died", isProcessAlive(tmProcess));
        if (taskManagerRef == null) {
            if (lastLookupFailure != null) {
                lastLookupFailure.printStackTrace();
            }
            fail("TaskManager process did not launch the TaskManager properly. Failed to look up " + taskManagerActorName);
        }

        // hand the running TaskManager to the subclass hook
        onTaskManagerProcessRunning(taskManagerRef);

        // poll every 100 ms, for at most 10 seconds, until the child process has exited
        final long terminationDeadline = System.currentTimeMillis() + 10000;
        while (System.currentTimeMillis() < terminationDeadline && isProcessAlive(tmProcess)) {
            Thread.sleep(100);
        }

        assertFalse("TaskManager process did not terminate upon actor death", isProcessAlive(tmProcess));
        final int exitCode = tmProcess.exitValue();
        assertEquals("TaskManager died, but not because of the process reaper", TaskManager.RUNTIME_FAILURE_RETURN_CODE(), exitCode);
        onTaskManagerProcessTerminated(tmStderr.toString());
    } catch (Exception e) {
        // unexpected exception: dump the child process log and fail the test
        e.printStackTrace();
        printProcessLog(tmStderr.toString());
        fail(e.getMessage());
    } catch (Error e) {
        // errors are rethrown unchanged after dumping the child process log
        e.printStackTrace();
        printProcessLog(tmStderr.toString());
        throw e;
    } finally {
        if (tmProcess != null) {
            tmProcess.destroy();
        }
        if (jobManagerSystem != null) {
            jobManagerSystem.shutdown();
        }
    }
}
Also used : ActorSystem(akka.actor.ActorSystem) MemoryArchivist(org.apache.flink.runtime.jobmanager.MemoryArchivist) Configuration(org.apache.flink.configuration.Configuration) ActorRef(akka.actor.ActorRef) FiniteDuration(scala.concurrent.duration.FiniteDuration) JobManager(org.apache.flink.runtime.jobmanager.JobManager) IOException(java.io.IOException) Some(scala.Some) StringWriter(java.io.StringWriter) Tuple2(scala.Tuple2) StandaloneLeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.StandaloneLeaderRetrievalService) File(java.io.File) Test(org.junit.Test)

Example 3 with JobManager

use of org.apache.flink.runtime.jobmanager.JobManager in project flink by apache.

the class TaskManagerMetricsTest method testMetricRegistryLifeCycle.

/**
 * Checks that the TaskManager's metric registry is not shut down when the
 * TaskManager is told to re-register at the JobManager.
 */
@Test
public void testMetricRegistryLifeCycle() throws Exception {
    ActorSystem system = null;
    try {
        system = AkkaUtils.createLocalActorSystem(new Configuration());

        // ---------------------------- JobManager ----------------------------
        final ActorRef jobManager = JobManager.startJobManagerActors(
            new Configuration(),
            system,
            TestingUtils.defaultExecutor(),
            TestingUtils.defaultExecutor(),
            JobManager.class,
            MemoryArchivist.class)._1();
        final String jobManagerPath = jobManager.path().toString();
        final LeaderRetrievalService leaderRetrieval = new StandaloneLeaderRetrievalService(jobManagerPath);

        // ---------------------------- TaskManager ----------------------------
        final Configuration tmFlinkConfig = new Configuration();
        final ResourceID tmResourceId = ResourceID.generate();
        final TaskManagerServicesConfiguration servicesConfig =
            TaskManagerServicesConfiguration.fromConfiguration(tmFlinkConfig, InetAddress.getLocalHost(), false);
        final TaskManagerConfiguration tmConfig = TaskManagerConfiguration.fromConfiguration(tmFlinkConfig);
        final TaskManagerServices services = TaskManagerServices.fromConfiguration(servicesConfig, tmResourceId);
        final MetricRegistry tmRegistry = services.getMetricRegistry();

        // actor props for the TaskManager, assembled from the services created above
        final Props tmProps = TaskManager.getTaskManagerProps(
            TaskManager.class,
            tmConfig,
            tmResourceId,
            services.getTaskManagerLocation(),
            services.getMemoryManager(),
            services.getIOManager(),
            services.getNetworkEnvironment(),
            leaderRetrieval,
            tmRegistry);
        final ActorRef taskManager = system.actorOf(tmProps);

        new JavaTestKit(system) {

            {
                new Within(new FiniteDuration(5000, TimeUnit.SECONDS)) {

                    @Override
                    protected void run() {
                        final ActorRef probe = getTestActor();

                        // first registration: ask to be notified and wait for the confirmation
                        taskManager.tell(TaskManagerMessages.getNotifyWhenRegisteredAtJobManagerMessage(), probe);
                        expectMsgEquals(TaskManagerMessages.getRegisteredAtJobManagerMessage());

                        // announce the same JobManager address again; this should make the
                        // TaskManager disconnect from the current JobManager and re-register
                        taskManager.tell(new TaskManagerMessages.JobManagerLeaderAddress(jobManagerPath, null), jobManager);

                        // second registration: wait until the re-registration has completed
                        taskManager.tell(TaskManagerMessages.getNotifyWhenRegisteredAtJobManagerMessage(), probe);
                        expectMsgEquals(TaskManagerMessages.getRegisteredAtJobManagerMessage());
                    }
                };
            }
        };

        // the disconnect must not have shut the registry down
        Assert.assertFalse(tmRegistry.isShutdown());

        // stop the actor system and wait for it to terminate
        system.shutdown();
        system.awaitTermination();
    } finally {
        if (system != null) {
            system.shutdown();
        }
    }
}
Also used : ActorSystem(akka.actor.ActorSystem) TaskManagerConfiguration(org.apache.flink.runtime.taskexecutor.TaskManagerConfiguration) MemoryArchivist(org.apache.flink.runtime.jobmanager.MemoryArchivist) Configuration(org.apache.flink.configuration.Configuration) TaskManagerConfiguration(org.apache.flink.runtime.taskexecutor.TaskManagerConfiguration) TaskManagerServicesConfiguration(org.apache.flink.runtime.taskexecutor.TaskManagerServicesConfiguration) ActorRef(akka.actor.ActorRef) TaskManagerServices(org.apache.flink.runtime.taskexecutor.TaskManagerServices) FiniteDuration(scala.concurrent.duration.FiniteDuration) JobManager(org.apache.flink.runtime.jobmanager.JobManager) Props(akka.actor.Props) TaskManagerMessages(org.apache.flink.runtime.messages.TaskManagerMessages) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) LeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService) StandaloneLeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.StandaloneLeaderRetrievalService) StandaloneLeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.StandaloneLeaderRetrievalService) TaskManagerServicesConfiguration(org.apache.flink.runtime.taskexecutor.TaskManagerServicesConfiguration) JavaTestKit(akka.testkit.JavaTestKit) Test(org.junit.Test)

Example 4 with JobManager

use of org.apache.flink.runtime.jobmanager.JobManager in project flink by apache.

the class TaskManagerComponentsStartupShutdownTest method testComponentsStartupShutdown.

/**
 * Starts a JobManager and a manually assembled TaskManager in one local actor system,
 * kills both actors, and asserts that the TaskManager's network environment, I/O
 * manager and memory manager report being shut down afterwards.
 */
@Test
public void testComponentsStartupShutdown() {
    final int bufferSize = 32 * 1024;
    final String[] tmpDirs = { ConfigConstants.DEFAULT_TASK_MANAGER_TMP_PATH };
    final Time tmTimeout = Time.seconds(100);

    final Configuration config = new Configuration();
    config.setString(ConfigConstants.AKKA_WATCH_HEARTBEAT_INTERVAL, "200 ms");
    config.setString(ConfigConstants.AKKA_WATCH_HEARTBEAT_PAUSE, "1 s");
    config.setInteger(ConfigConstants.AKKA_WATCH_THRESHOLD, 1);

    ActorSystem system = null;
    try {
        system = AkkaUtils.createLocalActorSystem(config);

        // JobManager plus a standalone ResourceManager in the same actor system
        final ActorRef jobManager = JobManager.startJobManagerActors(
            config,
            system,
            TestingUtils.defaultExecutor(),
            TestingUtils.defaultExecutor(),
            JobManager.class,
            MemoryArchivist.class)._1();
        FlinkResourceManager.startResourceManagerActors(
            config,
            system,
            LeaderRetrievalUtils.createLeaderRetrievalService(config, jobManager),
            StandaloneResourceManager.class);

        // build every TaskManager component by hand so each can be inspected after shutdown
        final int slotCount = 1;
        final TaskManagerConfiguration tmConfig = new TaskManagerConfiguration(
            slotCount,
            tmpDirs,
            tmTimeout,
            null,
            Time.milliseconds(500),
            Time.seconds(30),
            Time.seconds(10),
            // cleanup interval
            1000000,
            config,
            // exit-jvm-on-fatal-error
            false);
        final NetworkEnvironmentConfiguration netConf =
            new NetworkEnvironmentConfiguration(32, bufferSize, MemoryType.HEAP, IOManager.IOMode.SYNC, 0, 0, 2, 8, null);
        final ResourceID taskManagerId = ResourceID.generate();
        final TaskManagerLocation tmLocation = new TaskManagerLocation(taskManagerId, InetAddress.getLocalHost(), 10000);
        final MemoryManager memManager = new MemoryManager(32 * bufferSize, 1, bufferSize, MemoryType.HEAP, false);
        final IOManager ioManager = new IOManagerAsync(tmpDirs);

        final NetworkBufferPool bufferPool =
            new NetworkBufferPool(netConf.numNetworkBuffers(), netConf.networkBufferSize(), netConf.memoryType());
        final NetworkEnvironment network = new NetworkEnvironment(
            bufferPool,
            new LocalConnectionManager(),
            new ResultPartitionManager(),
            new TaskEventDispatcher(),
            new KvStateRegistry(),
            null,
            netConf.ioMode(),
            netConf.partitionRequestInitialBackoff(),
            netConf.partitionRequestMaxBackoff(),
            netConf.networkBuffersPerChannel(),
            netConf.extraNetworkBuffersPerGate());
        network.start();

        final LeaderRetrievalService leaderRetrieval = new StandaloneLeaderRetrievalService(jobManager.path().toString());
        final MetricRegistryConfiguration registryConfig = MetricRegistryConfiguration.fromConfiguration(config);
        final MetricRegistry metricRegistry = new MetricRegistry(registryConfig);

        // create the TaskManager actor from the hand-built components
        final Props tmProps = Props.create(
            TaskManager.class,
            tmConfig,
            taskManagerId,
            tmLocation,
            memManager,
            ioManager,
            network,
            slotCount,
            leaderRetrieval,
            metricRegistry);
        final ActorRef taskManager = system.actorOf(tmProps);

        new JavaTestKit(system) {

            {
                // block until the TaskManager reports that it is registered at the JobManager
                new Within(new FiniteDuration(5000, TimeUnit.SECONDS)) {

                    @Override
                    protected void run() {
                        taskManager.tell(TaskManagerMessages.getNotifyWhenRegisteredAtJobManagerMessage(), getTestActor());
                        expectMsgEquals(TaskManagerMessages.getRegisteredAtJobManagerMessage());
                    }
                };
            }
        };

        // kill the TaskManager and JobManager actors
        taskManager.tell(Kill.getInstance(), ActorRef.noSender());
        jobManager.tell(Kill.getInstance(), ActorRef.noSender());

        // stop the actor system, wait for termination, and clear the reference so the
        // finally block does not shut it down again
        system.shutdown();
        system.awaitTermination();
        system = null;

        // with the TaskManager gone, its components must be shut down as well
        assertTrue(network.isShutdown());
        assertTrue(ioManager.isProperlyShutDown());
        assertTrue(memManager.isShutdown());
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
    } finally {
        if (system != null) {
            system.shutdown();
        }
    }
}
Also used : ActorSystem(akka.actor.ActorSystem) KvStateRegistry(org.apache.flink.runtime.query.KvStateRegistry) MemoryArchivist(org.apache.flink.runtime.jobmanager.MemoryArchivist) MetricRegistryConfiguration(org.apache.flink.runtime.metrics.MetricRegistryConfiguration) Configuration(org.apache.flink.configuration.Configuration) TaskManagerConfiguration(org.apache.flink.runtime.taskexecutor.TaskManagerConfiguration) ActorRef(akka.actor.ActorRef) Time(org.apache.flink.api.common.time.Time) JobManager(org.apache.flink.runtime.jobmanager.JobManager) MetricRegistryConfiguration(org.apache.flink.runtime.metrics.MetricRegistryConfiguration) Props(akka.actor.Props) IOManagerAsync(org.apache.flink.runtime.io.disk.iomanager.IOManagerAsync) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) TaskManagerConfiguration(org.apache.flink.runtime.taskexecutor.TaskManagerConfiguration) IOManager(org.apache.flink.runtime.io.disk.iomanager.IOManager) MetricRegistry(org.apache.flink.runtime.metrics.MetricRegistry) FiniteDuration(scala.concurrent.duration.FiniteDuration) MemoryManager(org.apache.flink.runtime.memory.MemoryManager) ResultPartitionManager(org.apache.flink.runtime.io.network.partition.ResultPartitionManager) NetworkBufferPool(org.apache.flink.runtime.io.network.buffer.NetworkBufferPool) LocalConnectionManager(org.apache.flink.runtime.io.network.LocalConnectionManager) LeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService) StandaloneLeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.StandaloneLeaderRetrievalService) StandaloneLeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.StandaloneLeaderRetrievalService) NetworkEnvironment(org.apache.flink.runtime.io.network.NetworkEnvironment) TaskEventDispatcher(org.apache.flink.runtime.io.network.TaskEventDispatcher) JavaTestKit(akka.testkit.JavaTestKit) Test(org.junit.Test)

Example 5 with JobManager

use of org.apache.flink.runtime.jobmanager.JobManager in project flink by apache.

the class ProcessFailureCancelingITCase method testCancelingOnProcessFailure.

/**
 * Kills the only TaskManager process, immediately submits a job that would block forever,
 * requests cancellation, and expects the submitting thread to end within 2 minutes with a
 * {@code ProgramInvocationException}.
 *
 * <p>Returns early (printing a "Skipping" notice) when no java executable is found.
 */
@Test
public void testCancelingOnProcessFailure() {
    // error-stream output of the TaskManager process; dumped by the catch blocks below
    final StringWriter processOutput = new StringWriter();
    // declared outside the try so that the finally block can clean them up
    ActorSystem jmActorSystem = null;
    Process taskManagerProcess = null;
    try {
        // check that we run this test only if the java command
        // is available on this machine
        String javaCommand = getJavaCommandPath();
        if (javaCommand == null) {
            System.out.println("---- Skipping Process Failure test : Could not find java executable ----");
            return;
        }
        // create a log4j configuration file that is handed to the child process
        File tempLogFile = File.createTempFile(getClass().getSimpleName() + "-", "-log4j.properties");
        tempLogFile.deleteOnExit();
        CommonTestUtils.printLog4jDebugConfig(tempLogFile);
        // find a free port to start the JobManager
        final int jobManagerPort = NetUtils.getAvailablePort();
        // start a JobManager
        Tuple2<String, Object> localAddress = new Tuple2<String, Object>("localhost", jobManagerPort);
        Configuration jmConfig = new Configuration();
        // the heartbeat pause is set very high (2000 s) so that a failure seen within the
        // 2 minute wait below cannot be attributed to the heartbeats
        jmConfig.setString(ConfigConstants.AKKA_WATCH_HEARTBEAT_INTERVAL, "5 s");
        jmConfig.setString(ConfigConstants.AKKA_WATCH_HEARTBEAT_PAUSE, "2000 s");
        jmConfig.setInteger(ConfigConstants.AKKA_WATCH_THRESHOLD, 10);
        jmConfig.setString(ConfigConstants.AKKA_ASK_TIMEOUT, "100 s");
        jmActorSystem = AkkaUtils.createActorSystem(jmConfig, new Some<>(localAddress));
        ActorRef jmActor = JobManager.startJobManagerActors(jmConfig, jmActorSystem, TestingUtils.defaultExecutor(), TestingUtils.defaultExecutor(), JobManager.class, MemoryArchivist.class)._1();
        // the TaskManager java command; the JobManager port is passed as the only program argument
        String[] command = new String[] { javaCommand, "-Dlog.level=DEBUG", "-Dlog4j.configuration=file:" + tempLogFile.getAbsolutePath(), "-Xms80m", "-Xmx80m", "-classpath", getCurrentClasspath(), AbstractTaskManagerProcessFailureRecoveryTest.TaskManagerProcessEntryPoint.class.getName(), String.valueOf(jobManagerPort) };
        // start the single TaskManager process and attach a forwarder from its error stream
        taskManagerProcess = new ProcessBuilder(command).start();
        new CommonTestUtils.PipeForwarder(taskManagerProcess.getErrorStream(), processOutput);
        // we wait for the JobManager to have the one TaskManager registered
        // since some of the CI environments are very hostile, we need to give this a lot of time (2 minutes)
        waitUntilNumTaskManagersAreRegistered(jmActor, 1, 120000);
        // one-element array used as a mutable holder for the program thread's failure;
        // it is read only after join() on that thread
        final Throwable[] errorRef = new Throwable[1];
        // the test program; its map function blocks forever in wait()
        Runnable programRunner = new Runnable() {

            @Override
            public void run() {
                try {
                    ExecutionEnvironment env = ExecutionEnvironment.createRemoteEnvironment("localhost", jobManagerPort);
                    env.setParallelism(2);
                    env.setRestartStrategy(RestartStrategies.noRestart());
                    env.getConfig().disableSysoutLogging();
                    env.generateSequence(0, Long.MAX_VALUE).map(new MapFunction<Long, Long>() {

                        @Override
                        public Long map(Long value) throws Exception {
                            // nothing in this test calls notify on this object
                            synchronized (this) {
                                wait();
                            }
                            return 0L;
                        }
                    }).output(new DiscardingOutputFormat<Long>());
                    env.execute();
                } catch (Throwable t) {
                    errorRef[0] = t;
                }
            }
        };
        // the thread is created here but only started after the TaskManager has been killed
        Thread programThread = new Thread(programRunner);
        // kill the TaskManager; the reference is nulled so the finally block does not destroy it again
        taskManagerProcess.destroy();
        taskManagerProcess = null;
        // immediately submit the job. this should hit the case
        // where the JobManager still thinks it has the TaskManager and tries to send it tasks
        programThread.start();
        // try to cancel the job
        cancelRunningJob(jmActor);
        // we should see a failure within reasonable time (the ask timeout configured above is 100 s).
        // since the CI environment is often slow, we conservatively give it up to 2 minutes
        // to fail, which is much lower than the failure time given by the heartbeats ( > 2000s)
        programThread.join(120000);
        assertFalse("The program did not cancel in time (2 minutes)", programThread.isAlive());
        Throwable error = errorRef[0];
        assertNotNull("The program did not fail properly", error);
        assertTrue(error instanceof ProgramInvocationException);
    // reaching this point means the test passed
    } catch (Exception e) {
        // unexpected exception: dump the TaskManager log and fail the test
        e.printStackTrace();
        printProcessLog("TaskManager", processOutput.toString());
        fail(e.getMessage());
    } catch (Error e) {
        // Errors (presumably including the assertion failures raised above) are rethrown
        // unchanged after dumping the TaskManager log
        e.printStackTrace();
        printProcessLog("TaskManager 1", processOutput.toString());
        throw e;
    } finally {
        if (taskManagerProcess != null) {
            taskManagerProcess.destroy();
        }
        if (jmActorSystem != null) {
            jmActorSystem.shutdown();
        }
    }
}
Also used : ActorSystem(akka.actor.ActorSystem) MemoryArchivist(org.apache.flink.runtime.jobmanager.MemoryArchivist) ExecutionEnvironment(org.apache.flink.api.java.ExecutionEnvironment) Configuration(org.apache.flink.configuration.Configuration) ActorRef(akka.actor.ActorRef) JobManager(org.apache.flink.runtime.jobmanager.JobManager) MapFunction(org.apache.flink.api.common.functions.MapFunction) StringWriter(java.io.StringWriter) ProgramInvocationException(org.apache.flink.client.program.ProgramInvocationException) TimeoutException(java.util.concurrent.TimeoutException) Some(scala.Some) Tuple2(scala.Tuple2) ProgramInvocationException(org.apache.flink.client.program.ProgramInvocationException) File(java.io.File) Test(org.junit.Test)

Aggregations

ActorRef (akka.actor.ActorRef)5 ActorSystem (akka.actor.ActorSystem)5 Configuration (org.apache.flink.configuration.Configuration)5 JobManager (org.apache.flink.runtime.jobmanager.JobManager)5 MemoryArchivist (org.apache.flink.runtime.jobmanager.MemoryArchivist)5 Test (org.junit.Test)5 File (java.io.File)3 StringWriter (java.io.StringWriter)3 StandaloneLeaderRetrievalService (org.apache.flink.runtime.leaderretrieval.StandaloneLeaderRetrievalService)3 Some (scala.Some)3 Tuple2 (scala.Tuple2)3 FiniteDuration (scala.concurrent.duration.FiniteDuration)3 Props (akka.actor.Props)2 JavaTestKit (akka.testkit.JavaTestKit)2 IOException (java.io.IOException)2 TimeoutException (java.util.concurrent.TimeoutException)2 ResourceID (org.apache.flink.runtime.clusterframework.types.ResourceID)2 LeaderRetrievalService (org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService)2 TaskManagerConfiguration (org.apache.flink.runtime.taskexecutor.TaskManagerConfiguration)2 AtomicReference (java.util.concurrent.atomic.AtomicReference)1