Search in sources :

Example 71 with ActorRef

use of akka.actor.ActorRef in project flink by apache.

the class JobManagerHAJobGraphRecoveryITCase method testClientNonDetachedListeningBehaviour.

/**
 * Tests that clients receive updates after recovery by a new leader.
 *
 * <p>Flow of the test as written below: two JobManager processes and one TaskManager are
 * started, a blocking job is submitted in non-detached mode through a recording test client,
 * the leading JobManager process is destroyed, the test waits for a new leader to report the
 * job as RUNNING again, cancels the job, and finally counts the {@code JobSubmitSuccess}
 * messages recorded by the client.
 *
 * @throws Exception any failure is rethrown after the JobManager process logs were printed
 */
@Test
public void testClientNonDetachedListeningBehaviour() throws Exception {
    Configuration config = ZooKeeperTestUtils.createZooKeeperHAConfig(ZooKeeper.getConnectString(), FileStateBackendBasePath.getPath());
    // Test actor system (created inside the try block so that the finally block can clean up)
    ActorSystem testSystem = null;
    // JobManager setup. Start the job managers as separate processes in order to not run the
    // actors postStop, which cleans up all running jobs.
    JobManagerProcess[] jobManagerProcess = new JobManagerProcess[2];
    LeaderRetrievalService leaderRetrievalService = null;
    ActorSystem taskManagerSystem = null;
    try {
        // Single deadline shared by all blocking calls of this test
        final Deadline deadline = TestTimeOut.fromNow();
        // Test actor system
        testSystem = AkkaUtils.createActorSystem(new Configuration(), new Some<>(new Tuple2<String, Object>("localhost", 0)));
        // The job managers
        jobManagerProcess[0] = new JobManagerProcess(0, config);
        jobManagerProcess[1] = new JobManagerProcess(1, config);
        jobManagerProcess[0].startProcess();
        jobManagerProcess[1].startProcess();
        // Leader listener
        TestingListener leaderListener = new TestingListener();
        leaderRetrievalService = ZooKeeperUtils.createLeaderRetrievalService(config);
        leaderRetrievalService.start(leaderListener);
        // The task manager (runs in its own actor system inside this JVM)
        taskManagerSystem = AkkaUtils.createActorSystem(AkkaUtils.getDefaultAkkaConfig());
        TaskManager.startTaskManagerComponentsAndActor(config, ResourceID.generate(), taskManagerSystem, "localhost", Option.<String>empty(), Option.<LeaderRetrievalService>empty(), false, TaskManager.class);
        // Client test actor; it records the messages it receives so they can be inspected below
        TestActorRef<RecordingTestClient> clientRef = TestActorRef.create(testSystem, Props.create(RecordingTestClient.class));
        JobGraph jobGraph = createBlockingJobGraph();
        {
            // Initial submission
            leaderListener.waitForNewLeader(deadline.timeLeft().toMillis());
            String leaderAddress = leaderListener.getAddress();
            UUID leaderId = leaderListener.getLeaderSessionID();
            // The client
            AkkaActorGateway client = new AkkaActorGateway(clientRef, leaderId);
            // Get the leader ref
            ActorRef leaderRef = AkkaUtils.getActorRef(leaderAddress, testSystem, deadline.timeLeft());
            ActorGateway leader = new AkkaActorGateway(leaderRef, leaderId);
            // Poll the leader until it reports a non-zero total number of slots
            int numSlots = 0;
            while (numSlots == 0) {
                Future<?> slotsFuture = leader.ask(JobManagerMessages.getRequestTotalNumberOfSlots(), deadline.timeLeft());
                numSlots = (Integer) Await.result(slotsFuture, deadline.timeLeft());
            }
            // Submit the job in non-detached mode
            leader.tell(new SubmitJob(jobGraph, ListeningBehaviour.EXECUTION_RESULT_AND_STATE_CHANGES), client);
            JobManagerActorTestUtils.waitForJobStatus(jobGraph.getJobID(), JobStatus.RUNNING, leader, deadline.timeLeft());
        }
        // Who's the boss? Compare the first process' Akka URL with the announced leader address.
        JobManagerProcess leadingJobManagerProcess;
        if (jobManagerProcess[0].getJobManagerAkkaURL(deadline.timeLeft()).equals(leaderListener.getAddress())) {
            leadingJobManagerProcess = jobManagerProcess[0];
        } else {
            leadingJobManagerProcess = jobManagerProcess[1];
        }
        // Kill the leading job manager process
        leadingJobManagerProcess.destroy();
        {
            // Recovery by the standby JobManager
            leaderListener.waitForNewLeader(deadline.timeLeft().toMillis());
            String leaderAddress = leaderListener.getAddress();
            UUID leaderId = leaderListener.getLeaderSessionID();
            ActorRef leaderRef = AkkaUtils.getActorRef(leaderAddress, testSystem, deadline.timeLeft());
            ActorGateway leader = new AkkaActorGateway(leaderRef, leaderId);
            // The job must be reported as RUNNING by the new leader before it is cancelled
            JobManagerActorTestUtils.waitForJobStatus(jobGraph.getJobID(), JobStatus.RUNNING, leader, deadline.timeLeft());
            // Cancel the job
            leader.tell(new JobManagerMessages.CancelJob(jobGraph.getJobID()));
        }
        // Wait for the execution result
        clientRef.underlyingActor().awaitJobResult(deadline.timeLeft().toMillis());
        // Count the submission acknowledgements recorded by the client
        int jobSubmitSuccessMessages = 0;
        for (Object msg : clientRef.underlyingActor().getMessages()) {
            if (msg instanceof JobManagerMessages.JobSubmitSuccess) {
                jobSubmitSuccessMessages++;
            }
        }
        // At least two submissions should be ack-ed (initial and recovery). This is quite
        // conservative, but it is still possible that these messages are overtaken by the
        // final message.
        // NOTE(review): the comment above says "at least two", but the assertion below requires
        // exactly two acknowledgements - confirm which of the two is intended.
        assertEquals(2, jobSubmitSuccessMessages);
    } catch (Throwable t) {
        // Print early (in some situations the process logs get too big
        // for Travis and the root problem is not shown)
        t.printStackTrace();
        // In case of an error, print the job manager process logs.
        if (jobManagerProcess[0] != null) {
            jobManagerProcess[0].printProcessLog();
        }
        if (jobManagerProcess[1] != null) {
            jobManagerProcess[1].printProcessLog();
        }
        throw t;
    } finally {
        // Clean up everything that was started; each resource may still be null if setup failed
        // early. The already killed leader process is destroyed a second time here.
        if (jobManagerProcess[0] != null) {
            jobManagerProcess[0].destroy();
        }
        if (jobManagerProcess[1] != null) {
            jobManagerProcess[1].destroy();
        }
        if (leaderRetrievalService != null) {
            leaderRetrievalService.stop();
        }
        if (taskManagerSystem != null) {
            taskManagerSystem.shutdown();
        }
        if (testSystem != null) {
            testSystem.shutdown();
        }
    }
}
Also used : ActorSystem(akka.actor.ActorSystem) AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) Configuration(org.apache.flink.configuration.Configuration) ActorRef(akka.actor.ActorRef) TestActorRef(akka.testkit.TestActorRef) TestingListener(org.apache.flink.runtime.leaderelection.TestingListener) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) UUID(java.util.UUID) SubmitJob(org.apache.flink.runtime.messages.JobManagerMessages.SubmitJob) Deadline(scala.concurrent.duration.Deadline) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) SubmittedJobGraph(org.apache.flink.runtime.jobmanager.SubmittedJobGraph) Some(scala.Some) LeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService) JobManagerProcess(org.apache.flink.runtime.testutils.JobManagerProcess) Future(scala.concurrent.Future) Test(org.junit.Test)

Example 72 with ActorRef

use of akka.actor.ActorRef in project flink by apache.

the class JobManagerHAProcessFailureBatchRecoveryITCase method testJobManagerProcessFailure.

/**
 * Runs a batch program against a ZooKeeper HA setup, destroys the first JobManager process
 * while the program's tasks are running, starts a second JobManager process and verifies that
 * the program still finishes without an error.
 *
 * <p>Coordination between this test and the program's tasks goes through marker files in a
 * temporary directory.
 *
 * @throws Exception any failure is rethrown after the JobManager process logs were printed
 */
@Test
public void testJobManagerProcessFailure() throws Exception {
    // Config
    final int numberOfJobManagers = 2;
    final int numberOfTaskManagers = 2;
    final int numberOfSlotsPerTaskManager = 2;
    assertEquals(PARALLELISM, numberOfTaskManagers * numberOfSlotsPerTaskManager);
    // Setup
    // Test actor system (initialised to null so that the finally block can shut it down)
    ActorSystem testActorSystem = null;
    // Job managers
    final JobManagerProcess[] jmProcess = new JobManagerProcess[numberOfJobManagers];
    // Task managers
    final ActorSystem[] tmActorSystem = new ActorSystem[numberOfTaskManagers];
    // Leader election service
    LeaderRetrievalService leaderRetrievalService = null;
    // Coordination between the processes goes through a directory
    File coordinateTempDir = null;
    try {
        // Single deadline shared by all blocking calls of this test
        final Deadline deadline = TestTimeOut.fromNow();
        // Coordination directory
        coordinateTempDir = createTempDirectory();
        // Job Managers
        Configuration config = ZooKeeperTestUtils.createZooKeeperHAConfig(ZooKeeper.getConnectString(), FileStateBackendBasePath.getPath());
        // Start first process
        jmProcess[0] = new JobManagerProcess(0, config);
        jmProcess[0].startProcess();
        // Task manager configuration
        config.setInteger(ConfigConstants.TASK_MANAGER_MEMORY_SIZE_KEY, 4);
        config.setInteger(ConfigConstants.TASK_MANAGER_NETWORK_NUM_BUFFERS_KEY, 100);
        config.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, 2);
        // Start the task managers, each in its own actor system inside this JVM
        for (int i = 0; i < numberOfTaskManagers; i++) {
            tmActorSystem[i] = AkkaUtils.createActorSystem(AkkaUtils.getDefaultAkkaConfig());
            TaskManager.startTaskManagerComponentsAndActor(config, ResourceID.generate(), tmActorSystem[i], "localhost", Option.<String>empty(), Option.<LeaderRetrievalService>empty(), false, TaskManager.class);
        }
        // Test actor system
        testActorSystem = AkkaUtils.createActorSystem(AkkaUtils.getDefaultAkkaConfig());
        // The returned ref is not used; the call is kept for its blocking lookup of the first JobManager
        jmProcess[0].getActorRef(testActorSystem, deadline.timeLeft());
        // Leader listener
        TestingListener leaderListener = new TestingListener();
        leaderRetrievalService = ZooKeeperUtils.createLeaderRetrievalService(config);
        leaderRetrievalService.start(leaderListener);
        // Initial submission
        leaderListener.waitForNewLeader(deadline.timeLeft().toMillis());
        String leaderAddress = leaderListener.getAddress();
        UUID leaderId = leaderListener.getLeaderSessionID();
        // Get the leader ref
        ActorRef leaderRef = AkkaUtils.getActorRef(leaderAddress, testActorSystem, deadline.timeLeft());
        ActorGateway leaderGateway = new AkkaActorGateway(leaderRef, leaderId);
        // Wait for all task managers to connect to the leading job manager
        JobManagerActorTestUtils.waitForTaskManagers(numberOfTaskManagers, leaderGateway, deadline.timeLeft());
        final File coordinateDirClosure = coordinateTempDir;
        // Single-element array so that the trigger thread can hand an error back to this thread
        final Throwable[] errorRef = new Throwable[1];
        // we trigger program execution in a separate thread
        Thread programTrigger = new Thread("Program Trigger") {

            @Override
            public void run() {
                try {
                    testJobManagerFailure(ZooKeeper.getConnectString(), coordinateDirClosure);
                } catch (Throwable t) {
                    t.printStackTrace();
                    errorRef[0] = t;
                }
            }
        };
        //start the test program
        programTrigger.start();
        // wait until all marker files are in place, indicating that all tasks have started
        AbstractTaskManagerProcessFailureRecoveryTest.waitForMarkerFiles(coordinateTempDir, READY_MARKER_FILE_PREFIX, PARALLELISM, deadline.timeLeft().toMillis());
        // Kill one of the job managers and trigger recovery
        jmProcess[0].destroy();
        jmProcess[1] = new JobManagerProcess(1, config);
        jmProcess[1].startProcess();
        jmProcess[1].getActorRef(testActorSystem, deadline.timeLeft());
        // we create the marker file which signals the program functions tasks that they can complete
        AbstractTaskManagerProcessFailureRecoveryTest.touchFile(new File(coordinateTempDir, PROCEED_MARKER_FILE));
        programTrigger.join(deadline.timeLeft().toMillis());
        // We wait for the finish marker file. We don't wait for the program trigger, because
        // we submit in detached mode.
        AbstractTaskManagerProcessFailureRecoveryTest.waitForMarkerFiles(coordinateTempDir, FINISH_MARKER_FILE_PREFIX, 1, deadline.timeLeft().toMillis());
        // check that the program really finished
        assertFalse("The program did not finish in time", programTrigger.isAlive());
        // check whether the program encountered an error
        if (errorRef[0] != null) {
            Throwable error = errorRef[0];
            error.printStackTrace();
            fail("The program encountered a " + error.getClass().getSimpleName() + " : " + error.getMessage());
        }
    } catch (Throwable t) {
        // Print early (in some situations the process logs get too big
        // for Travis and the root problem is not shown)
        t.printStackTrace();
        for (JobManagerProcess p : jmProcess) {
            if (p != null) {
                p.printProcessLog();
            }
        }
        throw t;
    } finally {
        // Clean up everything that was started; each resource may still be null if setup failed early
        for (int i = 0; i < numberOfTaskManagers; i++) {
            if (tmActorSystem[i] != null) {
                tmActorSystem[i].shutdown();
            }
        }
        // The test actor system was previously never shut down and leaked past the test
        if (testActorSystem != null) {
            testActorSystem.shutdown();
        }
        if (leaderRetrievalService != null) {
            leaderRetrievalService.stop();
        }
        for (JobManagerProcess jmProces : jmProcess) {
            if (jmProces != null) {
                jmProces.destroy();
            }
        }
        // Delete coordination directory
        if (coordinateTempDir != null) {
            try {
                FileUtils.deleteDirectory(coordinateTempDir);
            } catch (Throwable ignored) {
                // best-effort cleanup: a failure to delete the directory must not mask the test result
            }
        }
    }
}
Also used : ActorSystem(akka.actor.ActorSystem) AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) Configuration(org.apache.flink.configuration.Configuration) ActorRef(akka.actor.ActorRef) Deadline(scala.concurrent.duration.Deadline) TestingListener(org.apache.flink.runtime.leaderelection.TestingListener) LeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService) AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) JobManagerProcess(org.apache.flink.runtime.testutils.JobManagerProcess) UUID(java.util.UUID) File(java.io.File) Test(org.junit.Test)

Example 73 with ActorRef

use of akka.actor.ActorRef in project flink by apache.

the class ProcessFailureCancelingITCase method testCancelingOnProcessFailure.

/**
 * Starts a JobManager in this JVM and a TaskManager in a separate process, destroys the
 * TaskManager process, immediately submits a blocking job and then cancels it. The test
 * expects the submitting thread to terminate within two minutes with a
 * {@code ProgramInvocationException}.
 *
 * <p>The test returns early without doing anything when no java executable can be found.
 */
@Test
public void testCancelingOnProcessFailure() {
    // Collects the error stream of the TaskManager process so it can be printed on failure
    final StringWriter processOutput = new StringWriter();
    ActorSystem jmActorSystem = null;
    Process taskManagerProcess = null;
    try {
        // check that we run this test only if the java command
        // is available on this machine
        String javaCommand = getJavaCommandPath();
        if (javaCommand == null) {
            System.out.println("---- Skipping Process Failure test : Could not find java executable ----");
            return;
        }
        // create a logging file for the process
        File tempLogFile = File.createTempFile(getClass().getSimpleName() + "-", "-log4j.properties");
        tempLogFile.deleteOnExit();
        CommonTestUtils.printLog4jDebugConfig(tempLogFile);
        // find a free port to start the JobManager
        final int jobManagerPort = NetUtils.getAvailablePort();
        // start a JobManager
        Tuple2<String, Object> localAddress = new Tuple2<String, Object>("localhost", jobManagerPort);
        Configuration jmConfig = new Configuration();
        // The heartbeat pause is set to 2000 s; see the comment before the join further below,
        // which relies on heartbeat-based failure detection taking longer than the test.
        jmConfig.setString(ConfigConstants.AKKA_WATCH_HEARTBEAT_INTERVAL, "5 s");
        jmConfig.setString(ConfigConstants.AKKA_WATCH_HEARTBEAT_PAUSE, "2000 s");
        jmConfig.setInteger(ConfigConstants.AKKA_WATCH_THRESHOLD, 10);
        jmConfig.setString(ConfigConstants.AKKA_ASK_TIMEOUT, "100 s");
        jmActorSystem = AkkaUtils.createActorSystem(jmConfig, new Some<>(localAddress));
        ActorRef jmActor = JobManager.startJobManagerActors(jmConfig, jmActorSystem, TestingUtils.defaultExecutor(), TestingUtils.defaultExecutor(), JobManager.class, MemoryArchivist.class)._1();
        // the TaskManager java command
        String[] command = new String[] { javaCommand, "-Dlog.level=DEBUG", "-Dlog4j.configuration=file:" + tempLogFile.getAbsolutePath(), "-Xms80m", "-Xmx80m", "-classpath", getCurrentClasspath(), AbstractTaskManagerProcessFailureRecoveryTest.TaskManagerProcessEntryPoint.class.getName(), String.valueOf(jobManagerPort) };
        // start the TaskManager process (only a single process is started in this test)
        taskManagerProcess = new ProcessBuilder(command).start();
        new CommonTestUtils.PipeForwarder(taskManagerProcess.getErrorStream(), processOutput);
        // we wait for the JobManager to have the TaskManager available (the call below waits for 1)
        // since some of the CI environments are very hostile, we need to give this a lot of time (2 minutes)
        waitUntilNumTaskManagersAreRegistered(jmActor, 1, 120000);
        // Single-element array so that the program thread can hand its error back to this thread
        final Throwable[] errorRef = new Throwable[1];
        // start the test program, which infinitely blocks
        Runnable programRunner = new Runnable() {

            @Override
            public void run() {
                try {
                    ExecutionEnvironment env = ExecutionEnvironment.createRemoteEnvironment("localhost", jobManagerPort);
                    env.setParallelism(2);
                    env.setRestartStrategy(RestartStrategies.noRestart());
                    env.getConfig().disableSysoutLogging();
                    env.generateSequence(0, Long.MAX_VALUE).map(new MapFunction<Long, Long>() {

                        @Override
                        public Long map(Long value) throws Exception {
                            // block forever: nothing in this test notifies this monitor
                            synchronized (this) {
                                wait();
                            }
                            return 0L;
                        }
                    }).output(new DiscardingOutputFormat<Long>());
                    env.execute();
                } catch (Throwable t) {
                    errorRef[0] = t;
                }
            }
        };
        Thread programThread = new Thread(programRunner);
        // kill the TaskManager
        taskManagerProcess.destroy();
        // null the reference so that the finally block does not destroy the process again
        taskManagerProcess = null;
        // immediately submit the job. this should hit the case
        // where the JobManager still thinks it has the TaskManager and tries to send it tasks
        programThread.start();
        // try to cancel the job
        cancelRunningJob(jmActor);
        // we should see a failure within reasonable time.
        // NOTE(review): this comment used to cite a 10s ask timeout, but jmConfig above sets
        // AKKA_ASK_TIMEOUT to "100 s" - confirm which timeout actually applies here.
        // since the CI environment is often slow, we conservatively give it up to 2 minutes,
        // to fail, which is much lower than the failure time given by the heartbeats ( > 2000s)
        programThread.join(120000);
        assertFalse("The program did not cancel in time (2 minutes)", programThread.isAlive());
        Throwable error = errorRef[0];
        assertNotNull("The program did not fail properly", error);
        assertTrue(error instanceof ProgramInvocationException);
        // all seems well :-)
    } catch (Exception e) {
        // Exceptions are turned into a test failure after printing the TaskManager output
        e.printStackTrace();
        printProcessLog("TaskManager", processOutput.toString());
        fail(e.getMessage());
    } catch (Error e) {
        // Errors (this includes failed assertions) are rethrown after printing the TaskManager output
        e.printStackTrace();
        printProcessLog("TaskManager 1", processOutput.toString());
        throw e;
    } finally {
        // Clean up whatever is still running
        if (taskManagerProcess != null) {
            taskManagerProcess.destroy();
        }
        if (jmActorSystem != null) {
            jmActorSystem.shutdown();
        }
    }
}
Also used : ActorSystem(akka.actor.ActorSystem) MemoryArchivist(org.apache.flink.runtime.jobmanager.MemoryArchivist) ExecutionEnvironment(org.apache.flink.api.java.ExecutionEnvironment) Configuration(org.apache.flink.configuration.Configuration) ActorRef(akka.actor.ActorRef) JobManager(org.apache.flink.runtime.jobmanager.JobManager) MapFunction(org.apache.flink.api.common.functions.MapFunction) StringWriter(java.io.StringWriter) ProgramInvocationException(org.apache.flink.client.program.ProgramInvocationException) TimeoutException(java.util.concurrent.TimeoutException) Some(scala.Some) Tuple2(scala.Tuple2) ProgramInvocationException(org.apache.flink.client.program.ProgramInvocationException) File(java.io.File) Test(org.junit.Test)

Example 74 with ActorRef

use of akka.actor.ActorRef in project flink by apache.

the class ChaosMonkeyITCase method submitJobGraph.

// - Utilities ---------------------------------------------------------------------------------
/**
 * Sends a detached {@code SubmitJob} message for the given job graph to a JobManager process.
 *
 * <p>The message is sent via {@code tell}; this method does not wait for a reply.
 *
 * @param jobGraph the job graph to submit
 * @param jobManager the JobManager process that receives the submission
 * @param leaderListener listener from which the current leader session ID is read
 * @param actorSystem actor system used to resolve the JobManager actor reference
 * @param timeout timeout for resolving the JobManager actor reference
 * @throws Exception if resolving the actor reference fails
 */
private void submitJobGraph(JobGraph jobGraph, JobManagerProcess jobManager, TestingListener leaderListener, ActorSystem actorSystem, FiniteDuration timeout) throws Exception {
    // Resolve the target first, then read the session ID (same order as before)
    final ActorRef targetRef = jobManager.getActorRef(actorSystem, timeout);
    final UUID sessionId = leaderListener.getLeaderSessionID();
    new AkkaActorGateway(targetRef, sessionId)
            .tell(new JobManagerMessages.SubmitJob(jobGraph, ListeningBehaviour.DETACHED));
}
Also used : AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) ActorRef(akka.actor.ActorRef) JobManagerMessages(org.apache.flink.runtime.messages.JobManagerMessages) UUID(java.util.UUID)

Example 75 with ActorRef

use of akka.actor.ActorRef in project flink by apache.

the class ChaosMonkeyITCase method waitForJobRemoved.

/**
 * Requests the archive actor from the given JobManager and then polls that archive for the
 * status of the job. Polling stops as soon as the archive answers with anything other than
 * {@code JobNotFound}; between polls the thread sleeps for 100 ms.
 *
 * <p>If the timeout elapses while the archive still answers {@code JobNotFound}, the method
 * returns normally without signalling an error.
 *
 * @param jobId ID of the job to look up in the archive
 * @param jobManager the JobManager process whose archive is queried
 * @param actorSystem actor system used to resolve the JobManager actor reference
 * @param timeout timeout used for the lookups and as the overall polling budget
 * @throws Exception if one of the lookups or requests fails
 */
private void waitForJobRemoved(JobID jobId, JobManagerProcess jobManager, ActorSystem actorSystem, FiniteDuration timeout) throws Exception {
    // Ask the JobManager for its archive actor (both gateways are created without a leader session ID)
    final AkkaActorGateway jmGateway = new AkkaActorGateway(jobManager.getActorRef(actorSystem, timeout), null);
    final Future<Object> archiveReply = jmGateway.ask(JobManagerMessages.getRequestArchive(), timeout);
    final JobManagerMessages.ResponseArchive archiveResponse = (JobManagerMessages.ResponseArchive) Await.result(archiveReply, timeout);
    final AkkaActorGateway archiveGateway = new AkkaActorGateway(archiveResponse.actor(), null);

    // Poll the archive until it knows the job or the time budget is used up
    final Deadline pollDeadline = timeout.fromNow();
    while (pollDeadline.hasTimeLeft()) {
        final JobManagerMessages.JobStatusResponse status = JobManagerActorTestUtils.requestJobStatus(jobId, archiveGateway, pollDeadline.timeLeft());
        if (!(status instanceof JobManagerMessages.JobNotFound)) {
            return;
        }
        Thread.sleep(100);
    }
}
Also used : AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) ActorRef(akka.actor.ActorRef) Deadline(scala.concurrent.duration.Deadline) JobManagerMessages(org.apache.flink.runtime.messages.JobManagerMessages)

Aggregations

ActorRef (akka.actor.ActorRef)79 Test (org.junit.Test)53 Configuration (org.apache.flink.configuration.Configuration)43 AkkaActorGateway (org.apache.flink.runtime.instance.AkkaActorGateway)33 ActorGateway (org.apache.flink.runtime.instance.ActorGateway)30 ActorSystem (akka.actor.ActorSystem)28 FiniteDuration (scala.concurrent.duration.FiniteDuration)26 JobID (org.apache.flink.api.common.JobID)22 Props (akka.actor.Props)20 JobVertexID (org.apache.flink.runtime.jobgraph.JobVertexID)18 UUID (java.util.UUID)17 JavaTestKit (akka.testkit.JavaTestKit)16 IOException (java.io.IOException)15 TaskManagerServicesConfiguration (org.apache.flink.runtime.taskexecutor.TaskManagerServicesConfiguration)15 TaskDeploymentDescriptor (org.apache.flink.runtime.deployment.TaskDeploymentDescriptor)13 ExecutionAttemptID (org.apache.flink.runtime.executiongraph.ExecutionAttemptID)13 TestingLeaderRetrievalService (org.apache.flink.runtime.leaderelection.TestingLeaderRetrievalService)13 File (java.io.File)12 ExecutionConfig (org.apache.flink.api.common.ExecutionConfig)12 JobGraph (org.apache.flink.runtime.jobgraph.JobGraph)12