Search in sources :

Example 31 with AkkaActorGateway

use of org.apache.flink.runtime.instance.AkkaActorGateway in project flink by apache.

the class AccumulatorLiveITCase method verifyResults.

/**
 * Submits the job and verifies every accumulator update that the JobManager reports while
 * the job is running.
 *
 * <p>The test kit actor registers itself for accumulator change notifications, submits
 * {@code jobGraph}, and then expects, in this order:
 * <ol>
 *   <li>one update checked against the value {@code 0} (map task start),</li>
 *   <li>{@code NUM_ITERATIONS} updates checked against the running sum {@code 1 + 2 + ... + i},</li>
 *   <li>one update checked against the final sum (sink task start),</li>
 *   <li>{@code NUM_ITERATIONS} further updates checked against the same final sum,</li>
 *   <li>a {@code JobResultSuccess} message.</li>
 * </ol>
 * Each update is validated with {@code checkUserAccumulators}; a mismatch fails the test.
 */
private static void verifyResults() {
    new JavaTestKit(system) {

        {
            ActorGateway selfGateway = new AkkaActorGateway(getRef(), jobManagerGateway.leaderSessionID());
            // register for accumulator changes
            jobManagerGateway.tell(new TestingJobManagerMessages.NotifyWhenAccumulatorChange(jobID), selfGateway);
            expectMsgEquals(TIMEOUT, true);
            // submit job
            jobManagerGateway.tell(new JobManagerMessages.SubmitJob(jobGraph, ListeningBehaviour.EXECUTION_RESULT), selfGateway);
            expectMsgClass(TIMEOUT, JobManagerMessages.JobSubmitSuccess.class);
            TestingJobManagerMessages.UpdatedAccumulators msg = (TestingJobManagerMessages.UpdatedAccumulators) receiveOne(TIMEOUT);
            Map<String, Accumulator<?, ?>> userAccumulators = msg.userAccumulators();
            /* Check for accumulator values */
            if (checkUserAccumulators(0, userAccumulators)) {
                LOG.info("Passed initial check for map task.");
            } else {
                fail("Wrong accumulator results when map task begins execution.");
            }
            int expectedAccVal = 0;
            /* for mapper task */
            for (int i = 1; i <= NUM_ITERATIONS; i++) {
                expectedAccVal += i;
                // receive message
                msg = (TestingJobManagerMessages.UpdatedAccumulators) receiveOne(TIMEOUT);
                userAccumulators = msg.userAccumulators();
                LOG.info("{}", userAccumulators);
                // A second, identical check used to follow here as an "else if"; it could never
                // succeed after the first one failed, so a failed check now fails the test directly.
                if (checkUserAccumulators(expectedAccVal, userAccumulators)) {
                    LOG.info("Passed round #" + i);
                } else {
                    fail("Failed in round #" + i);
                }
            }
            msg = (TestingJobManagerMessages.UpdatedAccumulators) receiveOne(TIMEOUT);
            userAccumulators = msg.userAccumulators();
            if (checkUserAccumulators(expectedAccVal, userAccumulators)) {
                LOG.info("Passed initial check for sink task.");
            } else {
                fail("Wrong accumulator results when sink task begins execution.");
            }
            /* for sink task */
            for (int i = 1; i <= NUM_ITERATIONS; i++) {
                // receive message
                msg = (TestingJobManagerMessages.UpdatedAccumulators) receiveOne(TIMEOUT);
                userAccumulators = msg.userAccumulators();
                LOG.info("{}", userAccumulators);
                // the expected value stays at the final sum for every sink-task update
                if (checkUserAccumulators(expectedAccVal, userAccumulators)) {
                    LOG.info("Passed round #" + i);
                } else {
                    fail("Failed in round #" + i);
                }
            }
            expectMsgClass(TIMEOUT, JobManagerMessages.JobResultSuccess.class);
        }
    };
}
Also used : AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) Accumulator(org.apache.flink.api.common.accumulators.Accumulator) ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) JobManagerMessages(org.apache.flink.runtime.messages.JobManagerMessages) TestingJobManagerMessages(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages) TestingJobManagerMessages(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) JavaTestKit(akka.testkit.JavaTestKit)

Example 32 with AkkaActorGateway

use of org.apache.flink.runtime.instance.AkkaActorGateway in project flink by apache.

the class UtilsTest method testYarnFlinkResourceManagerJobManagerLostLeadership.

/**
 * Exercises a {@code TestingYarnFlinkResourceManager} actor through a leader loss followed by
 * a re-announcement of the same leader, and finally asserts that the number of registered
 * resources it reports equals {@code numInitialTaskManagers}.
 *
 * <p>The YARN clients are Mockito mocks; container allocation and container start are
 * simulated with stubbed answers. The whole scenario is bounded by a three minute deadline.
 */
@Test
public void testYarnFlinkResourceManagerJobManagerLostLeadership() throws Exception {
    new JavaTestKit(system) {

        {
            // single deadline shared by every blocking call below
            final Deadline deadline = new FiniteDuration(3, TimeUnit.MINUTES).fromNow();
            Configuration flinkConfig = new Configuration();
            YarnConfiguration yarnConfig = new YarnConfiguration();
            TestingLeaderRetrievalService leaderRetrievalService = new TestingLeaderRetrievalService();
            String applicationMasterHostName = "localhost";
            String webInterfaceURL = "foobar";
            ContaineredTaskManagerParameters taskManagerParameters = new ContaineredTaskManagerParameters(1l, 1l, 1l, 1, new HashMap<String, String>());
            ContainerLaunchContext taskManagerLaunchContext = mock(ContainerLaunchContext.class);
            int yarnHeartbeatIntervalMillis = 1000;
            int maxFailedContainers = 10;
            int numInitialTaskManagers = 5;
            final YarnResourceManagerCallbackHandler callbackHandler = new YarnResourceManagerCallbackHandler();
            AMRMClientAsync<AMRMClient.ContainerRequest> resourceManagerClient = mock(AMRMClientAsync.class);
            NMClient nodeManagerClient = mock(NMClient.class);
            UUID leaderSessionID = UUID.randomUUID();
            // one prepared container per initial task manager
            final List<Container> containerList = new ArrayList<>();
            for (int i = 0; i < numInitialTaskManagers; i++) {
                containerList.add(new TestingContainer("container_" + i, "localhost"));
            }
            // Every addContainerRequest call on the mocked client hands the next prepared
            // container to the callback handler, until the prepared list is exhausted.
            doAnswer(new Answer() {

                int counter = 0;

                @Override
                public Object answer(InvocationOnMock invocation) throws Throwable {
                    if (counter < containerList.size()) {
                        callbackHandler.onContainersAllocated(Collections.singletonList(containerList.get(counter++)));
                    }
                    return null;
                }
            }).when(resourceManagerClient).addContainerRequest(Matchers.any(AMRMClient.ContainerRequest.class));
            ActorRef resourceManager = null;
            ActorRef leader1;
            try {
                // leader1 is a ForwardingActor created with this test kit's ref and the leader
                // session id; presumably it relays what it receives to the test kit - confirm
                // against TestingUtils.ForwardingActor.
                leader1 = system.actorOf(Props.create(TestingUtils.ForwardingActor.class, getRef(), Option.apply(leaderSessionID)));
                resourceManager = system.actorOf(Props.create(TestingYarnFlinkResourceManager.class, flinkConfig, yarnConfig, leaderRetrievalService, applicationMasterHostName, webInterfaceURL, taskManagerParameters, taskManagerLaunchContext, yarnHeartbeatIntervalMillis, maxFailedContainers, numInitialTaskManagers, callbackHandler, resourceManagerClient, nodeManagerClient));
                // announce leader1 as the leader
                leaderRetrievalService.notifyListener(leader1.path().toString(), leaderSessionID);
                final AkkaActorGateway leader1Gateway = new AkkaActorGateway(leader1, leaderSessionID);
                final AkkaActorGateway resourceManagerGateway = new AkkaActorGateway(resourceManager, leaderSessionID);
                // Every startContainer call on the mocked node manager client sends a
                // NotifyResourceStarted for that container to the resource manager, with
                // leader1Gateway as the sender.
                doAnswer(new Answer() {

                    @Override
                    public Object answer(InvocationOnMock invocation) throws Throwable {
                        Container container = (Container) invocation.getArguments()[0];
                        resourceManagerGateway.tell(new NotifyResourceStarted(YarnFlinkResourceManager.extractResourceID(container)), leader1Gateway);
                        return null;
                    }
                }).when(nodeManagerClient).startContainer(Matchers.any(Container.class), Matchers.any(ContainerLaunchContext.class));
                // first registration round with the announced leader
                expectMsgClass(deadline.timeLeft(), RegisterResourceManager.class);
                resourceManagerGateway.tell(new RegisterResourceManagerSuccessful(leader1, Collections.EMPTY_LIST));
                // one Acknowledge is expected per prepared container
                for (int i = 0; i < containerList.size(); i++) {
                    expectMsgClass(deadline.timeLeft(), Acknowledge.class);
                }
                // block until the resource manager answers the NotifyWhenResourcesRegistered request
                Future<Object> taskManagerRegisteredFuture = resourceManagerGateway.ask(new NotifyWhenResourcesRegistered(numInitialTaskManagers), deadline.timeLeft());
                Await.ready(taskManagerRegisteredFuture, deadline.timeLeft());
                // notify with a null leader (presumably signalling the lost leadership this
                // test is named after), then announce the same leader again
                leaderRetrievalService.notifyListener(null, null);
                leaderRetrievalService.notifyListener(leader1.path().toString(), leaderSessionID);
                // second registration round after the re-announcement
                expectMsgClass(deadline.timeLeft(), RegisterResourceManager.class);
                resourceManagerGateway.tell(new RegisterResourceManagerSuccessful(leader1, Collections.EMPTY_LIST));
                // report every container as started again, this time directly from the test
                for (Container container : containerList) {
                    resourceManagerGateway.tell(new NotifyResourceStarted(YarnFlinkResourceManager.extractResourceID(container)), leader1Gateway);
                }
                for (int i = 0; i < containerList.size(); i++) {
                    expectMsgClass(deadline.timeLeft(), Acknowledge.class);
                }
                // the reported number of registered resources must equal the initial count
                Future<Object> numberOfRegisteredResourcesFuture = resourceManagerGateway.ask(RequestNumberOfRegisteredResources.Instance, deadline.timeLeft());
                int numberOfRegisteredResources = (Integer) Await.result(numberOfRegisteredResourcesFuture, deadline.timeLeft());
                assertEquals(numInitialTaskManagers, numberOfRegisteredResources);
            } finally {
                // stop the resource manager actor if it was created
                if (resourceManager != null) {
                    resourceManager.tell(PoisonPill.getInstance(), ActorRef.noSender());
                }
            }
        }
    };
}
Also used : AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) YarnConfiguration(org.apache.hadoop.yarn.conf.YarnConfiguration) Configuration(org.apache.flink.configuration.Configuration) TestingLeaderRetrievalService(org.apache.flink.runtime.leaderelection.TestingLeaderRetrievalService) ActorRef(akka.actor.ActorRef) ArrayList(java.util.ArrayList) ContaineredTaskManagerParameters(org.apache.flink.runtime.clusterframework.ContaineredTaskManagerParameters) Container(org.apache.hadoop.yarn.api.records.Container) TestingUtils(org.apache.flink.runtime.testingUtils.TestingUtils) YarnConfiguration(org.apache.hadoop.yarn.conf.YarnConfiguration) RegisterResourceManagerSuccessful(org.apache.flink.runtime.clusterframework.messages.RegisterResourceManagerSuccessful) UUID(java.util.UUID) Deadline(scala.concurrent.duration.Deadline) FiniteDuration(scala.concurrent.duration.FiniteDuration) ContainerLaunchContext(org.apache.hadoop.yarn.api.records.ContainerLaunchContext) NotifyResourceStarted(org.apache.flink.runtime.clusterframework.messages.NotifyResourceStarted) Mockito.doAnswer(org.mockito.Mockito.doAnswer) Answer(org.mockito.stubbing.Answer) InvocationOnMock(org.mockito.invocation.InvocationOnMock) NMClient(org.apache.hadoop.yarn.client.api.NMClient) JavaTestKit(akka.testkit.JavaTestKit) NotifyWhenResourcesRegistered(org.apache.flink.yarn.messages.NotifyWhenResourcesRegistered) Test(org.junit.Test)

Example 33 with AkkaActorGateway

use of org.apache.flink.runtime.instance.AkkaActorGateway in project flink by apache.

the class JobManagerHACheckpointRecoveryITCase method testCheckpointRecoveryFailure.

/**
 * Tests that the JobManager logs failures during recovery properly.
 *
 * <p>Two JobManager processes and one TaskManager are started, a blocking job is submitted
 * to the leader, the file state backend directory is deleted and the leader is destroyed.
 * The test then polls the output of the remaining JobManager process until it contains the
 * expected failure messages or the test deadline expires.
 *
 * @see <a href="https://issues.apache.org/jira/browse/FLINK-3185">FLINK-3185</a>
 */
@Test
@RetryOnFailure(times = 1)
public void testCheckpointRecoveryFailure() throws Exception {
    final Deadline deadline = TestTimeOut.fromNow();
    Configuration haConfig = ZooKeeperTestUtils.createZooKeeperHAConfig(
            ZooKeeper.getConnectString(),
            FileStateBackendBasePath.getAbsoluteFile().toString());
    haConfig.setInteger(ConfigConstants.LOCAL_NUMBER_JOB_MANAGER, 2);

    JobManagerProcess[] processes = new JobManagerProcess[2];
    LeaderRetrievalService retrievalService = null;
    ActorSystem tmSystem = null;
    ActorSystem testSystem = null;
    try {
        // actor system used by the test itself
        testSystem = AkkaUtils.createActorSystem(new Configuration(), new Some<>(new Tuple2<String, Object>("localhost", 0)));

        // create both job manager processes first, then start them
        for (int i = 0; i < processes.length; i++) {
            processes[i] = new JobManagerProcess(i, haConfig);
        }
        for (JobManagerProcess process : processes) {
            process.startProcess();
        }

        // listen for leader announcements
        TestingListener listener = new TestingListener();
        retrievalService = ZooKeeperUtils.createLeaderRetrievalService(haConfig);
        retrievalService.start(listener);

        // task manager in its own actor system
        tmSystem = AkkaUtils.createActorSystem(haConfig, Option.apply(new Tuple2<String, Object>("localhost", 0)));
        TaskManager.startTaskManagerComponentsAndActor(haConfig, ResourceID.generate(), tmSystem, "localhost", Option.<String>empty(), Option.<LeaderRetrievalService>empty(), false, TaskManager.class);

        // resolve the current leader into a gateway
        listener.waitForNewLeader(deadline.timeLeft().toMillis());
        String addressOfLeader = listener.getAddress();
        UUID sessionOfLeader = listener.getLeaderSessionID();
        ActorRef refOfLeader = AkkaUtils.getActorRef(addressOfLeader, testSystem, deadline.timeLeft());
        ActorGateway leader = new AkkaActorGateway(refOfLeader, sessionOfLeader);

        // figure out which of the two processes is the leader and which is the standby
        boolean firstLeads = processes[0].getJobManagerAkkaURL(deadline.timeLeft()).equals(listener.getAddress());
        JobManagerProcess leaderProcess = processes[firstLeads ? 0 : 1];
        JobManagerProcess standbyProcess = processes[firstLeads ? 1 : 0];

        // job graph consisting of a single blocking vertex
        JobVertex vertex = new JobVertex("Blocking vertex");
        vertex.setInvokableClass(BlockingNoOpInvokable.class);
        JobGraph graph = new JobGraph(vertex);

        // detached submission, then wait until the job is running
        leader.tell(new SubmitJob(graph, ListeningBehaviour.DETACHED));
        JobManagerActorTestUtils.waitForJobStatus(graph.getJobID(), JobStatus.RUNNING, leader, deadline.timeLeft());

        // wipe the state backend directory and take the leader down
        FileUtils.deleteDirectory(FileStateBackendBasePath);
        leaderProcess.destroy();

        // All we can verify is that the standby logs the failed recovery: poll its output
        // every 500 ms until both markers show up or the deadline runs out.
        boolean foundFailureLog = false;
        while (deadline.hasTimeLeft()) {
            String standbyOutput = standbyProcess.getProcessOutput();
            if (standbyOutput == null) {
                log.warn("No process output available.");
            } else if (standbyOutput.contains("Failed to recover job") && standbyOutput.contains("java.io.FileNotFoundException")) {
                foundFailureLog = true;
                break;
            }
            Thread.sleep(500);
        }
        assertTrue("Did not find expected output in logs.", foundFailureLog);
    } catch (Throwable t) {
        // Print the root problem first; the process logs can get too big for Travis
        // and would otherwise hide it.
        t.printStackTrace();
        // then dump the logs of every job manager process that was created
        for (JobManagerProcess process : processes) {
            if (process != null) {
                process.printProcessLog();
            }
        }
        throw t;
    } finally {
        // tear down in the order: job manager processes, leader retrieval, actor systems
        for (JobManagerProcess process : processes) {
            if (process != null) {
                process.destroy();
            }
        }
        if (retrievalService != null) {
            retrievalService.stop();
        }
        if (tmSystem != null) {
            tmSystem.shutdown();
        }
        if (testSystem != null) {
            testSystem.shutdown();
        }
    }
}
Also used : ActorSystem(akka.actor.ActorSystem) AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) Configuration(org.apache.flink.configuration.Configuration) ActorRef(akka.actor.ActorRef) Deadline(scala.concurrent.duration.Deadline) TestingListener(org.apache.flink.runtime.leaderelection.TestingListener) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) Some(scala.Some) JobVertex(org.apache.flink.runtime.jobgraph.JobVertex) LeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) JobManagerProcess(org.apache.flink.runtime.testutils.JobManagerProcess) UUID(java.util.UUID) SubmitJob(org.apache.flink.runtime.messages.JobManagerMessages.SubmitJob) Test(org.junit.Test) RetryOnFailure(org.apache.flink.testutils.junit.RetryOnFailure)

Example 34 with AkkaActorGateway

use of org.apache.flink.runtime.instance.AkkaActorGateway in project flink by apache.

the class JobManagerHAJobGraphRecoveryITCase method testClientNonDetachedListeningBehaviour.

/**
 * Tests that clients receive updates after recovery by a new leader.
 *
 * <p>A blocking job is submitted in non-detached mode through a recording client actor. The
 * leading JobManager process is then destroyed, the job is awaited on the new leader and
 * cancelled, and the messages recorded by the client are inspected.
 */
@Test
public void testClientNonDetachedListeningBehaviour() throws Exception {
    Configuration haConfig = ZooKeeperTestUtils.createZooKeeperHAConfig(ZooKeeper.getConnectString(), FileStateBackendBasePath.getPath());
    // actor system hosting the test client
    ActorSystem clientSystem = null;
    // The job managers run as separate processes so that the actors' postStop, which cleans
    // up all running jobs, is not executed.
    JobManagerProcess[] processes = new JobManagerProcess[2];
    LeaderRetrievalService retrievalService = null;
    ActorSystem tmSystem = null;
    try {
        final Deadline deadline = TestTimeOut.fromNow();
        clientSystem = AkkaUtils.createActorSystem(new Configuration(), new Some<>(new Tuple2<String, Object>("localhost", 0)));

        // create both job manager processes first, then start them
        for (int i = 0; i < processes.length; i++) {
            processes[i] = new JobManagerProcess(i, haConfig);
        }
        for (JobManagerProcess process : processes) {
            process.startProcess();
        }

        // listen for leader announcements
        TestingListener listener = new TestingListener();
        retrievalService = ZooKeeperUtils.createLeaderRetrievalService(haConfig);
        retrievalService.start(listener);

        // task manager in its own actor system
        tmSystem = AkkaUtils.createActorSystem(AkkaUtils.getDefaultAkkaConfig());
        TaskManager.startTaskManagerComponentsAndActor(haConfig, ResourceID.generate(), tmSystem, "localhost", Option.<String>empty(), Option.<LeaderRetrievalService>empty(), false, TaskManager.class);

        // recording client actor and the job to run
        TestActorRef<RecordingTestClient> recordingClient = TestActorRef.create(clientSystem, Props.create(RecordingTestClient.class));
        JobGraph graph = createBlockingJobGraph();

        // ---- initial submission to the first leader ----
        listener.waitForNewLeader(deadline.timeLeft().toMillis());
        String firstLeaderAddress = listener.getAddress();
        UUID firstLeaderId = listener.getLeaderSessionID();
        AkkaActorGateway clientGateway = new AkkaActorGateway(recordingClient, firstLeaderId);
        ActorRef firstLeaderRef = AkkaUtils.getActorRef(firstLeaderAddress, clientSystem, deadline.timeLeft());
        ActorGateway firstLeader = new AkkaActorGateway(firstLeaderRef, firstLeaderId);

        // keep asking until the leader reports a non-zero total number of slots
        int totalSlots;
        do {
            Future<?> slotsReply = firstLeader.ask(JobManagerMessages.getRequestTotalNumberOfSlots(), deadline.timeLeft());
            totalSlots = (Integer) Await.result(slotsReply, deadline.timeLeft());
        } while (totalSlots == 0);

        // non-detached submission with the recording client as the sender
        firstLeader.tell(new SubmitJob(graph, ListeningBehaviour.EXECUTION_RESULT_AND_STATE_CHANGES), clientGateway);
        JobManagerActorTestUtils.waitForJobStatus(graph.getJobID(), JobStatus.RUNNING, firstLeader, deadline.timeLeft());

        // ---- destroy whichever process currently leads ----
        boolean firstLeads = processes[0].getJobManagerAkkaURL(deadline.timeLeft()).equals(listener.getAddress());
        JobManagerProcess leaderProcess = firstLeads ? processes[0] : processes[1];
        leaderProcess.destroy();

        // ---- recovery by the standby job manager ----
        listener.waitForNewLeader(deadline.timeLeft().toMillis());
        String nextLeaderAddress = listener.getAddress();
        UUID nextLeaderId = listener.getLeaderSessionID();
        ActorRef nextLeaderRef = AkkaUtils.getActorRef(nextLeaderAddress, clientSystem, deadline.timeLeft());
        ActorGateway nextLeader = new AkkaActorGateway(nextLeaderRef, nextLeaderId);
        JobManagerActorTestUtils.waitForJobStatus(graph.getJobID(), JobStatus.RUNNING, nextLeader, deadline.timeLeft());
        // cancel the job on the new leader
        nextLeader.tell(new JobManagerMessages.CancelJob(graph.getJobID()));

        // wait until the client has recorded the job result
        recordingClient.underlyingActor().awaitJobResult(deadline.timeLeft().toMillis());

        // count the submission acknowledgements the client recorded
        int ackCount = 0;
        for (Object recorded : recordingClient.underlyingActor().getMessages()) {
            if (recorded instanceof JobManagerMessages.JobSubmitSuccess) {
                ackCount++;
            }
        }
        // Two submissions are expected to be ack-ed (initial and recovery).
        // NOTE(review): the original comment speaks of "at least two" and of acks possibly being
        // overtaken by the final message, yet the assertion demands exactly two - confirm intent.
        assertEquals(2, ackCount);
    } catch (Throwable t) {
        // Print the root problem first; the process logs can get too big for Travis
        // and would otherwise hide it.
        t.printStackTrace();
        // then dump the logs of every job manager process that was created
        for (JobManagerProcess process : processes) {
            if (process != null) {
                process.printProcessLog();
            }
        }
        throw t;
    } finally {
        // tear down in the order: job manager processes, leader retrieval, actor systems
        for (JobManagerProcess process : processes) {
            if (process != null) {
                process.destroy();
            }
        }
        if (retrievalService != null) {
            retrievalService.stop();
        }
        if (tmSystem != null) {
            tmSystem.shutdown();
        }
        if (clientSystem != null) {
            clientSystem.shutdown();
        }
    }
}
Also used : ActorSystem(akka.actor.ActorSystem) AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) Configuration(org.apache.flink.configuration.Configuration) ActorRef(akka.actor.ActorRef) TestActorRef(akka.testkit.TestActorRef) TestingListener(org.apache.flink.runtime.leaderelection.TestingListener) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) UUID(java.util.UUID) SubmitJob(org.apache.flink.runtime.messages.JobManagerMessages.SubmitJob) Deadline(scala.concurrent.duration.Deadline) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) SubmittedJobGraph(org.apache.flink.runtime.jobmanager.SubmittedJobGraph) Some(scala.Some) LeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService) JobManagerProcess(org.apache.flink.runtime.testutils.JobManagerProcess) Future(scala.concurrent.Future) Test(org.junit.Test)

Example 35 with AkkaActorGateway

use of org.apache.flink.runtime.instance.AkkaActorGateway in project flink by apache.

the class JobManagerHAProcessFailureBatchRecoveryITCase method testJobManagerProcessFailure.

/**
 * Runs a program against a JobManager process, destroys that process while the program's
 * tasks are running, starts a second JobManager process and checks that the program still
 * finishes without an error.
 *
 * <p>Coordination with the running program goes through marker files in a temporary
 * directory: the test waits for the READY markers before destroying the first JobManager,
 * creates the PROCEED marker once the second JobManager has been started, and finally waits
 * for a FINISH marker.
 *
 * <p>All actor systems (task managers and the test's own), the leader retrieval service, the
 * JobManager processes and the coordination directory are released in the {@code finally}
 * block.
 */
@Test
public void testJobManagerProcessFailure() throws Exception {
    // Config
    final int numberOfJobManagers = 2;
    final int numberOfTaskManagers = 2;
    final int numberOfSlotsPerTaskManager = 2;
    assertEquals(PARALLELISM, numberOfTaskManagers * numberOfSlotsPerTaskManager);
    // Setup
    // Test actor system (null until created so that the finally block can tell whether
    // there is anything to shut down)
    ActorSystem testActorSystem = null;
    // Job managers
    final JobManagerProcess[] jmProcess = new JobManagerProcess[numberOfJobManagers];
    // Task managers
    final ActorSystem[] tmActorSystem = new ActorSystem[numberOfTaskManagers];
    // Leader election service
    LeaderRetrievalService leaderRetrievalService = null;
    // Coordination between the processes goes through a directory
    File coordinateTempDir = null;
    try {
        final Deadline deadline = TestTimeOut.fromNow();
        // Coordination directory
        coordinateTempDir = createTempDirectory();
        // Job Managers
        Configuration config = ZooKeeperTestUtils.createZooKeeperHAConfig(ZooKeeper.getConnectString(), FileStateBackendBasePath.getPath());
        // Start first process
        jmProcess[0] = new JobManagerProcess(0, config);
        jmProcess[0].startProcess();
        // Task manager configuration
        config.setInteger(ConfigConstants.TASK_MANAGER_MEMORY_SIZE_KEY, 4);
        config.setInteger(ConfigConstants.TASK_MANAGER_NETWORK_NUM_BUFFERS_KEY, 100);
        config.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, 2);
        // Start the task managers, each in its own actor system
        for (int i = 0; i < numberOfTaskManagers; i++) {
            tmActorSystem[i] = AkkaUtils.createActorSystem(AkkaUtils.getDefaultAkkaConfig());
            TaskManager.startTaskManagerComponentsAndActor(config, ResourceID.generate(), tmActorSystem[i], "localhost", Option.<String>empty(), Option.<LeaderRetrievalService>empty(), false, TaskManager.class);
        }
        // Test actor system
        testActorSystem = AkkaUtils.createActorSystem(AkkaUtils.getDefaultAkkaConfig());
        // The returned ref is not used; presumably this call only waits until the
        // JobManager actor can be resolved - confirm against JobManagerProcess.
        jmProcess[0].getActorRef(testActorSystem, deadline.timeLeft());
        // Leader listener
        TestingListener leaderListener = new TestingListener();
        leaderRetrievalService = ZooKeeperUtils.createLeaderRetrievalService(config);
        leaderRetrievalService.start(leaderListener);
        // Initial submission
        leaderListener.waitForNewLeader(deadline.timeLeft().toMillis());
        String leaderAddress = leaderListener.getAddress();
        UUID leaderId = leaderListener.getLeaderSessionID();
        // Get the leader ref
        ActorRef leaderRef = AkkaUtils.getActorRef(leaderAddress, testActorSystem, deadline.timeLeft());
        ActorGateway leaderGateway = new AkkaActorGateway(leaderRef, leaderId);
        // Wait for all task managers to connect to the leading job manager
        JobManagerActorTestUtils.waitForTaskManagers(numberOfTaskManagers, leaderGateway, deadline.timeLeft());
        // final copies for use inside the anonymous thread class
        final File coordinateDirClosure = coordinateTempDir;
        final Throwable[] errorRef = new Throwable[1];
        // we trigger program execution in a separate thread
        Thread programTrigger = new Thread("Program Trigger") {

            @Override
            public void run() {
                try {
                    testJobManagerFailure(ZooKeeper.getConnectString(), coordinateDirClosure);
                } catch (Throwable t) {
                    t.printStackTrace();
                    // hand the failure over to the test thread, which checks it after join()
                    errorRef[0] = t;
                }
            }
        };
        //start the test program
        programTrigger.start();
        // wait until all marker files are in place, indicating that all tasks have started
        AbstractTaskManagerProcessFailureRecoveryTest.waitForMarkerFiles(coordinateTempDir, READY_MARKER_FILE_PREFIX, PARALLELISM, deadline.timeLeft().toMillis());
        // Kill one of the job managers and trigger recovery
        jmProcess[0].destroy();
        jmProcess[1] = new JobManagerProcess(1, config);
        jmProcess[1].startProcess();
        jmProcess[1].getActorRef(testActorSystem, deadline.timeLeft());
        // we create the marker file which signals the program functions tasks that they can complete
        AbstractTaskManagerProcessFailureRecoveryTest.touchFile(new File(coordinateTempDir, PROCEED_MARKER_FILE));
        programTrigger.join(deadline.timeLeft().toMillis());
        // We wait for the finish marker file. We don't wait for the program trigger, because
        // we submit in detached mode.
        AbstractTaskManagerProcessFailureRecoveryTest.waitForMarkerFiles(coordinateTempDir, FINISH_MARKER_FILE_PREFIX, 1, deadline.timeLeft().toMillis());
        // check that the program really finished
        assertFalse("The program did not finish in time", programTrigger.isAlive());
        // check whether the program encountered an error
        if (errorRef[0] != null) {
            Throwable error = errorRef[0];
            error.printStackTrace();
            fail("The program encountered a " + error.getClass().getSimpleName() + " : " + error.getMessage());
        }
    } catch (Throwable t) {
        // Print early (in some situations the process logs get too big
        // for Travis and the root problem is not shown)
        t.printStackTrace();
        for (JobManagerProcess p : jmProcess) {
            if (p != null) {
                p.printProcessLog();
            }
        }
        throw t;
    } finally {
        for (int i = 0; i < numberOfTaskManagers; i++) {
            if (tmActorSystem[i] != null) {
                tmActorSystem[i].shutdown();
            }
        }
        // The test's own actor system was previously never shut down and leaked past the test.
        if (testActorSystem != null) {
            testActorSystem.shutdown();
        }
        if (leaderRetrievalService != null) {
            leaderRetrievalService.stop();
        }
        for (JobManagerProcess process : jmProcess) {
            if (process != null) {
                process.destroy();
            }
        }
        // Delete coordination directory
        if (coordinateTempDir != null) {
            try {
                FileUtils.deleteDirectory(coordinateTempDir);
            } catch (Throwable ignored) {
                // best-effort cleanup: a failure to delete the temp directory must not mask
                // the actual test outcome
            }
        }
    }
}
Also used : ActorSystem(akka.actor.ActorSystem) AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) Configuration(org.apache.flink.configuration.Configuration) ActorRef(akka.actor.ActorRef) Deadline(scala.concurrent.duration.Deadline) TestingListener(org.apache.flink.runtime.leaderelection.TestingListener) LeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService) AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) JobManagerProcess(org.apache.flink.runtime.testutils.JobManagerProcess) UUID(java.util.UUID) File(java.io.File) Test(org.junit.Test)

Aggregations

AkkaActorGateway (org.apache.flink.runtime.instance.AkkaActorGateway)44 ActorGateway (org.apache.flink.runtime.instance.ActorGateway)37 Test (org.junit.Test)35 ActorRef (akka.actor.ActorRef)33 Configuration (org.apache.flink.configuration.Configuration)30 JavaTestKit (akka.testkit.JavaTestKit)21 JobID (org.apache.flink.api.common.JobID)18 JobGraph (org.apache.flink.runtime.jobgraph.JobGraph)17 ExecutionAttemptID (org.apache.flink.runtime.executiongraph.ExecutionAttemptID)15 JobVertex (org.apache.flink.runtime.jobgraph.JobVertex)14 JobVertexID (org.apache.flink.runtime.jobgraph.JobVertexID)14 JobManagerMessages (org.apache.flink.runtime.messages.JobManagerMessages)14 SubmitJob (org.apache.flink.runtime.messages.JobManagerMessages.SubmitJob)14 TaskManagerServicesConfiguration (org.apache.flink.runtime.taskexecutor.TaskManagerServicesConfiguration)14 ExecutionConfig (org.apache.flink.api.common.ExecutionConfig)12 TaskDeploymentDescriptor (org.apache.flink.runtime.deployment.TaskDeploymentDescriptor)12 SubmitTask (org.apache.flink.runtime.messages.TaskMessages.SubmitTask)12 TestingJobManagerMessages (org.apache.flink.runtime.testingUtils.TestingJobManagerMessages)12 IOException (java.io.IOException)11 FiniteDuration (scala.concurrent.duration.FiniteDuration)11