Search in sources :

Example 11 with ActorGateway

use of org.apache.flink.runtime.instance.ActorGateway in project flink by apache.

the class JobManagerHAJobGraphRecoveryITCase method testJobPersistencyWhenJobManagerShutdown.

// ---------------------------------------------------------------------------------------------
/**
 * Submits a blocking job to a single-JobManager HA cluster, waits until it is running,
 * shuts the cluster down and then checks the recovery state via
 * {@code verifyRecoveryState}, i.e. that the persisted job data was not cleaned up
 * by the shut down.
 */
@Test
public void testJobPersistencyWhenJobManagerShutdown() throws Exception {
    Configuration haConfig = ZooKeeperTestUtils.createZooKeeperHAConfig(ZooKeeper.getConnectString(), FileStateBackendBasePath.getPath());

    // Minimal cluster: exactly one JobManager and one TaskManager
    haConfig.setInteger(ConfigConstants.LOCAL_NUMBER_JOB_MANAGER, 1);
    haConfig.setInteger(ConfigConstants.LOCAL_NUMBER_TASK_MANAGER, 1);

    TestingCluster cluster = new TestingCluster(haConfig, false, false);
    try {
        final Deadline testDeadline = TestTimeOut.fromNow();

        // Bring up the JobManager and the TaskManager
        cluster.start(true);

        JobGraph blockingJob = createBlockingJobGraph();

        // Guard against shut down races: should the TaskManager go away before the
        // JobManager, the job could otherwise fail terminally and have its state removed.
        // A (practically) unbounded restart strategy keeps the job alive instead.
        ExecutionConfig executionConfig = new ExecutionConfig();
        executionConfig.setRestartStrategy(RestartStrategies.fixedDelayRestart(Integer.MAX_VALUE, 100));
        blockingJob.setExecutionConfig(executionConfig);

        ActorGateway leader = cluster.getLeaderGateway(testDeadline.timeLeft());

        // Fire-and-forget submission, then block until the job reports RUNNING
        leader.tell(new SubmitJob(blockingJob, ListeningBehaviour.DETACHED));
        JobManagerActorTestUtils.waitForJobStatus(blockingJob.getJobID(), JobStatus.RUNNING, leader, testDeadline.timeLeft());
    } finally {
        cluster.shutdown();
    }

    // The JobManager is gone now; the persisted job data must still be present
    verifyRecoveryState(haConfig);
}
Also used : JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) SubmittedJobGraph(org.apache.flink.runtime.jobmanager.SubmittedJobGraph) TestingCluster(org.apache.flink.runtime.testingUtils.TestingCluster) Configuration(org.apache.flink.configuration.Configuration) Deadline(scala.concurrent.duration.Deadline) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) ExecutionConfig(org.apache.flink.api.common.ExecutionConfig) SubmitJob(org.apache.flink.runtime.messages.JobManagerMessages.SubmitJob) Test(org.junit.Test)

Example 12 with ActorGateway

use of org.apache.flink.runtime.instance.ActorGateway in project flink by apache.

the class JobManagerHAJobGraphRecoveryITCase method testSubmitJobToNonLeader.

/**
 * Submits a job to the non-leading JobManager of a two-JobManager HA cluster and checks
 * that the leading JobManager ends up running it, that the non-leader eventually reports
 * the job as not found, and that the recovery state is clean after the job was cancelled.
 */
@Test
public void testSubmitJobToNonLeader() throws Exception {
    Configuration haConfig = ZooKeeperTestUtils.createZooKeeperHAConfig(ZooKeeper.getConnectString(), FileStateBackendBasePath.getPath());

    // Two JobManagers (one leader, one standby) and a single TaskManager
    haConfig.setInteger(ConfigConstants.LOCAL_NUMBER_JOB_MANAGER, 2);
    haConfig.setInteger(ConfigConstants.LOCAL_NUMBER_TASK_MANAGER, 1);

    TestingCluster cluster = new TestingCluster(haConfig, false, false);
    try {
        final Deadline testDeadline = TestTimeOut.fromNow();

        // Bring up the JobManagers and the TaskManager
        cluster.start(true);

        JobGraph blockingJob = createBlockingJobGraph();

        List<ActorRef> jobManagers = cluster.getJobManagersAsJava();
        ActorGateway leader = cluster.getLeaderGateway(testDeadline.timeLeft());

        // Whichever of the two JobManagers is not the leader is the one we submit to
        int standbyIndex = jobManagers.get(0).equals(leader.actor()) ? 1 : 0;
        ActorGateway standby = new AkkaActorGateway(jobManagers.get(standbyIndex), null);

        log.info("Leading job manager: " + leader);
        log.info("Non-leading job manager: " + standby);

        // Submit to the standby, not to the leader
        standby.tell(new SubmitJob(blockingJob, ListeningBehaviour.DETACHED));
        log.info("Submitted job graph to " + standby);

        // The job status is requested from the **leading** JM although the job was
        // handed to the non-leading one. This is the behaviour under test.
        JobManagerActorTestUtils.waitForJobStatus(blockingJob.getJobID(), JobStatus.RUNNING, leader, testDeadline.timeLeft());

        log.info("Wait that the non-leader removes the submitted job.");

        // Poll the **non-leading** JM until it no longer knows the job, or time runs out
        boolean removedFromStandby = false;
        while (!removedFromStandby && testDeadline.hasTimeLeft()) {
            JobStatusResponse response = JobManagerActorTestUtils.requestJobStatus(blockingJob.getJobID(), standby, testDeadline.timeLeft());
            removedFromStandby = response instanceof JobManagerMessages.JobNotFound;
            if (!removedFromStandby) {
                log.info(((JobManagerMessages.CurrentJobStatus) response).status().toString());
                Thread.sleep(100);
            }
        }
        if (!removedFromStandby) {
            fail("Non-leading JM was still holding reference to the job graph.");
        }

        // Register for the removal notification before cancelling, then wait for it
        Future<Object> jobRemoved = leader.ask(new TestingJobManagerMessages.NotifyWhenJobRemoved(blockingJob.getJobID()), testDeadline.timeLeft());
        leader.tell(new JobManagerMessages.CancelJob(blockingJob.getJobID()));
        Await.ready(jobRemoved, testDeadline.timeLeft());
    } finally {
        cluster.shutdown();
    }

    // After cancellation and shut down nothing may be left behind
    verifyCleanRecoveryState(haConfig);
}
Also used : AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) JobStatusResponse(org.apache.flink.runtime.messages.JobManagerMessages.JobStatusResponse) Configuration(org.apache.flink.configuration.Configuration) ActorRef(akka.actor.ActorRef) TestActorRef(akka.testkit.TestActorRef) Deadline(scala.concurrent.duration.Deadline) JobManagerMessages(org.apache.flink.runtime.messages.JobManagerMessages) TestingJobManagerMessages(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) SubmittedJobGraph(org.apache.flink.runtime.jobmanager.SubmittedJobGraph) TestingCluster(org.apache.flink.runtime.testingUtils.TestingCluster) TestingJobManagerMessages(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) SubmitJob(org.apache.flink.runtime.messages.JobManagerMessages.SubmitJob) Test(org.junit.Test)

Example 13 with ActorGateway

use of org.apache.flink.runtime.instance.ActorGateway in project flink by apache.

the class ZooKeeperLeaderElectionITCase method testTaskManagerRegistrationAtReelectedLeader.

/**
 * Repeatedly terminates the current leading JobManager and checks, for each leader in
 * turn, that all TaskManagers are registered at it.
 */
@Test
public void testTaskManagerRegistrationAtReelectedLeader() throws Exception {
    final int numJMs = 10;
    final int numTMs = 3;

    File haStorageRoot = tempFolder.getRoot();
    Configuration haConfig = ZooKeeperTestUtils.createZooKeeperHAConfig(zkServer.getConnectString(), haStorageRoot.getPath());
    haConfig.setInteger(ConfigConstants.LOCAL_NUMBER_JOB_MANAGER, numJMs);
    haConfig.setInteger(ConfigConstants.LOCAL_NUMBER_TASK_MANAGER, numTMs);

    TestingCluster testCluster = new TestingCluster(haConfig);
    try {
        testCluster.start();

        // One round per JobManager: verify the registrations, then kill the leader
        for (int round = 0; round < numJMs; round++) {
            ActorGateway currentLeader = testCluster.getLeaderGateway(timeout);
            testCluster.waitForTaskManagersToBeRegisteredAtJobManager(currentLeader.actor());

            Future<Object> registrationCount = currentLeader.ask(JobManagerMessages.getRequestNumberRegisteredTaskManager(), timeout);
            int registeredAtLeader = (Integer) Await.result(registrationCount, timeout);
            assertEquals(numTMs, registeredAtLeader);

            // Forget the current leader before terminating it
            testCluster.clearLeader();
            currentLeader.tell(PoisonPill.getInstance());
        }
    } finally {
        testCluster.stop();
    }
}
Also used : TestingCluster(org.apache.flink.runtime.testingUtils.TestingCluster) Configuration(org.apache.flink.configuration.Configuration) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) File(java.io.File) Test(org.junit.Test)

Example 14 with ActorGateway

use of org.apache.flink.runtime.instance.ActorGateway in project flink by apache.

the class TaskManagerRegistrationTest method testShutdownAfterRegistrationDurationExpired.

/**
 * Tests that the TaskManager shuts down when it cannot register at the
 * JobManager within the given maximum duration.
 *
 * <p>The TaskManager is started with a maximum registration duration of 500 ms and is
 * pointed at a JobManager URL; the test then expects the TaskManager actor to terminate
 * within the test timeout.
 *
 * <p>Any failure is reported as an {@link AssertionError} that carries the original
 * throwable as its cause. An "ask timeout exception" as the cause means that the
 * TaskManager did not shut down after its registration duration expired.
 */
@Test
public void testShutdownAfterRegistrationDurationExpired() {
    new JavaTestKit(actorSystem) {

        {
            ActorGateway taskManager = null;
            try {
                // maximum registration duration of 500 ms
                Configuration tmConfig = new Configuration();
                tmConfig.setString(ConfigConstants.TASK_MANAGER_MAX_REGISTRATION_DURATION, "500 ms");
                // start the taskManager actor
                taskManager = createTaskManager(actorSystem, JobManager.getLocalJobManagerAkkaURL(Option.<String>empty()), tmConfig, true, false);
                // make sure it terminates in time, since it cannot register at a JobManager
                watch(taskManager.actor());
                // final alias so the anonymous Within block below can capture the gateway
                final ActorGateway tm = taskManager;
                new Within(timeout) {

                    @Override
                    protected void run() {
                        expectTerminated(tm.actor());
                    }
                };
            } catch (Throwable e) {
                // Keep the original throwable as the cause so that its type and stack trace
                // show up in the test report instead of only a (possibly null) message.
                throw new AssertionError("testShutdownAfterRegistrationDurationExpired failed: " + e, e);
            } finally {
                stopActor(taskManager);
            }
        }
    };
}
Also used : Configuration(org.apache.flink.configuration.Configuration) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) JavaTestKit(akka.testkit.JavaTestKit) Test(org.junit.Test)

Example 15 with ActorGateway

use of org.apache.flink.runtime.instance.ActorGateway in project flink by apache.

the class TaskManagerRegistrationTest method testTaskManagerNoExcessiveRegistrationMessages.

/**
 * Checks that a TaskManager whose registration is refused does not flood the JobManager
 * with registration messages.
 *
 * <p>The test kit's own actor stands in for the JobManager (through a forwarding actor).
 * For one observation window every registration attempt is refused; during a second
 * window the attempts are only collected, and their number is compared against an upper
 * bound derived from the configured pauses.
 */
@Test
public void testTaskManagerNoExcessiveRegistrationMessages() throws Exception {
    new JavaTestKit(actorSystem) {

        {
            ActorGateway forwardingJobManager = null;
            ActorGateway taskManager = null;
            try {
                FiniteDuration window = new FiniteDuration(5, TimeUnit.SECONDS);

                long refusedRegistrationPause = 500;
                long initialRegistrationPause = 100;
                long maxDelay = 30000;

                // everything sent to the "JobManager" is forwarded to the test kit's actor
                forwardingJobManager = TestingUtils.createForwardingActor(actorSystem, getTestActor(), Option.<String>empty());

                Configuration tmConfig = new Configuration(config);
                tmConfig.setString(ConfigConstants.TASK_MANAGER_REFUSED_REGISTRATION_PAUSE, refusedRegistrationPause + " ms");
                tmConfig.setString(ConfigConstants.TASK_MANAGER_INITIAL_REGISTRATION_PAUSE, initialRegistrationPause + " ms");

                // the test actor (the test kit) plays the JobManager so that the
                // registration messages can be intercepted
                taskManager = createTaskManager(actorSystem, forwardingJobManager, tmConfig, true, false);

                // Phase 1: refuse every registration attempt until the window has elapsed
                final Deadline refusalDeadline = window.fromNow();
                try {
                    while (refusalDeadline.hasTimeLeft()) {
                        // wait for the next registration attempt ...
                        expectMsgClass(refusalDeadline.timeLeft(), RegisterTaskManager.class);
                        // ... and turn it down
                        taskManager.tell(new RefuseRegistration(new Exception("test reason")), forwardingJobManager);
                    }
                } catch (AssertionError ignored) {
                    // deliberately ignored: it simply means that the window was used up
                    // while waiting for the next message
                }

                // Phase 2: only collect the registration attempts for another window
                RegisterTaskManager[] collected = new ReceiveWhile<RegisterTaskManager>(RegisterTaskManager.class, window) {

                    @Override
                    protected RegisterTaskManager match(Object msg) throws Exception {
                        if (!(msg instanceof RegisterTaskManager)) {
                            throw noMatch();
                        }
                        return (RegisterTaskManager) msg;
                    }
                }.get();

                // Upper bound on the expected number of attempts within one window
                final double logOfTwo = Math.log(2);
                int maxExponent = (int) Math.floor(Math.log(((double) maxDelay / initialRegistrationPause + 1)) / logOfTwo);
                int windowExponent = (int) Math.ceil(Math.log(((double) window.toMillis() / initialRegistrationPause + 1)) / logOfTwo);
                int exp = Math.min(maxExponent, windowExponent);

                long remainingMillis = window.toMillis() - (initialRegistrationPause * (1 << exp));

                int expectedMessages = exp;
                if (remainingMillis > 0) {
                    expectedMessages += Math.ceil((double) remainingMillis / maxDelay);
                }

                // tolerate up to twice the computed number before failing
                int upperBound = expectedMessages * 2;
                assertTrue("The number of RegisterTaskManager messages #" + collected.length + " should be less than #" + upperBound, collected.length <= upperBound);
            } finally {
                stopActor(taskManager);
                stopActor(forwardingJobManager);
            }
        }
    };
}
Also used : RegisterTaskManager(org.apache.flink.runtime.messages.RegistrationMessages.RegisterTaskManager) Configuration(org.apache.flink.configuration.Configuration) Deadline(scala.concurrent.duration.Deadline) FiniteDuration(scala.concurrent.duration.FiniteDuration) RefuseRegistration(org.apache.flink.runtime.messages.RegistrationMessages.RefuseRegistration) InvalidActorNameException(akka.actor.InvalidActorNameException) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) JavaTestKit(akka.testkit.JavaTestKit) Test(org.junit.Test)

Aggregations

ActorGateway (org.apache.flink.runtime.instance.ActorGateway)115 Test (org.junit.Test)91 JobID (org.apache.flink.api.common.JobID)47 Configuration (org.apache.flink.configuration.Configuration)46 FiniteDuration (scala.concurrent.duration.FiniteDuration)45 AkkaActorGateway (org.apache.flink.runtime.instance.AkkaActorGateway)41 JavaTestKit (akka.testkit.JavaTestKit)34 JobGraph (org.apache.flink.runtime.jobgraph.JobGraph)32 IOException (java.io.IOException)31 ActorRef (akka.actor.ActorRef)30 JobVertexID (org.apache.flink.runtime.jobgraph.JobVertexID)27 JobManagerMessages (org.apache.flink.runtime.messages.JobManagerMessages)27 JobVertex (org.apache.flink.runtime.jobgraph.JobVertex)20 Deadline (scala.concurrent.duration.Deadline)20 ExecutionAttemptID (org.apache.flink.runtime.executiongraph.ExecutionAttemptID)17 TestingJobManagerMessages (org.apache.flink.runtime.testingUtils.TestingJobManagerMessages)17 TaskManagerServicesConfiguration (org.apache.flink.runtime.taskexecutor.TaskManagerServicesConfiguration)16 File (java.io.File)15 SubmitJob (org.apache.flink.runtime.messages.JobManagerMessages.SubmitJob)15 TriggerSavepoint (org.apache.flink.runtime.messages.JobManagerMessages.TriggerSavepoint)15