Example 6 with LeaderRetrievalService

Use of org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService in project flink by apache.

From the class JobManagerHACheckpointRecoveryITCase, method testCheckpointedStreamingSumProgram.

/**
	 * Simple checkpointed streaming sum.
	 *
	 * <p>The sources (running with parallelism Parallelism) each count up to sequenceEnd. The sink
	 * (parallelism 1) sums up all counts and returns the result to the main thread via a static
	 * variable. We wait until some checkpoints have completed and sanity-check that the sources
	 * recover with updated state, to make sure that this test actually tests something.
	 */
@Test
@RetryOnFailure(times = 1)
public void testCheckpointedStreamingSumProgram() throws Exception {
    // Config
    final int checkpointingInterval = 200;
    final int sequenceEnd = 5000;
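    // Each of the Parallelism sources emits the values 1..sequenceEnd, so each contributes sequenceEnd * (sequenceEnd + 1) / 2 to the expected sum.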
    final long expectedSum = Parallelism * sequenceEnd * (sequenceEnd + 1) / 2;
    final StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironment();
    env.setParallelism(Parallelism);
    env.enableCheckpointing(checkpointingInterval);
    env.addSource(new CheckpointedSequenceSource(sequenceEnd)).addSink(new CountingSink()).setParallelism(1);
    JobGraph jobGraph = env.getStreamGraph().getJobGraph();
    Configuration config = ZooKeeperTestUtils.createZooKeeperHAConfig(ZooKeeper.getConnectString(), FileStateBackendBasePath.getAbsoluteFile().toURI().toString());
    config.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, Parallelism);
    ActorSystem testSystem = null;
    final JobManagerProcess[] jobManagerProcess = new JobManagerProcess[2];
    LeaderRetrievalService leaderRetrievalService = null;
    ActorSystem taskManagerSystem = null;
    try {
        final Deadline deadline = TestTimeOut.fromNow();
        // Test actor system
        testSystem = AkkaUtils.createActorSystem(new Configuration(), new Some<>(new Tuple2<String, Object>("localhost", 0)));
        // The job managers
        jobManagerProcess[0] = new JobManagerProcess(0, config);
        jobManagerProcess[1] = new JobManagerProcess(1, config);
        jobManagerProcess[0].startProcess();
        jobManagerProcess[1].startProcess();
        // Leader listener
        TestingListener leaderListener = new TestingListener();
        leaderRetrievalService = ZooKeeperUtils.createLeaderRetrievalService(config);
        leaderRetrievalService.start(leaderListener);
        // The task manager
        taskManagerSystem = AkkaUtils.createActorSystem(config, Option.apply(new Tuple2<String, Object>("localhost", 0)));
        TaskManager.startTaskManagerComponentsAndActor(config, ResourceID.generate(), taskManagerSystem, "localhost", Option.<String>empty(), Option.<LeaderRetrievalService>empty(), false, TaskManager.class);
        {
            // Initial submission
            leaderListener.waitForNewLeader(deadline.timeLeft().toMillis());
            String leaderAddress = leaderListener.getAddress();
            UUID leaderId = leaderListener.getLeaderSessionID();
            // Get the leader ref
            ActorRef leaderRef = AkkaUtils.getActorRef(leaderAddress, testSystem, deadline.timeLeft());
            ActorGateway leader = new AkkaActorGateway(leaderRef, leaderId);
            // Submit the job in detached mode
            leader.tell(new SubmitJob(jobGraph, ListeningBehaviour.DETACHED));
            JobManagerActorTestUtils.waitForJobStatus(jobGraph.getJobID(), JobStatus.RUNNING, leader, deadline.timeLeft());
        }
        // Who's the boss?
        JobManagerProcess leadingJobManagerProcess;
        if (jobManagerProcess[0].getJobManagerAkkaURL(deadline.timeLeft()).equals(leaderListener.getAddress())) {
            leadingJobManagerProcess = jobManagerProcess[0];
        } else {
            leadingJobManagerProcess = jobManagerProcess[1];
        }
        CompletedCheckpointsLatch.await();
        // Kill the leading job manager process
        leadingJobManagerProcess.destroy();
        {
            // Recovery by the standby JobManager
            leaderListener.waitForNewLeader(deadline.timeLeft().toMillis());
            String leaderAddress = leaderListener.getAddress();
            UUID leaderId = leaderListener.getLeaderSessionID();
            ActorRef leaderRef = AkkaUtils.getActorRef(leaderAddress, testSystem, deadline.timeLeft());
            ActorGateway leader = new AkkaActorGateway(leaderRef, leaderId);
            JobManagerActorTestUtils.waitForJobStatus(jobGraph.getJobID(), JobStatus.RUNNING, leader, deadline.timeLeft());
        }
        // Wait to finish
        FinalCountLatch.await();
        assertEquals(expectedSum, (long) FinalCount.get());
        for (int i = 0; i < Parallelism; i++) {
            assertNotEquals(0, RecoveredStates.get(i));
        }
    } catch (Throwable t) {
        // Reset all static state for test retries
        CompletedCheckpointsLatch = new CountDownLatch(2);
        RecoveredStates = new AtomicLongArray(Parallelism);
        FinalCountLatch = new CountDownLatch(1);
        FinalCount = new AtomicReference<>();
        LastElement = -1;
        // Print early (in some situations the process logs get too big
        // for Travis and the root problem is not shown)
        t.printStackTrace();
        // In case of an error, print the job manager process logs.
        if (jobManagerProcess[0] != null) {
            jobManagerProcess[0].printProcessLog();
        }
        if (jobManagerProcess[1] != null) {
            jobManagerProcess[1].printProcessLog();
        }
        throw t;
    } finally {
        if (jobManagerProcess[0] != null) {
            jobManagerProcess[0].destroy();
        }
        if (jobManagerProcess[1] != null) {
            jobManagerProcess[1].destroy();
        }
        if (leaderRetrievalService != null) {
            leaderRetrievalService.stop();
        }
        if (taskManagerSystem != null) {
            taskManagerSystem.shutdown();
        }
        if (testSystem != null) {
            testSystem.shutdown();
        }
    }
}
Also used: ActorSystem (akka.actor.ActorSystem), AkkaActorGateway (org.apache.flink.runtime.instance.AkkaActorGateway), Configuration (org.apache.flink.configuration.Configuration), ActorRef (akka.actor.ActorRef), TestingListener (org.apache.flink.runtime.leaderelection.TestingListener), ActorGateway (org.apache.flink.runtime.instance.ActorGateway), UUID (java.util.UUID), SubmitJob (org.apache.flink.runtime.messages.JobManagerMessages.SubmitJob), Deadline (scala.concurrent.duration.Deadline), AtomicReference (java.util.concurrent.atomic.AtomicReference), CountDownLatch (java.util.concurrent.CountDownLatch), JobGraph (org.apache.flink.runtime.jobgraph.JobGraph), Some (scala.Some), LeaderRetrievalService (org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService), JobManagerProcess (org.apache.flink.runtime.testutils.JobManagerProcess), AtomicLongArray (java.util.concurrent.atomic.AtomicLongArray), StreamExecutionEnvironment (org.apache.flink.streaming.api.environment.StreamExecutionEnvironment), Test (org.junit.Test), RetryOnFailure (org.apache.flink.testutils.junit.RetryOnFailure)
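For reference, here is a minimal sketch of the listener side of this API, which is what the TestingListener above provides for the test: a LeaderRetrievalListener that blocks until the retrieval service reports a leader. The class and field names below are illustrative and not part of the Flink sources; only the two LeaderRetrievalListener callbacks are the actual interface.

import java.util.UUID;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import org.apache.flink.runtime.leaderretrieval.LeaderRetrievalListener;

/** Hypothetical helper, shown for illustration only. */
class BlockingLeaderListener implements LeaderRetrievalListener {

    private final CountDownLatch leaderFound = new CountDownLatch(1);
    private volatile String leaderAddress;
    private volatile UUID leaderSessionId;

    @Override
    public void notifyLeaderAddress(String leaderAddress, UUID leaderSessionID) {
        // the service may report null until a leader has been elected
        if (leaderAddress != null) {
            this.leaderAddress = leaderAddress;
            this.leaderSessionId = leaderSessionID;
            leaderFound.countDown();
        }
    }

    @Override
    public void handleError(Exception exception) {
        // a real listener would forward this to the waiting thread
        exception.printStackTrace();
    }

    String waitForLeader(long timeoutMillis) throws InterruptedException {
        if (!leaderFound.await(timeoutMillis, TimeUnit.MILLISECONDS)) {
            throw new IllegalStateException("No leader reported within " + timeoutMillis + " ms");
        }
        return leaderAddress;
    }

    UUID getLeaderSessionId() {
        return leaderSessionId;
    }
}

LeaderRetrievalService.start(listener) registers such a listener and stop() deregisters it, which is why the test above stops the service in its finally block.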

Example 7 with LeaderRetrievalService

Use of org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService in project flink by apache.

From the class MesosApplicationMasterRunner, method runPrivileged.

// ------------------------------------------------------------------------
//  Core work method
// ------------------------------------------------------------------------
/**
	 * The main work method; it must run as a privileged action.
	 *
	 * @return The return code for the Java process.
	 */
protected int runPrivileged(Configuration config, Configuration dynamicProperties) {
    ActorSystem actorSystem = null;
    WebMonitor webMonitor = null;
    MesosArtifactServer artifactServer = null;
    ScheduledExecutorService futureExecutor = null;
    ExecutorService ioExecutor = null;
    MesosServices mesosServices = null;
    try {
        // ------- (1) load and parse / validate all configurations -------
        // Note that we use the "appMasterHostname" given by the system, to make sure
        // we use the hostnames consistently throughout akka.
        // For akka, "localhost" and "localhost.localdomain" are different actors.
        final String appMasterHostname = InetAddress.getLocalHost().getHostName();
        // Mesos configuration
        final MesosConfiguration mesosConfig = createMesosConfig(config, appMasterHostname);
        // JM configuration
        int numberProcessors = Hardware.getNumberCPUCores();
        futureExecutor = Executors.newScheduledThreadPool(numberProcessors, new ExecutorThreadFactory("mesos-jobmanager-future"));
        ioExecutor = Executors.newFixedThreadPool(numberProcessors, new ExecutorThreadFactory("mesos-jobmanager-io"));
        mesosServices = MesosServicesUtils.createMesosServices(config);
        // TM configuration
        final MesosTaskManagerParameters taskManagerParameters = MesosTaskManagerParameters.create(config);
        LOG.info("TaskManagers will be created with {} task slots", taskManagerParameters.containeredParameters().numSlots());
        LOG.info("TaskManagers will be started with container size {} MB, JVM heap size {} MB, " + "JVM direct memory limit {} MB, {} cpus", taskManagerParameters.containeredParameters().taskManagerTotalMemoryMB(), taskManagerParameters.containeredParameters().taskManagerHeapSizeMB(), taskManagerParameters.containeredParameters().taskManagerDirectMemoryLimitMB(), taskManagerParameters.cpus());
        // JM endpoint, which should be explicitly configured based on acquired net resources
        final int listeningPort = config.getInteger(ConfigConstants.JOB_MANAGER_IPC_PORT_KEY, ConfigConstants.DEFAULT_JOB_MANAGER_IPC_PORT);
        checkState(listeningPort >= 0 && listeningPort <= 65535, "Config parameter \"" + ConfigConstants.JOB_MANAGER_IPC_PORT_KEY + "\" is invalid, it must be between 0 and 65535");
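        // note: a listening port of 0 lets the actor system bind to any free port (standard Akka behavior)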
        // ----------------- (2) start the actor system -------------------
        // try to start the actor system, JobManager and JobManager actor system
        // using the configured address and ports
        actorSystem = BootstrapTools.startActorSystem(config, appMasterHostname, listeningPort, LOG);
        Address address = AkkaUtils.getAddress(actorSystem);
        final String akkaHostname = address.host().get();
        final int akkaPort = (Integer) address.port().get();
        LOG.info("Actor system bound to hostname {}.", akkaHostname);
        // try to start the artifact server
        LOG.debug("Starting Artifact Server");
        final int artifactServerPort = config.getInteger(ConfigConstants.MESOS_ARTIFACT_SERVER_PORT_KEY, ConfigConstants.DEFAULT_MESOS_ARTIFACT_SERVER_PORT);
        final String artifactServerPrefix = UUID.randomUUID().toString();
        artifactServer = new MesosArtifactServer(artifactServerPrefix, akkaHostname, artifactServerPort, config);
        // ----------------- (3) Generate the configuration for the TaskManagers -------------------
        // generate a container spec which conveys the artifacts/vars needed to launch a TM
        ContainerSpecification taskManagerContainerSpec = new ContainerSpecification();
        // propagate the AM dynamic configuration to the TM
        taskManagerContainerSpec.getDynamicConfiguration().addAll(dynamicProperties);
        // propagate newly-generated configuration elements
        final Configuration taskManagerConfig = BootstrapTools.generateTaskManagerConfiguration(new Configuration(), akkaHostname, akkaPort, taskManagerParameters.containeredParameters().numSlots(), TASKMANAGER_REGISTRATION_TIMEOUT);
        taskManagerContainerSpec.getDynamicConfiguration().addAll(taskManagerConfig);
        // apply the overlays
        applyOverlays(config, taskManagerContainerSpec);
        // configure the artifact server to serve the specified artifacts
        configureArtifactServer(artifactServer, taskManagerContainerSpec);
        // ----------------- (4) start the actors -------------------
        // 1) JobManager & Archive (in non-HA case, the leader service takes this)
        // 2) Web Monitor (we need its port to register)
        // 3) Resource Master for Mesos
        // 4) Process reapers for the JobManager and Resource Master
        // 1: the JobManager
        LOG.debug("Starting JobManager actor");
        // we start the JobManager with its standard name
        ActorRef jobManager = JobManager.startJobManagerActors(config, actorSystem, futureExecutor, ioExecutor, new scala.Some<>(JobManager.JOB_MANAGER_NAME()), scala.Option.<String>empty(), getJobManagerClass(), getArchivistClass())._1();
        // 2: the web monitor
        LOG.debug("Starting Web Frontend");
        webMonitor = BootstrapTools.startWebMonitorIfConfigured(config, actorSystem, jobManager, LOG);
        if (webMonitor != null) {
            final URL webMonitorURL = new URL("http", appMasterHostname, webMonitor.getServerPort(), "/");
            mesosConfig.frameworkInfo().setWebuiUrl(webMonitorURL.toExternalForm());
        }
        // 3: Flink's Mesos ResourceManager
        LOG.debug("Starting Mesos Flink Resource Manager");
        // create the worker store to persist task information across restarts
        MesosWorkerStore workerStore = mesosServices.createMesosWorkerStore(config, ioExecutor);
        // we need the leader retrieval service here to be informed of new
        // leader session IDs, even though there can be only one leader ever
        LeaderRetrievalService leaderRetriever = LeaderRetrievalUtils.createLeaderRetrievalService(config, jobManager);
        Props resourceMasterProps = MesosFlinkResourceManager.createActorProps(getResourceManagerClass(), config, mesosConfig, workerStore, leaderRetriever, taskManagerParameters, taskManagerContainerSpec, artifactServer, LOG);
        ActorRef resourceMaster = actorSystem.actorOf(resourceMasterProps, "Mesos_Resource_Master");
        // 4: Process reapers
        // The process reapers ensure that upon unexpected actor death, the process exits
        // and does not stay lingering around unresponsive
        LOG.debug("Starting process reapers for JobManager");
        actorSystem.actorOf(Props.create(ProcessReaper.class, resourceMaster, LOG, ACTOR_DIED_EXIT_CODE), "Mesos_Resource_Master_Process_Reaper");
        actorSystem.actorOf(Props.create(ProcessReaper.class, jobManager, LOG, ACTOR_DIED_EXIT_CODE), "JobManager_Process_Reaper");
    } catch (Throwable t) {
        // make sure that everything ends up in the log, whatever happens
        LOG.error("Mesos JobManager initialization failed", t);
        if (webMonitor != null) {
            try {
                webMonitor.stop();
            } catch (Throwable ignored) {
                LOG.warn("Failed to stop the web frontend", ignored);
            }
        }
        if (artifactServer != null) {
            try {
                artifactServer.stop();
            } catch (Throwable ignored) {
                LOG.error("Failed to stop the artifact server", ignored);
            }
        }
        if (actorSystem != null) {
            try {
                actorSystem.shutdown();
            } catch (Throwable tt) {
                LOG.error("Error shutting down actor system", tt);
            }
        }
        if (futureExecutor != null) {
            try {
                futureExecutor.shutdownNow();
            } catch (Throwable tt) {
                LOG.error("Error shutting down future executor", tt);
            }
        }
        if (ioExecutor != null) {
            try {
                ioExecutor.shutdownNow();
            } catch (Throwable tt) {
                LOG.error("Error shutting down io executor", tt);
            }
        }
        if (mesosServices != null) {
            try {
                mesosServices.close(false);
            } catch (Throwable tt) {
                LOG.error("Error closing the mesos services.", tt);
            }
        }
        return INIT_ERROR_EXIT_CODE;
    }
    // everything started, we can wait until all is done or the process is killed
    LOG.info("Mesos JobManager started");
    // wait until everything is done
    actorSystem.awaitTermination();
    // if we get here, everything worked out all right, and we even exited smoothly
    if (webMonitor != null) {
        try {
            webMonitor.stop();
        } catch (Throwable t) {
            LOG.error("Failed to stop the web frontend", t);
        }
    }
    try {
        artifactServer.stop();
    } catch (Throwable t) {
        LOG.error("Failed to stop the artifact server", t);
    }
    org.apache.flink.runtime.concurrent.Executors.gracefulShutdown(AkkaUtils.getTimeout(config).toMillis(), TimeUnit.MILLISECONDS, futureExecutor, ioExecutor);
    try {
        mesosServices.close(true);
    } catch (Throwable t) {
        LOG.error("Failed to clean up and close MesosServices.", t);
    }
    return 0;
}
Also used: ActorSystem (akka.actor.ActorSystem), ScheduledExecutorService (java.util.concurrent.ScheduledExecutorService), InetAddress (java.net.InetAddress), Address (akka.actor.Address), MesosConfiguration (org.apache.flink.mesos.util.MesosConfiguration), Configuration (org.apache.flink.configuration.Configuration), GlobalConfiguration (org.apache.flink.configuration.GlobalConfiguration), ProcessReaper (org.apache.flink.runtime.process.ProcessReaper), ActorRef (akka.actor.ActorRef), ContainerSpecification (org.apache.flink.runtime.clusterframework.ContainerSpecification), Props (akka.actor.Props), URL (java.net.URL), ExecutorThreadFactory (org.apache.flink.runtime.util.ExecutorThreadFactory), MesosArtifactServer (org.apache.flink.mesos.util.MesosArtifactServer), LeaderRetrievalService (org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService), WebMonitor (org.apache.flink.runtime.webmonitor.WebMonitor), ExecutorService (java.util.concurrent.ExecutorService), MesosWorkerStore (org.apache.flink.mesos.runtime.clusterframework.store.MesosWorkerStore), MesosServices (org.apache.flink.mesos.runtime.clusterframework.services.MesosServices)
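A side note on step (3) above: the configuration propagation relies on Configuration.addAll, which copies every entry of the argument into the target configuration, with entries added later overwriting earlier ones under the same key. A minimal sketch, using illustrative keys that are not taken from the code above:

import org.apache.flink.configuration.Configuration;

public class ConfigPropagationSketch {

    public static void main(String[] args) {
        // stands in for the dynamic properties handed to the application master (illustrative key)
        Configuration dynamicProperties = new Configuration();
        dynamicProperties.setString("taskmanager.numberOfTaskSlots", "2");

        // stands in for the generated TaskManager configuration (illustrative key)
        Configuration generatedTaskManagerConfig = new Configuration();
        generatedTaskManagerConfig.setString("jobmanager.rpc.address", "localhost");

        // the container spec's dynamic configuration collects both, as in step (3) above
        Configuration containerDynamicConfig = new Configuration();
        containerDynamicConfig.addAll(dynamicProperties);
        containerDynamicConfig.addAll(generatedTaskManagerConfig);

        // prints both keys
        System.out.println(containerDynamicConfig.keySet());
    }
}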

Example 8 with LeaderRetrievalService

Use of org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService in project flink by apache.

From the class TaskManagerMetricsTest, method testMetricRegistryLifeCycle.

/**
	 * Tests the metric registry life cycle on JobManager re-connects.
	 */
@Test
public void testMetricRegistryLifeCycle() throws Exception {
    ActorSystem actorSystem = null;
    try {
        actorSystem = AkkaUtils.createLocalActorSystem(new Configuration());
        // ================================================================
        // Start JobManager
        // ================================================================
        final ActorRef jobManager = JobManager.startJobManagerActors(new Configuration(), actorSystem, TestingUtils.defaultExecutor(), TestingUtils.defaultExecutor(), JobManager.class, MemoryArchivist.class)._1();
        LeaderRetrievalService leaderRetrievalService = new StandaloneLeaderRetrievalService(jobManager.path().toString());
        // ================================================================
        // Start TaskManager
        // ================================================================
        final Configuration config = new Configuration();
        final ResourceID tmResourceID = ResourceID.generate();
        TaskManagerServicesConfiguration taskManagerServicesConfiguration = TaskManagerServicesConfiguration.fromConfiguration(config, InetAddress.getLocalHost(), false);
        TaskManagerConfiguration taskManagerConfiguration = TaskManagerConfiguration.fromConfiguration(config);
        TaskManagerServices taskManagerServices = TaskManagerServices.fromConfiguration(taskManagerServicesConfiguration, tmResourceID);
        final MetricRegistry tmRegistry = taskManagerServices.getMetricRegistry();
        // create the task manager
        final Props tmProps = TaskManager.getTaskManagerProps(TaskManager.class, taskManagerConfiguration, tmResourceID, taskManagerServices.getTaskManagerLocation(), taskManagerServices.getMemoryManager(), taskManagerServices.getIOManager(), taskManagerServices.getNetworkEnvironment(), leaderRetrievalService, tmRegistry);
        final ActorRef taskManager = actorSystem.actorOf(tmProps);
        new JavaTestKit(actorSystem) {

            {
                new Within(new FiniteDuration(5000, TimeUnit.SECONDS)) {

                    @Override
                    protected void run() {
                        taskManager.tell(TaskManagerMessages.getNotifyWhenRegisteredAtJobManagerMessage(), getTestActor());
                        // wait for the TM to be registered
                        expectMsgEquals(TaskManagerMessages.getRegisteredAtJobManagerMessage());
                        // trigger re-registration of TM; this should include a disconnect from the current JM
                        taskManager.tell(new TaskManagerMessages.JobManagerLeaderAddress(jobManager.path().toString(), null), jobManager);
                        // wait for re-registration to be completed
                        taskManager.tell(TaskManagerMessages.getNotifyWhenRegisteredAtJobManagerMessage(), getTestActor());
                        expectMsgEquals(TaskManagerMessages.getRegisteredAtJobManagerMessage());
                    }
                };
            }
        };
        // verify that the registry was not shut down due to the disconnect
        Assert.assertFalse(tmRegistry.isShutdown());
        // shut down the actors and the actor system
        actorSystem.shutdown();
        actorSystem.awaitTermination();
    } finally {
        if (actorSystem != null) {
            actorSystem.shutdown();
        }
    }
}
Also used: ActorSystem (akka.actor.ActorSystem), TaskManagerConfiguration (org.apache.flink.runtime.taskexecutor.TaskManagerConfiguration), MemoryArchivist (org.apache.flink.runtime.jobmanager.MemoryArchivist), Configuration (org.apache.flink.configuration.Configuration), TaskManagerServicesConfiguration (org.apache.flink.runtime.taskexecutor.TaskManagerServicesConfiguration), ActorRef (akka.actor.ActorRef), TaskManagerServices (org.apache.flink.runtime.taskexecutor.TaskManagerServices), FiniteDuration (scala.concurrent.duration.FiniteDuration), JobManager (org.apache.flink.runtime.jobmanager.JobManager), Props (akka.actor.Props), TaskManagerMessages (org.apache.flink.runtime.messages.TaskManagerMessages), ResourceID (org.apache.flink.runtime.clusterframework.types.ResourceID), LeaderRetrievalService (org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService), StandaloneLeaderRetrievalService (org.apache.flink.runtime.leaderretrieval.StandaloneLeaderRetrievalService), JavaTestKit (akka.testkit.JavaTestKit), Test (org.junit.Test)
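Since this test runs without high availability, the retrieval service is a StandaloneLeaderRetrievalService built from the JobManager's actor path: calling start(listener) immediately reports that fixed address to the listener. A minimal sketch of that behavior follows; the address string is illustrative, and the exact leader session ID reported to the callback depends on the Flink version.

import java.util.UUID;
import org.apache.flink.runtime.leaderretrieval.LeaderRetrievalListener;
import org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService;
import org.apache.flink.runtime.leaderretrieval.StandaloneLeaderRetrievalService;

public class StandaloneRetrievalSketch {

    public static void main(String[] args) throws Exception {
        // in the test above this would be jobManager.path().toString()
        String jobManagerAkkaUrl = "akka://flink/user/jobmanager";

        LeaderRetrievalService retrievalService =
                new StandaloneLeaderRetrievalService(jobManagerAkkaUrl);

        retrievalService.start(new LeaderRetrievalListener() {
            @Override
            public void notifyLeaderAddress(String leaderAddress, UUID leaderSessionID) {
                // invoked right away with the fixed address, since there is no election
                System.out.println("Leader at " + leaderAddress);
            }

            @Override
            public void handleError(Exception exception) {
                exception.printStackTrace();
            }
        });

        retrievalService.stop();
    }
}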

Example 9 with LeaderRetrievalService

Use of org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService in project flink by apache.

From the class YARNHighAvailabilityITCase, method testMultipleAMKill.

/**
	 * Tests that the application master can be killed multiple times and that the surviving
	 * TaskManager successfully reconnects to the newly started JobManager.
	 * @throws Exception
	 */
@Test
public void testMultipleAMKill() throws Exception {
    final int numberKillingAttempts = numberApplicationAttempts - 1;
    TestingYarnClusterDescriptor flinkYarnClient = new TestingYarnClusterDescriptor();
    Assert.assertNotNull("unable to get yarn client", flinkYarnClient);
    flinkYarnClient.setTaskManagerCount(1);
    flinkYarnClient.setJobManagerMemory(768);
    flinkYarnClient.setTaskManagerMemory(1024);
    flinkYarnClient.setLocalJarPath(new Path(flinkUberjar.getAbsolutePath()));
    flinkYarnClient.addShipFiles(Arrays.asList(flinkLibFolder.listFiles()));
    String confDirPath = System.getenv(ConfigConstants.ENV_FLINK_CONF_DIR);
    flinkYarnClient.setConfigurationDirectory(confDirPath);
    String fsStateHandlePath = temp.getRoot().getPath();
    // load the configuration
    File configDirectory = new File(confDirPath);
    GlobalConfiguration.loadConfiguration(configDirectory.getAbsolutePath());
    flinkYarnClient.setFlinkConfiguration(GlobalConfiguration.loadConfiguration());
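    // the encoded dynamic properties below are key=value pairs joined with "@@", Flink's separator for encoded YARN dynamic properties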
    flinkYarnClient.setDynamicPropertiesEncoded("recovery.mode=zookeeper@@recovery.zookeeper.quorum=" + zkServer.getConnectString() + "@@yarn.application-attempts=" + numberApplicationAttempts + "@@" + CoreOptions.STATE_BACKEND + "=FILESYSTEM" + "@@" + FsStateBackendFactory.CHECKPOINT_DIRECTORY_URI_CONF_KEY + "=" + fsStateHandlePath + "/checkpoints" + "@@" + HighAvailabilityOptions.HA_STORAGE_PATH.key() + "=" + fsStateHandlePath + "/recovery");
    flinkYarnClient.setConfigurationFilePath(new Path(confDirPath + File.separator + "flink-conf.yaml"));
    ClusterClient yarnCluster = null;
    final FiniteDuration timeout = new FiniteDuration(2, TimeUnit.MINUTES);
    try {
        yarnCluster = flinkYarnClient.deploy();
        final Configuration config = yarnCluster.getFlinkConfiguration();
        new JavaTestKit(actorSystem) {

            {
                for (int attempt = 0; attempt < numberKillingAttempts; attempt++) {
                    new Within(timeout) {

                        @Override
                        protected void run() {
                            try {
                                LeaderRetrievalService lrs = LeaderRetrievalUtils.createLeaderRetrievalService(config);
                                ActorGateway gateway = LeaderRetrievalUtils.retrieveLeaderGateway(lrs, actorSystem, timeout);
                                ActorGateway selfGateway = new AkkaActorGateway(getRef(), gateway.leaderSessionID());
                                gateway.tell(new TestingJobManagerMessages.NotifyWhenAtLeastNumTaskManagerAreRegistered(1), selfGateway);
                                expectMsgEquals(Acknowledge.get());
                                gateway.tell(PoisonPill.getInstance());
                            } catch (Exception e) {
                                throw new AssertionError("Could not complete test.", e);
                            }
                        }
                    };
                }
                new Within(timeout) {

                    @Override
                    protected void run() {
                        try {
                            LeaderRetrievalService lrs = LeaderRetrievalUtils.createLeaderRetrievalService(config);
                            ActorGateway gateway2 = LeaderRetrievalUtils.retrieveLeaderGateway(lrs, actorSystem, timeout);
                            ActorGateway selfGateway = new AkkaActorGateway(getRef(), gateway2.leaderSessionID());
                            gateway2.tell(new TestingJobManagerMessages.NotifyWhenAtLeastNumTaskManagerAreRegistered(1), selfGateway);
                            expectMsgEquals(Acknowledge.get());
                        } catch (Exception e) {
                            throw new AssertionError("Could not complete test.", e);
                        }
                    }
                };
            }
        };
    } finally {
        if (yarnCluster != null) {
            yarnCluster.shutdown();
        }
    }
}
Also used: Path (org.apache.hadoop.fs.Path), AkkaActorGateway (org.apache.flink.runtime.instance.AkkaActorGateway), YarnConfiguration (org.apache.hadoop.yarn.conf.YarnConfiguration), Configuration (org.apache.flink.configuration.Configuration), GlobalConfiguration (org.apache.flink.configuration.GlobalConfiguration), FiniteDuration (scala.concurrent.duration.FiniteDuration), ClusterClient (org.apache.flink.client.program.ClusterClient), TestingJobManagerMessages (org.apache.flink.runtime.testingUtils.TestingJobManagerMessages), LeaderRetrievalService (org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService), ActorGateway (org.apache.flink.runtime.instance.ActorGateway), File (java.io.File), JavaTestKit (akka.testkit.JavaTestKit), Test (org.junit.Test)

Example 10 with LeaderRetrievalService

Use of org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService in project flink by apache.

From the class SingleLeaderElectionServiceTest, method testShutdown.

@Test
public void testShutdown() throws Exception {
    final UUID uuid = UUID.randomUUID();
    final SingleLeaderElectionService service = new SingleLeaderElectionService(executor, uuid);
    // create a leader contender and let it grab leadership
    final LeaderContender contender = mockContender(service);
    service.start(contender);
    verify(contender, times(1)).grantLeadership(uuid);
    // some leader listeners
    final LeaderRetrievalListener listener1 = mock(LeaderRetrievalListener.class);
    final LeaderRetrievalListener listener2 = mock(LeaderRetrievalListener.class);
    LeaderRetrievalService listenerService1 = service.createLeaderRetrievalService();
    LeaderRetrievalService listenerService2 = service.createLeaderRetrievalService();
    listenerService1.start(listener1);
    listenerService2.start(listener2);
    // one listener stops
    listenerService1.stop();
    // shut down the service
    service.shutdown();
    // the leader contender and running listener should get error notifications
    verify(contender, times(1)).handleError(any(Exception.class));
    verify(listener2, times(1)).handleError(any(Exception.class));
    // the stopped listener gets no notification
    verify(listener1, times(0)).handleError(any(Exception.class));
    // should not be possible to start again after shutdown
    try {
        service.start(contender);
        fail("should fail with an exception");
    } catch (IllegalStateException e) {
        // expected
    }
    // no additional leadership grant
    verify(contender, times(1)).grantLeadership(any(UUID.class));
}
Also used: LeaderRetrievalService (org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService), LeaderContender (org.apache.flink.runtime.leaderelection.LeaderContender), LeaderRetrievalListener (org.apache.flink.runtime.leaderretrieval.LeaderRetrievalListener), UUID (java.util.UUID), Test (org.junit.Test)
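The mockContender helper used above is not shown in this snippet. A plausible Mockito-based sketch of what it might do (an assumption for illustration, not the actual Flink test helper): create a LeaderContender mock that confirms whatever session ID it is granted back to the election service.

import static org.mockito.Matchers.any;
import static org.mockito.Mockito.doAnswer;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;

import java.util.UUID;
import org.apache.flink.runtime.leaderelection.LeaderContender;
import org.apache.flink.runtime.leaderelection.LeaderElectionService;
import org.mockito.invocation.InvocationOnMock;
import org.mockito.stubbing.Answer;

public class MockContenderSketch {

    static LeaderContender mockContender(final LeaderElectionService service) {
        final LeaderContender contender = mock(LeaderContender.class);

        // the address is illustrative; any non-null string will do for the test
        when(contender.getAddress()).thenReturn("akka://flink/user/contender");

        // when leadership is granted, confirm the session ID back to the service
        doAnswer(new Answer<Void>() {
            @Override
            public Void answer(InvocationOnMock invocation) {
                UUID leaderSessionId = (UUID) invocation.getArguments()[0];
                service.confirmLeaderSessionID(leaderSessionId);
                return null;
            }
        }).when(contender).grantLeadership(any(UUID.class));

        return contender;
    }
}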

Aggregations

LeaderRetrievalService (org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService): 27 usages
Configuration (org.apache.flink.configuration.Configuration): 18 usages
Test (org.junit.Test): 16 usages
ActorSystem (akka.actor.ActorSystem): 11 usages
ActorRef (akka.actor.ActorRef): 10 usages
UUID (java.util.UUID): 9 usages
Deadline (scala.concurrent.duration.Deadline): 8 usages
ActorGateway (org.apache.flink.runtime.instance.ActorGateway): 7 usages
AkkaActorGateway (org.apache.flink.runtime.instance.AkkaActorGateway): 6 usages
FiniteDuration (scala.concurrent.duration.FiniteDuration): 6 usages
File (java.io.File): 5 usages
JobGraph (org.apache.flink.runtime.jobgraph.JobGraph): 5 usages
TestingListener (org.apache.flink.runtime.leaderelection.TestingListener): 5 usages
Props (akka.actor.Props): 4 usages
ResourceID (org.apache.flink.runtime.clusterframework.types.ResourceID): 4 usages
SubmitJob (org.apache.flink.runtime.messages.JobManagerMessages.SubmitJob): 4 usages
JobManagerProcess (org.apache.flink.runtime.testutils.JobManagerProcess): 4 usages
Some (scala.Some): 4 usages
JavaTestKit (akka.testkit.JavaTestKit): 3 usages
IOException (java.io.IOException): 3 usages