Search in sources :

Example 6 with ActorRef

use of akka.actor.ActorRef in project flink by apache.

the class AkkaRpcService method stopServer.

@Override
public void stopServer(RpcGateway selfGateway) {
    if (selfGateway instanceof AkkaGateway) {
        AkkaGateway akkaClient = (AkkaGateway) selfGateway;
        boolean fromThisService;
        synchronized (lock) {
            if (stopped) {
                return;
            } else {
                fromThisService = actors.remove(akkaClient.getRpcEndpoint());
            }
        }
        if (fromThisService) {
            ActorRef selfActorRef = akkaClient.getRpcEndpoint();
            LOG.info("Stopping RPC endpoint {}.", selfActorRef.path());
            selfActorRef.tell(PoisonPill.getInstance(), ActorRef.noSender());
        } else {
            LOG.debug("RPC endpoint {} already stopped or from different RPC service");
        }
    }
}
Also used : ActorRef(akka.actor.ActorRef)

Example 7 with ActorRef

use of akka.actor.ActorRef in project flink by apache.

the class JobManagerHARecoveryTest method testJobRecoveryWhenLosingLeadership.

/**
	 * Tests that the persisted job is not removed from the SubmittedJobGraphStore if the JobManager
	 * loses its leadership. Furthermore, it tests that the job manager can recover the job from
	 * the SubmittedJobGraphStore and checkpoint state is recovered as well.
	 */
@Test
public void testJobRecoveryWhenLosingLeadership() throws Exception {
    FiniteDuration timeout = new FiniteDuration(30, TimeUnit.SECONDS);
    FiniteDuration jobRecoveryTimeout = new FiniteDuration(3, TimeUnit.SECONDS);
    Deadline deadline = new FiniteDuration(2, TimeUnit.MINUTES).fromNow();
    Configuration flinkConfiguration = new Configuration();
    UUID leaderSessionID = UUID.randomUUID();
    UUID newLeaderSessionID = UUID.randomUUID();
    int slots = 2;
    ActorRef archive = null;
    ActorRef jobManager = null;
    ActorRef taskManager = null;
    flinkConfiguration.setString(HighAvailabilityOptions.HA_MODE, "zookeeper");
    flinkConfiguration.setString(HighAvailabilityOptions.HA_STORAGE_PATH, temporaryFolder.newFolder().toString());
    flinkConfiguration.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, slots);
    try {
        Scheduler scheduler = new Scheduler(TestingUtils.defaultExecutionContext());
        MySubmittedJobGraphStore mySubmittedJobGraphStore = new MySubmittedJobGraphStore();
        MyCheckpointStore checkpointStore = new MyCheckpointStore();
        CheckpointIDCounter checkpointCounter = new StandaloneCheckpointIDCounter();
        CheckpointRecoveryFactory checkpointStateFactory = new MyCheckpointRecoveryFactory(checkpointStore, checkpointCounter);
        TestingLeaderElectionService myLeaderElectionService = new TestingLeaderElectionService();
        TestingLeaderRetrievalService myLeaderRetrievalService = new TestingLeaderRetrievalService();
        InstanceManager instanceManager = new InstanceManager();
        instanceManager.addInstanceListener(scheduler);
        archive = system.actorOf(Props.create(MemoryArchivist.class, 10));
        Props jobManagerProps = Props.create(TestingJobManager.class, flinkConfiguration, TestingUtils.defaultExecutor(), TestingUtils.defaultExecutor(), instanceManager, scheduler, new BlobLibraryCacheManager(new BlobServer(flinkConfiguration), 3600000), archive, new FixedDelayRestartStrategy.FixedDelayRestartStrategyFactory(Int.MaxValue(), 100), timeout, myLeaderElectionService, mySubmittedJobGraphStore, checkpointStateFactory, jobRecoveryTimeout, Option.apply(null));
        jobManager = system.actorOf(jobManagerProps);
        ActorGateway gateway = new AkkaActorGateway(jobManager, leaderSessionID);
        taskManager = TaskManager.startTaskManagerComponentsAndActor(flinkConfiguration, ResourceID.generate(), system, "localhost", Option.apply("taskmanager"), Option.apply((LeaderRetrievalService) myLeaderRetrievalService), true, TestingTaskManager.class);
        ActorGateway tmGateway = new AkkaActorGateway(taskManager, leaderSessionID);
        Future<Object> tmAlive = tmGateway.ask(TestingMessages.getAlive(), deadline.timeLeft());
        Await.ready(tmAlive, deadline.timeLeft());
        JobVertex sourceJobVertex = new JobVertex("Source");
        sourceJobVertex.setInvokableClass(BlockingStatefulInvokable.class);
        sourceJobVertex.setParallelism(slots);
        JobGraph jobGraph = new JobGraph("TestingJob", sourceJobVertex);
        List<JobVertexID> vertexId = Collections.singletonList(sourceJobVertex.getID());
        jobGraph.setSnapshotSettings(new JobSnapshottingSettings(vertexId, vertexId, vertexId, 100, 10 * 60 * 1000, 0, 1, ExternalizedCheckpointSettings.none(), null, true));
        BlockingStatefulInvokable.initializeStaticHelpers(slots);
        Future<Object> isLeader = gateway.ask(TestingJobManagerMessages.getNotifyWhenLeader(), deadline.timeLeft());
        Future<Object> isConnectedToJobManager = tmGateway.ask(new TestingTaskManagerMessages.NotifyWhenRegisteredAtJobManager(jobManager), deadline.timeLeft());
        // tell jobManager that he's the leader
        myLeaderElectionService.isLeader(leaderSessionID);
        // tell taskManager who's the leader
        myLeaderRetrievalService.notifyListener(gateway.path(), leaderSessionID);
        Await.ready(isLeader, deadline.timeLeft());
        Await.ready(isConnectedToJobManager, deadline.timeLeft());
        // submit blocking job
        Future<Object> jobSubmitted = gateway.ask(new JobManagerMessages.SubmitJob(jobGraph, ListeningBehaviour.DETACHED), deadline.timeLeft());
        Await.ready(jobSubmitted, deadline.timeLeft());
        // Wait for some checkpoints to complete
        BlockingStatefulInvokable.awaitCompletedCheckpoints();
        Future<Object> jobRemoved = gateway.ask(new TestingJobManagerMessages.NotifyWhenJobRemoved(jobGraph.getJobID()), deadline.timeLeft());
        // Revoke leadership
        myLeaderElectionService.notLeader();
        // check that the job gets removed from the JobManager
        Await.ready(jobRemoved, deadline.timeLeft());
        // but stays in the submitted job graph store
        assertTrue(mySubmittedJobGraphStore.contains(jobGraph.getJobID()));
        Future<Object> jobRunning = gateway.ask(new TestingJobManagerMessages.NotifyWhenJobStatus(jobGraph.getJobID(), JobStatus.RUNNING), deadline.timeLeft());
        // Make JobManager again a leader
        myLeaderElectionService.isLeader(newLeaderSessionID);
        // tell the TaskManager about it
        myLeaderRetrievalService.notifyListener(gateway.path(), newLeaderSessionID);
        // wait that the job is recovered and reaches state RUNNING
        Await.ready(jobRunning, deadline.timeLeft());
        Future<Object> jobFinished = gateway.ask(new TestingJobManagerMessages.NotifyWhenJobRemoved(jobGraph.getJobID()), deadline.timeLeft());
        BlockingInvokable.unblock();
        // wait til the job has finished
        Await.ready(jobFinished, deadline.timeLeft());
        // check that the job has been removed from the submitted job graph store
        assertFalse(mySubmittedJobGraphStore.contains(jobGraph.getJobID()));
        // Check that state has been recovered
        long[] recoveredStates = BlockingStatefulInvokable.getRecoveredStates();
        for (long state : recoveredStates) {
            boolean isExpected = state >= BlockingStatefulInvokable.NUM_CHECKPOINTS_TO_COMPLETE;
            assertTrue("Did not recover checkpoint state correctly, expecting >= " + BlockingStatefulInvokable.NUM_CHECKPOINTS_TO_COMPLETE + ", but state was " + state, isExpected);
        }
    } finally {
        if (archive != null) {
            archive.tell(PoisonPill.getInstance(), ActorRef.noSender());
        }
        if (jobManager != null) {
            jobManager.tell(PoisonPill.getInstance(), ActorRef.noSender());
        }
        if (taskManager != null) {
            taskManager.tell(PoisonPill.getInstance(), ActorRef.noSender());
        }
    }
}
Also used : AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) BlobLibraryCacheManager(org.apache.flink.runtime.execution.librarycache.BlobLibraryCacheManager) Configuration(org.apache.flink.configuration.Configuration) TestingLeaderRetrievalService(org.apache.flink.runtime.leaderelection.TestingLeaderRetrievalService) FixedDelayRestartStrategy(org.apache.flink.runtime.executiongraph.restart.FixedDelayRestartStrategy) ActorRef(akka.actor.ActorRef) Scheduler(org.apache.flink.runtime.jobmanager.scheduler.Scheduler) InstanceManager(org.apache.flink.runtime.instance.InstanceManager) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) Props(akka.actor.Props) TestingJobManagerMessages(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) TestingTaskManager(org.apache.flink.runtime.testingUtils.TestingTaskManager) BlobServer(org.apache.flink.runtime.blob.BlobServer) CheckpointIDCounter(org.apache.flink.runtime.checkpoint.CheckpointIDCounter) StandaloneCheckpointIDCounter(org.apache.flink.runtime.checkpoint.StandaloneCheckpointIDCounter) UUID(java.util.UUID) TestingTaskManagerMessages(org.apache.flink.runtime.testingUtils.TestingTaskManagerMessages) TestingLeaderElectionService(org.apache.flink.runtime.leaderelection.TestingLeaderElectionService) Deadline(scala.concurrent.duration.Deadline) JobSnapshottingSettings(org.apache.flink.runtime.jobgraph.tasks.JobSnapshottingSettings) JobManagerMessages(org.apache.flink.runtime.messages.JobManagerMessages) TestingJobManagerMessages(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages) FiniteDuration(scala.concurrent.duration.FiniteDuration) CheckpointRecoveryFactory(org.apache.flink.runtime.checkpoint.CheckpointRecoveryFactory) CompletedCheckpoint(org.apache.flink.runtime.checkpoint.CompletedCheckpoint) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) JobVertex(org.apache.flink.runtime.jobgraph.JobVertex) StandaloneCheckpointIDCounter(org.apache.flink.runtime.checkpoint.StandaloneCheckpointIDCounter) Test(org.junit.Test)

Example 8 with ActorRef

use of akka.actor.ActorRef in project flink by apache.

the class JobManagerTest method testKvStateMessages.

/**
	 * Tests that the JobManager handles {@link org.apache.flink.runtime.query.KvStateMessage}
	 * instances as expected.
	 */
@Test
public void testKvStateMessages() throws Exception {
    Deadline deadline = new FiniteDuration(100, TimeUnit.SECONDS).fromNow();
    Configuration config = new Configuration();
    config.setString(ConfigConstants.AKKA_ASK_TIMEOUT, "100ms");
    UUID leaderSessionId = null;
    ActorGateway jobManager = new AkkaActorGateway(JobManager.startJobManagerActors(config, system, TestingUtils.defaultExecutor(), TestingUtils.defaultExecutor(), TestingJobManager.class, MemoryArchivist.class)._1(), leaderSessionId);
    LeaderRetrievalService leaderRetrievalService = new StandaloneLeaderRetrievalService(AkkaUtils.getAkkaURL(system, jobManager.actor()));
    Configuration tmConfig = new Configuration();
    tmConfig.setInteger(ConfigConstants.TASK_MANAGER_MEMORY_SIZE_KEY, 4);
    tmConfig.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, 8);
    ActorRef taskManager = TaskManager.startTaskManagerComponentsAndActor(tmConfig, ResourceID.generate(), system, "localhost", scala.Option.<String>empty(), scala.Option.apply(leaderRetrievalService), true, TestingTaskManager.class);
    Future<Object> registrationFuture = jobManager.ask(new NotifyWhenAtLeastNumTaskManagerAreRegistered(1), deadline.timeLeft());
    Await.ready(registrationFuture, deadline.timeLeft());
    //
    // Location lookup
    //
    LookupKvStateLocation lookupNonExistingJob = new LookupKvStateLocation(new JobID(), "any-name");
    Future<KvStateLocation> lookupFuture = jobManager.ask(lookupNonExistingJob, deadline.timeLeft()).mapTo(ClassTag$.MODULE$.<KvStateLocation>apply(KvStateLocation.class));
    try {
        Await.result(lookupFuture, deadline.timeLeft());
        fail("Did not throw expected Exception");
    } catch (IllegalStateException ignored) {
    // Expected
    }
    JobGraph jobGraph = new JobGraph("croissant");
    JobVertex jobVertex1 = new JobVertex("cappuccino");
    jobVertex1.setParallelism(4);
    jobVertex1.setMaxParallelism(16);
    jobVertex1.setInvokableClass(BlockingNoOpInvokable.class);
    JobVertex jobVertex2 = new JobVertex("americano");
    jobVertex2.setParallelism(4);
    jobVertex2.setMaxParallelism(16);
    jobVertex2.setInvokableClass(BlockingNoOpInvokable.class);
    jobGraph.addVertex(jobVertex1);
    jobGraph.addVertex(jobVertex2);
    Future<JobSubmitSuccess> submitFuture = jobManager.ask(new SubmitJob(jobGraph, ListeningBehaviour.DETACHED), deadline.timeLeft()).mapTo(ClassTag$.MODULE$.<JobSubmitSuccess>apply(JobSubmitSuccess.class));
    Await.result(submitFuture, deadline.timeLeft());
    Object lookupUnknownRegistrationName = new LookupKvStateLocation(jobGraph.getJobID(), "unknown");
    lookupFuture = jobManager.ask(lookupUnknownRegistrationName, deadline.timeLeft()).mapTo(ClassTag$.MODULE$.<KvStateLocation>apply(KvStateLocation.class));
    try {
        Await.result(lookupFuture, deadline.timeLeft());
        fail("Did not throw expected Exception");
    } catch (UnknownKvStateLocation ignored) {
    // Expected
    }
    //
    // Registration
    //
    NotifyKvStateRegistered registerNonExistingJob = new NotifyKvStateRegistered(new JobID(), new JobVertexID(), new KeyGroupRange(0, 0), "any-name", new KvStateID(), new KvStateServerAddress(InetAddress.getLocalHost(), 1233));
    jobManager.tell(registerNonExistingJob);
    LookupKvStateLocation lookupAfterRegistration = new LookupKvStateLocation(registerNonExistingJob.getJobId(), registerNonExistingJob.getRegistrationName());
    lookupFuture = jobManager.ask(lookupAfterRegistration, deadline.timeLeft()).mapTo(ClassTag$.MODULE$.<KvStateLocation>apply(KvStateLocation.class));
    try {
        Await.result(lookupFuture, deadline.timeLeft());
        fail("Did not throw expected Exception");
    } catch (IllegalStateException ignored) {
    // Expected
    }
    NotifyKvStateRegistered registerForExistingJob = new NotifyKvStateRegistered(jobGraph.getJobID(), jobVertex1.getID(), new KeyGroupRange(0, 0), "register-me", new KvStateID(), new KvStateServerAddress(InetAddress.getLocalHost(), 1293));
    jobManager.tell(registerForExistingJob);
    lookupAfterRegistration = new LookupKvStateLocation(registerForExistingJob.getJobId(), registerForExistingJob.getRegistrationName());
    lookupFuture = jobManager.ask(lookupAfterRegistration, deadline.timeLeft()).mapTo(ClassTag$.MODULE$.<KvStateLocation>apply(KvStateLocation.class));
    KvStateLocation location = Await.result(lookupFuture, deadline.timeLeft());
    assertNotNull(location);
    assertEquals(jobGraph.getJobID(), location.getJobId());
    assertEquals(jobVertex1.getID(), location.getJobVertexId());
    assertEquals(jobVertex1.getMaxParallelism(), location.getNumKeyGroups());
    assertEquals(1, location.getNumRegisteredKeyGroups());
    KeyGroupRange keyGroupRange = registerForExistingJob.getKeyGroupRange();
    assertEquals(1, keyGroupRange.getNumberOfKeyGroups());
    assertEquals(registerForExistingJob.getKvStateId(), location.getKvStateID(keyGroupRange.getStartKeyGroup()));
    assertEquals(registerForExistingJob.getKvStateServerAddress(), location.getKvStateServerAddress(keyGroupRange.getStartKeyGroup()));
    //
    // Unregistration
    //
    NotifyKvStateUnregistered unregister = new NotifyKvStateUnregistered(registerForExistingJob.getJobId(), registerForExistingJob.getJobVertexId(), registerForExistingJob.getKeyGroupRange(), registerForExistingJob.getRegistrationName());
    jobManager.tell(unregister);
    lookupFuture = jobManager.ask(lookupAfterRegistration, deadline.timeLeft()).mapTo(ClassTag$.MODULE$.<KvStateLocation>apply(KvStateLocation.class));
    try {
        Await.result(lookupFuture, deadline.timeLeft());
        fail("Did not throw expected Exception");
    } catch (UnknownKvStateLocation ignored) {
    // Expected
    }
    //
    // Duplicate registration fails task
    //
    NotifyKvStateRegistered register = new NotifyKvStateRegistered(jobGraph.getJobID(), jobVertex1.getID(), new KeyGroupRange(0, 0), "duplicate-me", new KvStateID(), new KvStateServerAddress(InetAddress.getLocalHost(), 1293));
    NotifyKvStateRegistered duplicate = new NotifyKvStateRegistered(jobGraph.getJobID(), // <--- different operator, but...
    jobVertex2.getID(), new KeyGroupRange(0, 0), // ...same name
    "duplicate-me", new KvStateID(), new KvStateServerAddress(InetAddress.getLocalHost(), 1293));
    Future<TestingJobManagerMessages.JobStatusIs> failedFuture = jobManager.ask(new NotifyWhenJobStatus(jobGraph.getJobID(), JobStatus.FAILED), deadline.timeLeft()).mapTo(ClassTag$.MODULE$.<JobStatusIs>apply(JobStatusIs.class));
    jobManager.tell(register);
    jobManager.tell(duplicate);
    // Wait for failure
    JobStatusIs jobStatus = Await.result(failedFuture, deadline.timeLeft());
    assertEquals(JobStatus.FAILED, jobStatus.state());
}
Also used : AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) Configuration(org.apache.flink.configuration.Configuration) UnknownKvStateLocation(org.apache.flink.runtime.query.UnknownKvStateLocation) ActorRef(akka.actor.ActorRef) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) KeyGroupRange(org.apache.flink.runtime.state.KeyGroupRange) KvStateServerAddress(org.apache.flink.runtime.query.KvStateServerAddress) LookupKvStateLocation(org.apache.flink.runtime.query.KvStateMessage.LookupKvStateLocation) KvStateLocation(org.apache.flink.runtime.query.KvStateLocation) UnknownKvStateLocation(org.apache.flink.runtime.query.UnknownKvStateLocation) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) JobSubmitSuccess(org.apache.flink.runtime.messages.JobManagerMessages.JobSubmitSuccess) KvStateID(org.apache.flink.runtime.query.KvStateID) UUID(java.util.UUID) SubmitJob(org.apache.flink.runtime.messages.JobManagerMessages.SubmitJob) NotifyKvStateRegistered(org.apache.flink.runtime.query.KvStateMessage.NotifyKvStateRegistered) NotifyKvStateUnregistered(org.apache.flink.runtime.query.KvStateMessage.NotifyKvStateUnregistered) JobStatusIs(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.JobStatusIs) Deadline(scala.concurrent.duration.Deadline) FiniteDuration(scala.concurrent.duration.FiniteDuration) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) JobVertex(org.apache.flink.runtime.jobgraph.JobVertex) StandaloneLeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.StandaloneLeaderRetrievalService) LeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService) StandaloneLeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.StandaloneLeaderRetrievalService) LookupKvStateLocation(org.apache.flink.runtime.query.KvStateMessage.LookupKvStateLocation) NotifyWhenAtLeastNumTaskManagerAreRegistered(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.NotifyWhenAtLeastNumTaskManagerAreRegistered) JobID(org.apache.flink.api.common.JobID) NotifyWhenJobStatus(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.NotifyWhenJobStatus) Test(org.junit.Test)

Example 9 with ActorRef

use of akka.actor.ActorRef in project flink by apache.

the class JobManagerLeaderElectionTest method testLeaderElection.

/**
	 * Tests that a single JobManager is elected as the leader by ZooKeeper.
	 */
@Test
public void testLeaderElection() throws Exception {
    final Configuration configuration = ZooKeeperTestUtils.createZooKeeperHAConfig(testingServer.getConnectString(), tempFolder.getRoot().getPath());
    ActorRef jm = null;
    try {
        Props jmProps = createJobManagerProps(configuration);
        jm = actorSystem.actorOf(jmProps);
        Future<Object> leaderFuture = Patterns.ask(jm, TestingJobManagerMessages.getNotifyWhenLeader(), timeout);
        Await.ready(leaderFuture, duration);
    } finally {
        TestingUtils.stopActor(jm);
    }
}
Also used : Configuration(org.apache.flink.configuration.Configuration) ActorRef(akka.actor.ActorRef) Props(akka.actor.Props) Test(org.junit.Test)

Example 10 with ActorRef

use of akka.actor.ActorRef in project flink by apache.

the class MesosApplicationMasterRunner method runPrivileged.

// ------------------------------------------------------------------------
//  Core work method
// ------------------------------------------------------------------------
/**
	 * The main work method, must run as a privileged action.
	 *
	 * @return The return code for the Java process.
	 */
protected int runPrivileged(Configuration config, Configuration dynamicProperties) {
    ActorSystem actorSystem = null;
    WebMonitor webMonitor = null;
    MesosArtifactServer artifactServer = null;
    ScheduledExecutorService futureExecutor = null;
    ExecutorService ioExecutor = null;
    MesosServices mesosServices = null;
    try {
        // ------- (1) load and parse / validate all configurations -------
        // Note that we use the "appMasterHostname" given by the system, to make sure
        // we use the hostnames consistently throughout akka.
        // for akka "localhost" and "localhost.localdomain" are different actors.
        final String appMasterHostname = InetAddress.getLocalHost().getHostName();
        // Mesos configuration
        final MesosConfiguration mesosConfig = createMesosConfig(config, appMasterHostname);
        // JM configuration
        int numberProcessors = Hardware.getNumberCPUCores();
        futureExecutor = Executors.newScheduledThreadPool(numberProcessors, new ExecutorThreadFactory("mesos-jobmanager-future"));
        ioExecutor = Executors.newFixedThreadPool(numberProcessors, new ExecutorThreadFactory("mesos-jobmanager-io"));
        mesosServices = MesosServicesUtils.createMesosServices(config);
        // TM configuration
        final MesosTaskManagerParameters taskManagerParameters = MesosTaskManagerParameters.create(config);
        LOG.info("TaskManagers will be created with {} task slots", taskManagerParameters.containeredParameters().numSlots());
        LOG.info("TaskManagers will be started with container size {} MB, JVM heap size {} MB, " + "JVM direct memory limit {} MB, {} cpus", taskManagerParameters.containeredParameters().taskManagerTotalMemoryMB(), taskManagerParameters.containeredParameters().taskManagerHeapSizeMB(), taskManagerParameters.containeredParameters().taskManagerDirectMemoryLimitMB(), taskManagerParameters.cpus());
        // JM endpoint, which should be explicitly configured based on acquired net resources
        final int listeningPort = config.getInteger(ConfigConstants.JOB_MANAGER_IPC_PORT_KEY, ConfigConstants.DEFAULT_JOB_MANAGER_IPC_PORT);
        checkState(listeningPort >= 0 && listeningPort <= 65536, "Config parameter \"" + ConfigConstants.JOB_MANAGER_IPC_PORT_KEY + "\" is invalid, it must be between 0 and 65536");
        // ----------------- (2) start the actor system -------------------
        // try to start the actor system, JobManager and JobManager actor system
        // using the configured address and ports
        actorSystem = BootstrapTools.startActorSystem(config, appMasterHostname, listeningPort, LOG);
        Address address = AkkaUtils.getAddress(actorSystem);
        final String akkaHostname = address.host().get();
        final int akkaPort = (Integer) address.port().get();
        LOG.info("Actor system bound to hostname {}.", akkaHostname);
        // try to start the artifact server
        LOG.debug("Starting Artifact Server");
        final int artifactServerPort = config.getInteger(ConfigConstants.MESOS_ARTIFACT_SERVER_PORT_KEY, ConfigConstants.DEFAULT_MESOS_ARTIFACT_SERVER_PORT);
        final String artifactServerPrefix = UUID.randomUUID().toString();
        artifactServer = new MesosArtifactServer(artifactServerPrefix, akkaHostname, artifactServerPort, config);
        // ----------------- (3) Generate the configuration for the TaskManagers -------------------
        // generate a container spec which conveys the artifacts/vars needed to launch a TM
        ContainerSpecification taskManagerContainerSpec = new ContainerSpecification();
        // propagate the AM dynamic configuration to the TM
        taskManagerContainerSpec.getDynamicConfiguration().addAll(dynamicProperties);
        // propagate newly-generated configuration elements
        final Configuration taskManagerConfig = BootstrapTools.generateTaskManagerConfiguration(new Configuration(), akkaHostname, akkaPort, taskManagerParameters.containeredParameters().numSlots(), TASKMANAGER_REGISTRATION_TIMEOUT);
        taskManagerContainerSpec.getDynamicConfiguration().addAll(taskManagerConfig);
        // apply the overlays
        applyOverlays(config, taskManagerContainerSpec);
        // configure the artifact server to serve the specified artifacts
        configureArtifactServer(artifactServer, taskManagerContainerSpec);
        // ----------------- (4) start the actors -------------------
        // 1) JobManager & Archive (in non-HA case, the leader service takes this)
        // 2) Web Monitor (we need its port to register)
        // 3) Resource Master for Mesos
        // 4) Process reapers for the JobManager and Resource Master
        // 1: the JobManager
        LOG.debug("Starting JobManager actor");
        // we start the JobManager with its standard name
        ActorRef jobManager = JobManager.startJobManagerActors(config, actorSystem, futureExecutor, ioExecutor, new scala.Some<>(JobManager.JOB_MANAGER_NAME()), scala.Option.<String>empty(), getJobManagerClass(), getArchivistClass())._1();
        // 2: the web monitor
        LOG.debug("Starting Web Frontend");
        webMonitor = BootstrapTools.startWebMonitorIfConfigured(config, actorSystem, jobManager, LOG);
        if (webMonitor != null) {
            final URL webMonitorURL = new URL("http", appMasterHostname, webMonitor.getServerPort(), "/");
            mesosConfig.frameworkInfo().setWebuiUrl(webMonitorURL.toExternalForm());
        }
        // 3: Flink's Mesos ResourceManager
        LOG.debug("Starting Mesos Flink Resource Manager");
        // create the worker store to persist task information across restarts
        MesosWorkerStore workerStore = mesosServices.createMesosWorkerStore(config, ioExecutor);
        // we need the leader retrieval service here to be informed of new
        // leader session IDs, even though there can be only one leader ever
        LeaderRetrievalService leaderRetriever = LeaderRetrievalUtils.createLeaderRetrievalService(config, jobManager);
        Props resourceMasterProps = MesosFlinkResourceManager.createActorProps(getResourceManagerClass(), config, mesosConfig, workerStore, leaderRetriever, taskManagerParameters, taskManagerContainerSpec, artifactServer, LOG);
        ActorRef resourceMaster = actorSystem.actorOf(resourceMasterProps, "Mesos_Resource_Master");
        // 4: Process reapers
        // The process reapers ensure that upon unexpected actor death, the process exits
        // and does not stay lingering around unresponsive
        LOG.debug("Starting process reapers for JobManager");
        actorSystem.actorOf(Props.create(ProcessReaper.class, resourceMaster, LOG, ACTOR_DIED_EXIT_CODE), "Mesos_Resource_Master_Process_Reaper");
        actorSystem.actorOf(Props.create(ProcessReaper.class, jobManager, LOG, ACTOR_DIED_EXIT_CODE), "JobManager_Process_Reaper");
    } catch (Throwable t) {
        // make sure that everything whatever ends up in the log
        LOG.error("Mesos JobManager initialization failed", t);
        if (webMonitor != null) {
            try {
                webMonitor.stop();
            } catch (Throwable ignored) {
                LOG.warn("Failed to stop the web frontend", ignored);
            }
        }
        if (artifactServer != null) {
            try {
                artifactServer.stop();
            } catch (Throwable ignored) {
                LOG.error("Failed to stop the artifact server", ignored);
            }
        }
        if (actorSystem != null) {
            try {
                actorSystem.shutdown();
            } catch (Throwable tt) {
                LOG.error("Error shutting down actor system", tt);
            }
        }
        if (futureExecutor != null) {
            try {
                futureExecutor.shutdownNow();
            } catch (Throwable tt) {
                LOG.error("Error shutting down future executor", tt);
            }
        }
        if (ioExecutor != null) {
            try {
                ioExecutor.shutdownNow();
            } catch (Throwable tt) {
                LOG.error("Error shutting down io executor", tt);
            }
        }
        if (mesosServices != null) {
            try {
                mesosServices.close(false);
            } catch (Throwable tt) {
                LOG.error("Error closing the mesos services.", tt);
            }
        }
        return INIT_ERROR_EXIT_CODE;
    }
    // everything started, we can wait until all is done or the process is killed
    LOG.info("Mesos JobManager started");
    // wait until everything is done
    actorSystem.awaitTermination();
    // if we get here, everything work out jolly all right, and we even exited smoothly
    if (webMonitor != null) {
        try {
            webMonitor.stop();
        } catch (Throwable t) {
            LOG.error("Failed to stop the web frontend", t);
        }
    }
    try {
        artifactServer.stop();
    } catch (Throwable t) {
        LOG.error("Failed to stop the artifact server", t);
    }
    org.apache.flink.runtime.concurrent.Executors.gracefulShutdown(AkkaUtils.getTimeout(config).toMillis(), TimeUnit.MILLISECONDS, futureExecutor, ioExecutor);
    try {
        mesosServices.close(true);
    } catch (Throwable t) {
        LOG.error("Failed to clean up and close MesosServices.", t);
    }
    return 0;
}
Also used : ActorSystem(akka.actor.ActorSystem) ScheduledExecutorService(java.util.concurrent.ScheduledExecutorService) InetAddress(java.net.InetAddress) Address(akka.actor.Address) MesosConfiguration(org.apache.flink.mesos.util.MesosConfiguration) Configuration(org.apache.flink.configuration.Configuration) GlobalConfiguration(org.apache.flink.configuration.GlobalConfiguration) ProcessReaper(org.apache.flink.runtime.process.ProcessReaper) ActorRef(akka.actor.ActorRef) ContainerSpecification(org.apache.flink.runtime.clusterframework.ContainerSpecification) Props(akka.actor.Props) URL(java.net.URL) ExecutorThreadFactory(org.apache.flink.runtime.util.ExecutorThreadFactory) MesosConfiguration(org.apache.flink.mesos.util.MesosConfiguration) MesosArtifactServer(org.apache.flink.mesos.util.MesosArtifactServer) LeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService) WebMonitor(org.apache.flink.runtime.webmonitor.WebMonitor) ScheduledExecutorService(java.util.concurrent.ScheduledExecutorService) ExecutorService(java.util.concurrent.ExecutorService) MesosWorkerStore(org.apache.flink.mesos.runtime.clusterframework.store.MesosWorkerStore) MesosServices(org.apache.flink.mesos.runtime.clusterframework.services.MesosServices)

Aggregations

ActorRef (akka.actor.ActorRef)383 Test (org.junit.Test)253 TestActorRef (akka.testkit.TestActorRef)124 TestKit (akka.testkit.javadsl.TestKit)84 ActorSystem (akka.actor.ActorSystem)53 FiniteDuration (scala.concurrent.duration.FiniteDuration)53 Props (akka.actor.Props)46 Timeout (akka.util.Timeout)43 Configuration (org.apache.flink.configuration.Configuration)42 AbstractShardManagerTest (org.opendaylight.controller.cluster.datastore.AbstractShardManagerTest)38 UpdateSchemaContext (org.opendaylight.controller.cluster.datastore.messages.UpdateSchemaContext)37 AkkaActorGateway (org.apache.flink.runtime.instance.AkkaActorGateway)33 ActorGateway (org.apache.flink.runtime.instance.ActorGateway)30 ActorInitialized (org.opendaylight.controller.cluster.datastore.messages.ActorInitialized)26 NormalizedNodeAggregatorTest (org.opendaylight.controller.cluster.datastore.utils.NormalizedNodeAggregatorTest)26 AddressFromURIString (akka.actor.AddressFromURIString)24 ArrayList (java.util.ArrayList)22 JobID (org.apache.flink.api.common.JobID)22 IOException (java.io.IOException)21 RoleChangeNotification (org.opendaylight.controller.cluster.notifications.RoleChangeNotification)20