Search in sources :

Example 51 with ActorSystem

use of akka.actor.ActorSystem in project flink by apache.

the class TaskManagerComponentsStartupShutdownTest method testComponentsStartupShutdown.

/**
	 * Makes sure that all components are shut down when the TaskManager
	 * actor is shut down.
	 */
@Test
public void testComponentsStartupShutdown() {
    final String[] TMP_DIR = new String[] { ConfigConstants.DEFAULT_TASK_MANAGER_TMP_PATH };
    final Time timeout = Time.seconds(100);
    final int BUFFER_SIZE = 32 * 1024;
    Configuration config = new Configuration();
    config.setString(ConfigConstants.AKKA_WATCH_HEARTBEAT_INTERVAL, "200 ms");
    config.setString(ConfigConstants.AKKA_WATCH_HEARTBEAT_PAUSE, "1 s");
    config.setInteger(ConfigConstants.AKKA_WATCH_THRESHOLD, 1);
    ActorSystem actorSystem = null;
    try {
        actorSystem = AkkaUtils.createLocalActorSystem(config);
        final ActorRef jobManager = JobManager.startJobManagerActors(config, actorSystem, TestingUtils.defaultExecutor(), TestingUtils.defaultExecutor(), JobManager.class, MemoryArchivist.class)._1();
        FlinkResourceManager.startResourceManagerActors(config, actorSystem, LeaderRetrievalUtils.createLeaderRetrievalService(config, jobManager), StandaloneResourceManager.class);
        final int numberOfSlots = 1;
        // create the components for the TaskManager manually
        final TaskManagerConfiguration tmConfig = new TaskManagerConfiguration(numberOfSlots, TMP_DIR, timeout, null, Time.milliseconds(500), Time.seconds(30), Time.seconds(10), // cleanup interval
        1000000, config, // exit-jvm-on-fatal-error
        false);
        final NetworkEnvironmentConfiguration netConf = new NetworkEnvironmentConfiguration(32, BUFFER_SIZE, MemoryType.HEAP, IOManager.IOMode.SYNC, 0, 0, 2, 8, null);
        ResourceID taskManagerId = ResourceID.generate();
        final TaskManagerLocation connectionInfo = new TaskManagerLocation(taskManagerId, InetAddress.getLocalHost(), 10000);
        final MemoryManager memManager = new MemoryManager(32 * BUFFER_SIZE, 1, BUFFER_SIZE, MemoryType.HEAP, false);
        final IOManager ioManager = new IOManagerAsync(TMP_DIR);
        final NetworkEnvironment network = new NetworkEnvironment(new NetworkBufferPool(netConf.numNetworkBuffers(), netConf.networkBufferSize(), netConf.memoryType()), new LocalConnectionManager(), new ResultPartitionManager(), new TaskEventDispatcher(), new KvStateRegistry(), null, netConf.ioMode(), netConf.partitionRequestInitialBackoff(), netConf.partitionRequestMaxBackoff(), netConf.networkBuffersPerChannel(), netConf.extraNetworkBuffersPerGate());
        network.start();
        LeaderRetrievalService leaderRetrievalService = new StandaloneLeaderRetrievalService(jobManager.path().toString());
        MetricRegistryConfiguration metricRegistryConfiguration = MetricRegistryConfiguration.fromConfiguration(config);
        // create the task manager
        final Props tmProps = Props.create(TaskManager.class, tmConfig, taskManagerId, connectionInfo, memManager, ioManager, network, numberOfSlots, leaderRetrievalService, new MetricRegistry(metricRegistryConfiguration));
        final ActorRef taskManager = actorSystem.actorOf(tmProps);
        new JavaTestKit(actorSystem) {

            {
                // wait for the TaskManager to be registered
                new Within(new FiniteDuration(5000, TimeUnit.SECONDS)) {

                    @Override
                    protected void run() {
                        taskManager.tell(TaskManagerMessages.getNotifyWhenRegisteredAtJobManagerMessage(), getTestActor());
                        expectMsgEquals(TaskManagerMessages.getRegisteredAtJobManagerMessage());
                    }
                };
            }
        };
        // shut down all actors and the actor system
        // Kill the Task down the JobManager
        taskManager.tell(Kill.getInstance(), ActorRef.noSender());
        jobManager.tell(Kill.getInstance(), ActorRef.noSender());
        // shut down the actors and the actor system
        actorSystem.shutdown();
        actorSystem.awaitTermination();
        actorSystem = null;
        // now that the TaskManager is shut down, the components should be shut down as well
        assertTrue(network.isShutdown());
        assertTrue(ioManager.isProperlyShutDown());
        assertTrue(memManager.isShutdown());
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
    } finally {
        if (actorSystem != null) {
            actorSystem.shutdown();
        }
    }
}
Also used : ActorSystem(akka.actor.ActorSystem) KvStateRegistry(org.apache.flink.runtime.query.KvStateRegistry) MemoryArchivist(org.apache.flink.runtime.jobmanager.MemoryArchivist) MetricRegistryConfiguration(org.apache.flink.runtime.metrics.MetricRegistryConfiguration) Configuration(org.apache.flink.configuration.Configuration) TaskManagerConfiguration(org.apache.flink.runtime.taskexecutor.TaskManagerConfiguration) ActorRef(akka.actor.ActorRef) Time(org.apache.flink.api.common.time.Time) JobManager(org.apache.flink.runtime.jobmanager.JobManager) MetricRegistryConfiguration(org.apache.flink.runtime.metrics.MetricRegistryConfiguration) Props(akka.actor.Props) IOManagerAsync(org.apache.flink.runtime.io.disk.iomanager.IOManagerAsync) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) TaskManagerConfiguration(org.apache.flink.runtime.taskexecutor.TaskManagerConfiguration) IOManager(org.apache.flink.runtime.io.disk.iomanager.IOManager) MetricRegistry(org.apache.flink.runtime.metrics.MetricRegistry) FiniteDuration(scala.concurrent.duration.FiniteDuration) MemoryManager(org.apache.flink.runtime.memory.MemoryManager) ResultPartitionManager(org.apache.flink.runtime.io.network.partition.ResultPartitionManager) NetworkBufferPool(org.apache.flink.runtime.io.network.buffer.NetworkBufferPool) LocalConnectionManager(org.apache.flink.runtime.io.network.LocalConnectionManager) LeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService) StandaloneLeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.StandaloneLeaderRetrievalService) StandaloneLeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.StandaloneLeaderRetrievalService) NetworkEnvironment(org.apache.flink.runtime.io.network.NetworkEnvironment) TaskEventDispatcher(org.apache.flink.runtime.io.network.TaskEventDispatcher) JavaTestKit(akka.testkit.JavaTestKit) Test(org.junit.Test)

Example 52 with ActorSystem

use of akka.actor.ActorSystem in project flink by apache.

the class SavepointITCase method testTriggerSavepointAndResumeWithFileBasedCheckpoints.

/**
	 * Triggers a savepoint for a job that uses the FsStateBackend. We expect
	 * that all checkpoint files are written to a new savepoint directory.
	 *
	 * <ol>
	 * <li>Submit job, wait for some progress</li>
	 * <li>Trigger savepoint and verify that savepoint has been created</li>
	 * <li>Shut down the cluster, re-submit the job from the savepoint,
	 * verify that the initial state has been reset, and
	 * all tasks are running again</li>
	 * <li>Cancel job, dispose the savepoint, and verify that everything
	 * has been cleaned up</li>
	 * </ol>
	 */
@Test
public void testTriggerSavepointAndResumeWithFileBasedCheckpoints() throws Exception {
    // Config
    final int numTaskManagers = 2;
    final int numSlotsPerTaskManager = 2;
    final int parallelism = numTaskManagers * numSlotsPerTaskManager;
    final Deadline deadline = new FiniteDuration(5, TimeUnit.MINUTES).fromNow();
    final File testRoot = folder.newFolder();
    TestingCluster flink = null;
    try {
        // Create a test actor system
        ActorSystem testActorSystem = AkkaUtils.createDefaultActorSystem();
        // Flink configuration
        final Configuration config = new Configuration();
        config.setInteger(ConfigConstants.LOCAL_NUMBER_TASK_MANAGER, numTaskManagers);
        config.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, numSlotsPerTaskManager);
        final File checkpointDir = new File(testRoot, "checkpoints");
        final File savepointRootDir = new File(testRoot, "savepoints");
        if (!checkpointDir.mkdir() || !savepointRootDir.mkdirs()) {
            fail("Test setup failed: failed to create temporary directories.");
        }
        // Use file based checkpoints
        config.setString(CoreOptions.STATE_BACKEND, "filesystem");
        config.setString(FsStateBackendFactory.CHECKPOINT_DIRECTORY_URI_CONF_KEY, checkpointDir.toURI().toString());
        config.setString(FsStateBackendFactory.MEMORY_THRESHOLD_CONF_KEY, "0");
        config.setString(ConfigConstants.SAVEPOINT_DIRECTORY_KEY, savepointRootDir.toURI().toString());
        // Start Flink
        flink = new TestingCluster(config);
        flink.start(true);
        // Submit the job
        final JobGraph jobGraph = createJobGraph(parallelism, 0, 1000);
        final JobID jobId = jobGraph.getJobID();
        // Reset the static test job helpers
        StatefulCounter.resetForTest(parallelism);
        // Retrieve the job manager
        ActorGateway jobManager = Await.result(flink.leaderGateway().future(), deadline.timeLeft());
        LOG.info("Submitting job " + jobGraph.getJobID() + " in detached mode.");
        flink.submitJobDetached(jobGraph);
        LOG.info("Waiting for some progress.");
        // wait for the JobManager to be ready
        Future<Object> allRunning = jobManager.ask(new WaitForAllVerticesToBeRunning(jobId), deadline.timeLeft());
        Await.ready(allRunning, deadline.timeLeft());
        // wait for the Tasks to be ready
        StatefulCounter.getProgressLatch().await(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);
        LOG.info("Triggering a savepoint.");
        Future<Object> savepointPathFuture = jobManager.ask(new TriggerSavepoint(jobId, Option.<String>empty()), deadline.timeLeft());
        final String savepointPath = ((TriggerSavepointSuccess) Await.result(savepointPathFuture, deadline.timeLeft())).savepointPath();
        LOG.info("Retrieved savepoint path: " + savepointPath + ".");
        // Retrieve the savepoint from the testing job manager
        LOG.info("Requesting the savepoint.");
        Future<Object> savepointFuture = jobManager.ask(new RequestSavepoint(savepointPath), deadline.timeLeft());
        SavepointV1 savepoint = (SavepointV1) ((ResponseSavepoint) Await.result(savepointFuture, deadline.timeLeft())).savepoint();
        LOG.info("Retrieved savepoint: " + savepointPath + ".");
        // Shut down the Flink cluster (thereby canceling the job)
        LOG.info("Shutting down Flink cluster.");
        flink.shutdown();
        flink.awaitTermination();
        // - Verification START -------------------------------------------
        // Only one savepoint should exist
        File[] files = savepointRootDir.listFiles();
        if (files != null) {
            assertEquals("Savepoint not created in expected directory", 1, files.length);
            assertTrue("Savepoint did not create self-contained directory", files[0].isDirectory());
            File savepointDir = files[0];
            File[] savepointFiles = savepointDir.listFiles();
            assertNotNull(savepointFiles);
            // Expect one metadata file and one checkpoint file per stateful
            // parallel subtask
            String errMsg = "Did not write expected number of savepoint/checkpoint files to directory: " + Arrays.toString(savepointFiles);
            assertEquals(errMsg, 1 + parallelism, savepointFiles.length);
        } else {
            fail("Savepoint not created in expected directory");
        }
        // We currently have the following directory layout: checkpointDir/jobId/chk-ID
        File jobCheckpoints = new File(checkpointDir, jobId.toString());
        if (jobCheckpoints.exists()) {
            files = jobCheckpoints.listFiles();
            assertNotNull("Checkpoint directory empty", files);
            assertEquals("Checkpoints directory not clean: " + Arrays.toString(files), 0, files.length);
        }
        // - Verification END ---------------------------------------------
        // Restart the cluster
        LOG.info("Restarting Flink cluster.");
        flink.start();
        // Retrieve the job manager
        LOG.info("Retrieving JobManager.");
        jobManager = Await.result(flink.leaderGateway().future(), deadline.timeLeft());
        LOG.info("JobManager: " + jobManager + ".");
        // Reset static test helpers
        StatefulCounter.resetForTest(parallelism);
        // Gather all task deployment descriptors
        final Throwable[] error = new Throwable[1];
        final TestingCluster finalFlink = flink;
        final Multimap<JobVertexID, TaskDeploymentDescriptor> tdds = HashMultimap.create();
        new JavaTestKit(testActorSystem) {

            {
                new Within(deadline.timeLeft()) {

                    @Override
                    protected void run() {
                        try {
                            // Register to all submit task messages for job
                            for (ActorRef taskManager : finalFlink.getTaskManagersAsJava()) {
                                taskManager.tell(new TestingTaskManagerMessages.RegisterSubmitTaskListener(jobId), getTestActor());
                            }
                            // Set the savepoint path
                            jobGraph.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath));
                            LOG.info("Resubmitting job " + jobGraph.getJobID() + " with " + "savepoint path " + savepointPath + " in detached mode.");
                            // Submit the job
                            finalFlink.submitJobDetached(jobGraph);
                            int numTasks = 0;
                            for (JobVertex jobVertex : jobGraph.getVertices()) {
                                numTasks += jobVertex.getParallelism();
                            }
                            // Gather the task deployment descriptors
                            LOG.info("Gathering " + numTasks + " submitted " + "TaskDeploymentDescriptor instances.");
                            for (int i = 0; i < numTasks; i++) {
                                ResponseSubmitTaskListener resp = (ResponseSubmitTaskListener) expectMsgAnyClassOf(getRemainingTime(), ResponseSubmitTaskListener.class);
                                TaskDeploymentDescriptor tdd = resp.tdd();
                                LOG.info("Received: " + tdd.toString() + ".");
                                TaskInformation taskInformation = tdd.getSerializedTaskInformation().deserializeValue(getClass().getClassLoader());
                                tdds.put(taskInformation.getJobVertexId(), tdd);
                            }
                        } catch (Throwable t) {
                            error[0] = t;
                        }
                    }
                };
            }
        };
        // - Verification START -------------------------------------------
        String errMsg = "Error during gathering of TaskDeploymentDescriptors";
        assertNull(errMsg, error[0]);
        // have a matching task deployment descriptor.
        for (TaskState taskState : savepoint.getTaskStates()) {
            Collection<TaskDeploymentDescriptor> taskTdds = tdds.get(taskState.getJobVertexID());
            errMsg = "Missing task for savepoint state for operator " + taskState.getJobVertexID() + ".";
            assertTrue(errMsg, taskTdds.size() > 0);
            assertEquals(taskState.getNumberCollectedStates(), taskTdds.size());
            for (TaskDeploymentDescriptor tdd : taskTdds) {
                SubtaskState subtaskState = taskState.getState(tdd.getSubtaskIndex());
                assertNotNull(subtaskState);
                errMsg = "Initial operator state mismatch.";
                assertEquals(errMsg, subtaskState.getLegacyOperatorState(), tdd.getTaskStateHandles().getLegacyOperatorState());
            }
        }
        // Await state is restored
        StatefulCounter.getRestoreLatch().await(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);
        // Await some progress after restore
        StatefulCounter.getProgressLatch().await(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);
        // - Verification END ---------------------------------------------
        LOG.info("Cancelling job " + jobId + ".");
        jobManager.tell(new CancelJob(jobId));
        LOG.info("Disposing savepoint " + savepointPath + ".");
        Future<Object> disposeFuture = jobManager.ask(new DisposeSavepoint(savepointPath), deadline.timeLeft());
        errMsg = "Failed to dispose savepoint " + savepointPath + ".";
        Object resp = Await.result(disposeFuture, deadline.timeLeft());
        assertTrue(errMsg, resp.getClass() == getDisposeSavepointSuccess().getClass());
        // - Verification START -------------------------------------------
        // The checkpoint files
        List<File> checkpointFiles = new ArrayList<>();
        for (TaskState stateForTaskGroup : savepoint.getTaskStates()) {
            for (SubtaskState subtaskState : stateForTaskGroup.getStates()) {
                ChainedStateHandle<StreamStateHandle> streamTaskState = subtaskState.getLegacyOperatorState();
                for (int i = 0; i < streamTaskState.getLength(); i++) {
                    if (streamTaskState.get(i) != null) {
                        FileStateHandle fileStateHandle = (FileStateHandle) streamTaskState.get(i);
                        checkpointFiles.add(new File(fileStateHandle.getFilePath().toUri()));
                    }
                }
            }
        }
        // The checkpoint files of the savepoint should have been discarded
        for (File f : checkpointFiles) {
            errMsg = "Checkpoint file " + f + " not cleaned up properly.";
            assertFalse(errMsg, f.exists());
        }
        if (checkpointFiles.size() > 0) {
            File parent = checkpointFiles.get(0).getParentFile();
            errMsg = "Checkpoint parent directory " + parent + " not cleaned up properly.";
            assertFalse(errMsg, parent.exists());
        }
        // All savepoints should have been cleaned up
        errMsg = "Savepoints directory not cleaned up properly: " + Arrays.toString(savepointRootDir.listFiles()) + ".";
        assertEquals(errMsg, 0, savepointRootDir.listFiles().length);
    // - Verification END ---------------------------------------------
    } finally {
        if (flink != null) {
            flink.shutdown();
        }
    }
}
Also used : ActorSystem(akka.actor.ActorSystem) RequestSavepoint(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.RequestSavepoint) Configuration(org.apache.flink.configuration.Configuration) ActorRef(akka.actor.ActorRef) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) ArrayList(java.util.ArrayList) ResponseSubmitTaskListener(org.apache.flink.runtime.testingUtils.TestingTaskManagerMessages.ResponseSubmitTaskListener) TestingCluster(org.apache.flink.runtime.testingUtils.TestingCluster) StreamStateHandle(org.apache.flink.runtime.state.StreamStateHandle) SavepointV1(org.apache.flink.runtime.checkpoint.savepoint.SavepointV1) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) TaskDeploymentDescriptor(org.apache.flink.runtime.deployment.TaskDeploymentDescriptor) CancelJob(org.apache.flink.runtime.messages.JobManagerMessages.CancelJob) TestingTaskManagerMessages(org.apache.flink.runtime.testingUtils.TestingTaskManagerMessages) TaskInformation(org.apache.flink.runtime.executiongraph.TaskInformation) WaitForAllVerticesToBeRunning(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.WaitForAllVerticesToBeRunning) Deadline(scala.concurrent.duration.Deadline) FiniteDuration(scala.concurrent.duration.FiniteDuration) FileStateHandle(org.apache.flink.runtime.state.filesystem.FileStateHandle) TriggerSavepoint(org.apache.flink.runtime.messages.JobManagerMessages.TriggerSavepoint) ResponseSavepoint(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.ResponseSavepoint) RequestSavepoint(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.RequestSavepoint) DisposeSavepoint(org.apache.flink.runtime.messages.JobManagerMessages.DisposeSavepoint) TriggerSavepointSuccess(org.apache.flink.runtime.messages.JobManagerMessages.TriggerSavepointSuccess) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) JobVertex(org.apache.flink.runtime.jobgraph.JobVertex) DisposeSavepoint(org.apache.flink.runtime.messages.JobManagerMessages.DisposeSavepoint) SubtaskState(org.apache.flink.runtime.checkpoint.SubtaskState) TriggerSavepoint(org.apache.flink.runtime.messages.JobManagerMessages.TriggerSavepoint) File(java.io.File) TaskState(org.apache.flink.runtime.checkpoint.TaskState) JobID(org.apache.flink.api.common.JobID) JavaTestKit(akka.testkit.JavaTestKit) Test(org.junit.Test)

Example 53 with ActorSystem

use of akka.actor.ActorSystem in project flink by apache.

the class JobRetrievalITCase method testJobRetrieval.

@Test
public void testJobRetrieval() throws Exception {
    final JobID jobID = new JobID();
    final JobVertex imalock = new JobVertex("imalock");
    imalock.setInvokableClass(SemaphoreInvokable.class);
    final JobGraph jobGraph = new JobGraph(jobID, "testjob", imalock);
    final ClusterClient client = new StandaloneClusterClient(cluster.configuration());
    // acquire the lock to make sure that the job cannot complete until the job client
    // has been attached in resumingThread
    lock.acquire();
    client.runDetached(jobGraph, JobRetrievalITCase.class.getClassLoader());
    final Thread resumingThread = new Thread(new Runnable() {

        @Override
        public void run() {
            try {
                assertNotNull(client.retrieveJob(jobID));
            } catch (Throwable e) {
                fail(e.getMessage());
            }
        }
    });
    final Seq<ActorSystem> actorSystemSeq = cluster.jobManagerActorSystems().get();
    final ActorSystem actorSystem = actorSystemSeq.last();
    JavaTestKit testkit = new JavaTestKit(actorSystem);
    final ActorRef jm = cluster.getJobManagersAsJava().get(0);
    // wait until client connects
    jm.tell(TestingJobManagerMessages.getNotifyWhenClientConnects(), testkit.getRef());
    // confirm registration
    testkit.expectMsgEquals(true);
    // kick off resuming
    resumingThread.start();
    // wait for client to connect
    testkit.expectMsgAllOf(TestingJobManagerMessages.getClientConnected(), TestingJobManagerMessages.getClassLoadingPropsDelivered());
    // client has connected, we can release the lock
    lock.release();
    resumingThread.join();
}
Also used : ActorSystem(akka.actor.ActorSystem) ActorRef(akka.actor.ActorRef) StandaloneClusterClient(org.apache.flink.client.program.StandaloneClusterClient) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) StandaloneClusterClient(org.apache.flink.client.program.StandaloneClusterClient) ClusterClient(org.apache.flink.client.program.ClusterClient) JobVertex(org.apache.flink.runtime.jobgraph.JobVertex) JobID(org.apache.flink.api.common.JobID) JavaTestKit(akka.testkit.JavaTestKit) Test(org.junit.Test)

Example 54 with ActorSystem

use of akka.actor.ActorSystem in project flink by apache.

the class YarnApplicationMasterRunner method runApplicationMaster.

// ------------------------------------------------------------------------
//  Core work method
// ------------------------------------------------------------------------
/**
	 * The main work method, must run as a privileged action.
	 *
	 * @return The return code for the Java process.
	 */
protected int runApplicationMaster(Configuration config) {
    ActorSystem actorSystem = null;
    WebMonitor webMonitor = null;
    int numberProcessors = Hardware.getNumberCPUCores();
    final ScheduledExecutorService futureExecutor = Executors.newScheduledThreadPool(numberProcessors, new ExecutorThreadFactory("yarn-jobmanager-future"));
    final ExecutorService ioExecutor = Executors.newFixedThreadPool(numberProcessors, new ExecutorThreadFactory("yarn-jobmanager-io"));
    try {
        // ------- (1) load and parse / validate all configurations -------
        // loading all config values here has the advantage that the program fails fast, if any
        // configuration problem occurs
        final String currDir = ENV.get(Environment.PWD.key());
        require(currDir != null, "Current working directory variable (%s) not set", Environment.PWD.key());
        // Note that we use the "appMasterHostname" given by YARN here, to make sure
        // we use the hostnames given by YARN consistently throughout akka.
        // for akka "localhost" and "localhost.localdomain" are different actors.
        final String appMasterHostname = ENV.get(Environment.NM_HOST.key());
        require(appMasterHostname != null, "ApplicationMaster hostname variable %s not set", Environment.NM_HOST.key());
        LOG.info("YARN assigned hostname for application master: {}", appMasterHostname);
        //Update keytab and principal path to reflect YARN container path location
        final String remoteKeytabPath = ENV.get(YarnConfigKeys.KEYTAB_PATH);
        final String remoteKeytabPrincipal = ENV.get(YarnConfigKeys.KEYTAB_PRINCIPAL);
        String keytabPath = null;
        if (remoteKeytabPath != null) {
            File f = new File(currDir, Utils.KEYTAB_FILE_NAME);
            keytabPath = f.getAbsolutePath();
            LOG.info("keytabPath: {}", keytabPath);
        }
        if (keytabPath != null && remoteKeytabPrincipal != null) {
            config.setString(SecurityOptions.KERBEROS_LOGIN_KEYTAB, keytabPath);
            config.setString(SecurityOptions.KERBEROS_LOGIN_PRINCIPAL, remoteKeytabPrincipal);
        }
        // Hadoop/Yarn configuration (loads config data automatically from classpath files)
        final YarnConfiguration yarnConfig = new YarnConfiguration();
        final int taskManagerContainerMemory;
        final int numInitialTaskManagers;
        final int slotsPerTaskManager;
        try {
            taskManagerContainerMemory = Integer.parseInt(ENV.get(YarnConfigKeys.ENV_TM_MEMORY));
        } catch (NumberFormatException e) {
            throw new RuntimeException("Invalid value for " + YarnConfigKeys.ENV_TM_MEMORY + " : " + e.getMessage());
        }
        try {
            numInitialTaskManagers = Integer.parseInt(ENV.get(YarnConfigKeys.ENV_TM_COUNT));
        } catch (NumberFormatException e) {
            throw new RuntimeException("Invalid value for " + YarnConfigKeys.ENV_TM_COUNT + " : " + e.getMessage());
        }
        try {
            slotsPerTaskManager = Integer.parseInt(ENV.get(YarnConfigKeys.ENV_SLOTS));
        } catch (NumberFormatException e) {
            throw new RuntimeException("Invalid value for " + YarnConfigKeys.ENV_SLOTS + " : " + e.getMessage());
        }
        final ContaineredTaskManagerParameters taskManagerParameters = ContaineredTaskManagerParameters.create(config, taskManagerContainerMemory, slotsPerTaskManager);
        LOG.info("TaskManagers will be created with {} task slots", taskManagerParameters.numSlots());
        LOG.info("TaskManagers will be started with container size {} MB, JVM heap size {} MB, " + "JVM direct memory limit {} MB", taskManagerParameters.taskManagerTotalMemoryMB(), taskManagerParameters.taskManagerHeapSizeMB(), taskManagerParameters.taskManagerDirectMemoryLimitMB());
        // ----------------- (2) start the actor system -------------------
        // try to start the actor system, JobManager and JobManager actor system
        // using the port range definition from the config.
        final String amPortRange = config.getString(ConfigConstants.YARN_APPLICATION_MASTER_PORT, ConfigConstants.DEFAULT_YARN_JOB_MANAGER_PORT);
        actorSystem = BootstrapTools.startActorSystem(config, appMasterHostname, amPortRange, LOG);
        final String akkaHostname = AkkaUtils.getAddress(actorSystem).host().get();
        final int akkaPort = (Integer) AkkaUtils.getAddress(actorSystem).port().get();
        LOG.info("Actor system bound to hostname {}.", akkaHostname);
        // ---- (3) Generate the configuration for the TaskManagers
        final Configuration taskManagerConfig = BootstrapTools.generateTaskManagerConfiguration(config, akkaHostname, akkaPort, slotsPerTaskManager, TASKMANAGER_REGISTRATION_TIMEOUT);
        LOG.debug("TaskManager configuration: {}", taskManagerConfig);
        final ContainerLaunchContext taskManagerContext = Utils.createTaskExecutorContext(config, yarnConfig, ENV, taskManagerParameters, taskManagerConfig, currDir, getTaskManagerClass(), LOG);
        // ---- (4) start the actors and components in this order:
        // 1) JobManager & Archive (in non-HA case, the leader service takes this)
        // 2) Web Monitor (we need its port to register)
        // 3) Resource Master for YARN
        // 4) Process reapers for the JobManager and Resource Master
        // 1: the JobManager
        LOG.debug("Starting JobManager actor");
        // we start the JobManager with its standard name
        ActorRef jobManager = JobManager.startJobManagerActors(config, actorSystem, futureExecutor, ioExecutor, new Some<>(JobManager.JOB_MANAGER_NAME()), Option.<String>empty(), getJobManagerClass(), getArchivistClass())._1();
        // 2: the web monitor
        LOG.debug("Starting Web Frontend");
        webMonitor = BootstrapTools.startWebMonitorIfConfigured(config, actorSystem, jobManager, LOG);
        String protocol = "http://";
        if (config.getBoolean(ConfigConstants.JOB_MANAGER_WEB_SSL_ENABLED, ConfigConstants.DEFAULT_JOB_MANAGER_WEB_SSL_ENABLED) && SSLUtils.getSSLEnabled(config)) {
            protocol = "https://";
        }
        final String webMonitorURL = webMonitor == null ? null : protocol + appMasterHostname + ":" + webMonitor.getServerPort();
        // 3: Flink's Yarn ResourceManager
        LOG.debug("Starting YARN Flink Resource Manager");
        // we need the leader retrieval service here to be informed of new leaders and session IDs
        LeaderRetrievalService leaderRetriever = LeaderRetrievalUtils.createLeaderRetrievalService(config, jobManager);
        Props resourceMasterProps = YarnFlinkResourceManager.createActorProps(getResourceManagerClass(), config, yarnConfig, leaderRetriever, appMasterHostname, webMonitorURL, taskManagerParameters, taskManagerContext, numInitialTaskManagers, LOG);
        ActorRef resourceMaster = actorSystem.actorOf(resourceMasterProps);
        // 4: Process reapers
        // The process reapers ensure that upon unexpected actor death, the process exits
        // and does not stay lingering around unresponsive
        LOG.debug("Starting process reapers for JobManager and YARN Application Master");
        actorSystem.actorOf(Props.create(ProcessReaper.class, resourceMaster, LOG, ACTOR_DIED_EXIT_CODE), "YARN_Resource_Master_Process_Reaper");
        actorSystem.actorOf(Props.create(ProcessReaper.class, jobManager, LOG, ACTOR_DIED_EXIT_CODE), "JobManager_Process_Reaper");
    } catch (Throwable t) {
        // make sure that everything whatever ends up in the log
        LOG.error("YARN Application Master initialization failed", t);
        if (webMonitor != null) {
            try {
                webMonitor.stop();
            } catch (Throwable ignored) {
                LOG.warn("Failed to stop the web frontend", t);
            }
        }
        if (actorSystem != null) {
            try {
                actorSystem.shutdown();
            } catch (Throwable tt) {
                LOG.error("Error shutting down actor system", tt);
            }
        }
        futureExecutor.shutdownNow();
        ioExecutor.shutdownNow();
        return INIT_ERROR_EXIT_CODE;
    }
    // everything started, we can wait until all is done or the process is killed
    LOG.info("YARN Application Master started");
    // wait until everything is done
    actorSystem.awaitTermination();
    // if we get here, everything work out jolly all right, and we even exited smoothly
    if (webMonitor != null) {
        try {
            webMonitor.stop();
        } catch (Throwable t) {
            LOG.error("Failed to stop the web frontend", t);
        }
    }
    org.apache.flink.runtime.concurrent.Executors.gracefulShutdown(AkkaUtils.getTimeout(config).toMillis(), TimeUnit.MILLISECONDS, futureExecutor, ioExecutor);
    return 0;
}
Also used : ActorSystem(akka.actor.ActorSystem) ScheduledExecutorService(java.util.concurrent.ScheduledExecutorService) YarnConfiguration(org.apache.hadoop.yarn.conf.YarnConfiguration) Configuration(org.apache.flink.configuration.Configuration) GlobalConfiguration(org.apache.flink.configuration.GlobalConfiguration) ProcessReaper(org.apache.flink.runtime.process.ProcessReaper) ActorRef(akka.actor.ActorRef) ContaineredTaskManagerParameters(org.apache.flink.runtime.clusterframework.ContaineredTaskManagerParameters) ContainerLaunchContext(org.apache.hadoop.yarn.api.records.ContainerLaunchContext) Props(akka.actor.Props) ExecutorThreadFactory(org.apache.flink.runtime.util.ExecutorThreadFactory) Some(scala.Some) YarnConfiguration(org.apache.hadoop.yarn.conf.YarnConfiguration) LeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService) WebMonitor(org.apache.flink.runtime.webmonitor.WebMonitor) ScheduledExecutorService(java.util.concurrent.ScheduledExecutorService) ExecutorService(java.util.concurrent.ExecutorService) File(java.io.File)

Example 55 with ActorSystem

use of akka.actor.ActorSystem in project flink by apache.

the class AkkaRpcServiceTest method testTerminationFuture.

/**
	 * Tests that we can wait for the termination of the rpc service
	 *
	 * @throws ExecutionException
	 * @throws InterruptedException
	 */
@Test(timeout = 60000)
public void testTerminationFuture() throws ExecutionException, InterruptedException {
    final ActorSystem actorSystem = AkkaUtils.createDefaultActorSystem();
    final AkkaRpcService rpcService = new AkkaRpcService(actorSystem, Time.milliseconds(1000));
    Future<Void> terminationFuture = rpcService.getTerminationFuture();
    assertFalse(terminationFuture.isDone());
    FlinkFuture.supplyAsync(new Callable<Void>() {

        @Override
        public Void call() throws Exception {
            rpcService.stopService();
            return null;
        }
    }, actorSystem.dispatcher());
    terminationFuture.get();
}
Also used : ActorSystem(akka.actor.ActorSystem) TimeoutException(java.util.concurrent.TimeoutException) ExecutionException(java.util.concurrent.ExecutionException) Test(org.junit.Test)

Aggregations

ActorSystem (akka.actor.ActorSystem)91 ActorRef (akka.actor.ActorRef)54 Test (org.junit.Test)51 Configuration (org.apache.flink.configuration.Configuration)27 FiniteDuration (scala.concurrent.duration.FiniteDuration)12 File (java.io.File)11 ActorGateway (org.apache.flink.runtime.instance.ActorGateway)11 LeaderRetrievalService (org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService)11 Props (akka.actor.Props)10 JobGraph (org.apache.flink.runtime.jobgraph.JobGraph)10 TestActorRef (akka.testkit.TestActorRef)8 IOException (java.io.IOException)8 AkkaActorGateway (org.apache.flink.runtime.instance.AkkaActorGateway)8 JobVertex (org.apache.flink.runtime.jobgraph.JobVertex)8 Deadline (scala.concurrent.duration.Deadline)8 AddressFromURIString (akka.actor.AddressFromURIString)7 ActorMaterializer (akka.stream.ActorMaterializer)7 Materializer (akka.stream.Materializer)7 Sink (akka.stream.javadsl.Sink)7 Source (akka.stream.javadsl.Source)7