
Example 1 with WebMonitor

use of org.apache.flink.runtime.webmonitor.WebMonitor in project flink by apache.

the class MesosApplicationMasterRunner method runPrivileged.

// ------------------------------------------------------------------------
//  Core work method
// ------------------------------------------------------------------------
/**
 * The main work method, must run as a privileged action.
 *
 * @return The return code for the Java process.
 */
protected int runPrivileged(Configuration config, Configuration dynamicProperties) {
    ActorSystem actorSystem = null;
    WebMonitor webMonitor = null;
    MesosArtifactServer artifactServer = null;
    ScheduledExecutorService futureExecutor = null;
    ExecutorService ioExecutor = null;
    MesosServices mesosServices = null;
    try {
        // ------- (1) load and parse / validate all configurations -------
        // Note that we use the "appMasterHostname" given by the system, to make sure
        // we use the hostnames consistently throughout akka.
        // for akka "localhost" and "localhost.localdomain" are different actors.
        final String appMasterHostname = InetAddress.getLocalHost().getHostName();
        // Mesos configuration
        final MesosConfiguration mesosConfig = createMesosConfig(config, appMasterHostname);
        // JM configuration
        int numberProcessors = Hardware.getNumberCPUCores();
        futureExecutor = Executors.newScheduledThreadPool(numberProcessors, new ExecutorThreadFactory("mesos-jobmanager-future"));
        ioExecutor = Executors.newFixedThreadPool(numberProcessors, new ExecutorThreadFactory("mesos-jobmanager-io"));
        mesosServices = MesosServicesUtils.createMesosServices(config);
        // TM configuration
        final MesosTaskManagerParameters taskManagerParameters = MesosTaskManagerParameters.create(config);
        LOG.info("TaskManagers will be created with {} task slots", taskManagerParameters.containeredParameters().numSlots());
        LOG.info("TaskManagers will be started with container size {} MB, JVM heap size {} MB, " + "JVM direct memory limit {} MB, {} cpus", taskManagerParameters.containeredParameters().taskManagerTotalMemoryMB(), taskManagerParameters.containeredParameters().taskManagerHeapSizeMB(), taskManagerParameters.containeredParameters().taskManagerDirectMemoryLimitMB(), taskManagerParameters.cpus());
        // JM endpoint, which should be explicitly configured based on acquired net resources
        final int listeningPort = config.getInteger(ConfigConstants.JOB_MANAGER_IPC_PORT_KEY, ConfigConstants.DEFAULT_JOB_MANAGER_IPC_PORT);
        checkState(listeningPort >= 0 && listeningPort <= 65535, "Config parameter \"" + ConfigConstants.JOB_MANAGER_IPC_PORT_KEY + "\" is invalid, it must be between 0 and 65535");
        // ----------------- (2) start the actor system -------------------
        // try to start the JobManager actor system
        // using the configured address and ports
        actorSystem = BootstrapTools.startActorSystem(config, appMasterHostname, listeningPort, LOG);
        Address address = AkkaUtils.getAddress(actorSystem);
        final String akkaHostname = address.host().get();
        final int akkaPort = (Integer) address.port().get();
        LOG.info("Actor system bound to hostname {}.", akkaHostname);
        // try to start the artifact server
        LOG.debug("Starting Artifact Server");
        final int artifactServerPort = config.getInteger(ConfigConstants.MESOS_ARTIFACT_SERVER_PORT_KEY, ConfigConstants.DEFAULT_MESOS_ARTIFACT_SERVER_PORT);
        final String artifactServerPrefix = UUID.randomUUID().toString();
        artifactServer = new MesosArtifactServer(artifactServerPrefix, akkaHostname, artifactServerPort, config);
        // ----------------- (3) Generate the configuration for the TaskManagers -------------------
        // generate a container spec which conveys the artifacts/vars needed to launch a TM
        ContainerSpecification taskManagerContainerSpec = new ContainerSpecification();
        // propagate the AM dynamic configuration to the TM
        taskManagerContainerSpec.getDynamicConfiguration().addAll(dynamicProperties);
        // propagate newly-generated configuration elements
        final Configuration taskManagerConfig = BootstrapTools.generateTaskManagerConfiguration(new Configuration(), akkaHostname, akkaPort, taskManagerParameters.containeredParameters().numSlots(), TASKMANAGER_REGISTRATION_TIMEOUT);
        taskManagerContainerSpec.getDynamicConfiguration().addAll(taskManagerConfig);
        // apply the overlays
        applyOverlays(config, taskManagerContainerSpec);
        // configure the artifact server to serve the specified artifacts
        configureArtifactServer(artifactServer, taskManagerContainerSpec);
        // ----------------- (4) start the actors -------------------
        // 1) JobManager & Archive (in non-HA case, the leader service takes this)
        // 2) Web Monitor (we need its port to register)
        // 3) Resource Master for Mesos
        // 4) Process reapers for the JobManager and Resource Master
        // 1: the JobManager
        LOG.debug("Starting JobManager actor");
        // we start the JobManager with its standard name
        ActorRef jobManager = JobManager.startJobManagerActors(config, actorSystem, futureExecutor, ioExecutor, new scala.Some<>(JobManager.JOB_MANAGER_NAME()), scala.Option.<String>empty(), getJobManagerClass(), getArchivistClass())._1();
        // 2: the web monitor
        LOG.debug("Starting Web Frontend");
        webMonitor = BootstrapTools.startWebMonitorIfConfigured(config, actorSystem, jobManager, LOG);
        if (webMonitor != null) {
            final URL webMonitorURL = new URL("http", appMasterHostname, webMonitor.getServerPort(), "/");
            mesosConfig.frameworkInfo().setWebuiUrl(webMonitorURL.toExternalForm());
        }
        // 3: Flink's Mesos ResourceManager
        LOG.debug("Starting Mesos Flink Resource Manager");
        // create the worker store to persist task information across restarts
        MesosWorkerStore workerStore = mesosServices.createMesosWorkerStore(config, ioExecutor);
        // we need the leader retrieval service here to be informed of new
        // leader session IDs, even though there can be only one leader ever
        LeaderRetrievalService leaderRetriever = LeaderRetrievalUtils.createLeaderRetrievalService(config, jobManager);
        Props resourceMasterProps = MesosFlinkResourceManager.createActorProps(getResourceManagerClass(), config, mesosConfig, workerStore, leaderRetriever, taskManagerParameters, taskManagerContainerSpec, artifactServer, LOG);
        ActorRef resourceMaster = actorSystem.actorOf(resourceMasterProps, "Mesos_Resource_Master");
        // 4: Process reapers
        // The process reapers ensure that upon unexpected actor death, the process exits
        // and does not stay lingering around unresponsive
        LOG.debug("Starting process reapers for JobManager");
        actorSystem.actorOf(Props.create(ProcessReaper.class, resourceMaster, LOG, ACTOR_DIED_EXIT_CODE), "Mesos_Resource_Master_Process_Reaper");
        actorSystem.actorOf(Props.create(ProcessReaper.class, jobManager, LOG, ACTOR_DIED_EXIT_CODE), "JobManager_Process_Reaper");
    } catch (Throwable t) {
        // make sure that whatever happened ends up in the log
        LOG.error("Mesos JobManager initialization failed", t);
        if (webMonitor != null) {
            try {
                webMonitor.stop();
            } catch (Throwable ignored) {
                LOG.warn("Failed to stop the web frontend", ignored);
            }
        }
        if (artifactServer != null) {
            try {
                artifactServer.stop();
            } catch (Throwable ignored) {
                LOG.error("Failed to stop the artifact server", ignored);
            }
        }
        if (actorSystem != null) {
            try {
                actorSystem.shutdown();
            } catch (Throwable tt) {
                LOG.error("Error shutting down actor system", tt);
            }
        }
        if (futureExecutor != null) {
            try {
                futureExecutor.shutdownNow();
            } catch (Throwable tt) {
                LOG.error("Error shutting down future executor", tt);
            }
        }
        if (ioExecutor != null) {
            try {
                ioExecutor.shutdownNow();
            } catch (Throwable tt) {
                LOG.error("Error shutting down io executor", tt);
            }
        }
        if (mesosServices != null) {
            try {
                mesosServices.close(false);
            } catch (Throwable tt) {
                LOG.error("Error closing the mesos services.", tt);
            }
        }
        return INIT_ERROR_EXIT_CODE;
    }
    // everything started, we can wait until all is done or the process is killed
    LOG.info("Mesos JobManager started");
    // wait until everything is done
    actorSystem.awaitTermination();
    // if we get here, everything worked out all right, and we even exited smoothly
    if (webMonitor != null) {
        try {
            webMonitor.stop();
        } catch (Throwable t) {
            LOG.error("Failed to stop the web frontend", t);
        }
    }
    try {
        artifactServer.stop();
    } catch (Throwable t) {
        LOG.error("Failed to stop the artifact server", t);
    }
    org.apache.flink.runtime.concurrent.Executors.gracefulShutdown(AkkaUtils.getTimeout(config).toMillis(), TimeUnit.MILLISECONDS, futureExecutor, ioExecutor);
    try {
        mesosServices.close(true);
    } catch (Throwable t) {
        LOG.error("Failed to clean up and close MesosServices.", t);
    }
    return 0;
}
Also used : ActorSystem(akka.actor.ActorSystem) ScheduledExecutorService(java.util.concurrent.ScheduledExecutorService) InetAddress(java.net.InetAddress) Address(akka.actor.Address) MesosConfiguration(org.apache.flink.mesos.util.MesosConfiguration) Configuration(org.apache.flink.configuration.Configuration) GlobalConfiguration(org.apache.flink.configuration.GlobalConfiguration) ProcessReaper(org.apache.flink.runtime.process.ProcessReaper) ActorRef(akka.actor.ActorRef) ContainerSpecification(org.apache.flink.runtime.clusterframework.ContainerSpecification) Props(akka.actor.Props) URL(java.net.URL) ExecutorThreadFactory(org.apache.flink.runtime.util.ExecutorThreadFactory) MesosConfiguration(org.apache.flink.mesos.util.MesosConfiguration) MesosArtifactServer(org.apache.flink.mesos.util.MesosArtifactServer) LeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService) WebMonitor(org.apache.flink.runtime.webmonitor.WebMonitor) ScheduledExecutorService(java.util.concurrent.ScheduledExecutorService) ExecutorService(java.util.concurrent.ExecutorService) MesosWorkerStore(org.apache.flink.mesos.runtime.clusterframework.store.MesosWorkerStore) MesosServices(org.apache.flink.mesos.runtime.clusterframework.services.MesosServices)
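
A minimal sketch of the WebMonitor-specific step in this example: once the web frontend is up, its bound port is read with getServerPort() and turned into the URL that is advertised as the Mesos framework web UI. The buildWebUiUrl helper below is hypothetical and only mirrors the URL construction in runPrivileged above.

import java.net.MalformedURLException;
import java.net.URL;

import org.apache.flink.runtime.webmonitor.WebMonitor;

public final class WebUiUrlExample {

    /**
     * Hypothetical helper mirroring the URL construction in runPrivileged above:
     * build the externally visible web UI URL from a started WebMonitor.
     */
    static String buildWebUiUrl(WebMonitor webMonitor, String hostname) throws MalformedURLException {
        if (webMonitor == null) {
            // the web frontend is optional; callers must handle the "not configured" case
            return null;
        }
        // getServerPort() reports the port the web frontend actually bound to
        URL url = new URL("http", hostname, webMonitor.getServerPort(), "/");
        return url.toExternalForm();
    }

    private WebUiUrlExample() {
    }
}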

Example 2 with WebMonitor

use of org.apache.flink.runtime.webmonitor.WebMonitor in project flink by apache.

the class BootstrapTools method startWebMonitorIfConfigured.

/**
 * Starts the web frontend.
 *
 * @param config The Flink config.
 * @param actorSystem The ActorSystem to start the web frontend in.
 * @param jobManager The JobManager actor the web frontend connects to.
 * @param logger Logger for log output.
 * @return WebMonitor instance, or null if the web frontend is not configured.
 * @throws Exception if the web frontend could not be started.
 */
public static WebMonitor startWebMonitorIfConfigured(Configuration config, ActorSystem actorSystem, ActorRef jobManager, Logger logger) throws Exception {
    // this ensures correct values are present in the web frontend
    final Address address = AkkaUtils.getAddress(actorSystem);
    config.setString(ConfigConstants.JOB_MANAGER_IPC_ADDRESS_KEY, address.host().get());
    config.setString(ConfigConstants.JOB_MANAGER_IPC_PORT_KEY, address.port().get().toString());
    if (config.getInteger(ConfigConstants.JOB_MANAGER_WEB_PORT_KEY, 0) >= 0) {
        logger.info("Starting JobManager Web Frontend");
        LeaderRetrievalService leaderRetrievalService = LeaderRetrievalUtils.createLeaderRetrievalService(config, jobManager);
        // start the web frontend. we need to load this dynamically
        // because it is not in the same project/dependencies
        WebMonitor monitor = WebMonitorUtils.startWebRuntimeMonitor(config, leaderRetrievalService, actorSystem);
        // start the web monitor
        if (monitor != null) {
            String jobManagerAkkaURL = AkkaUtils.getAkkaURL(actorSystem, jobManager);
            monitor.start(jobManagerAkkaURL);
        }
        return monitor;
    } else {
        return null;
    }
}
Also used : Address(akka.actor.Address) LeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService) WebMonitor(org.apache.flink.runtime.webmonitor.WebMonitor)
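
As a usage sketch (not part of the Flink sources): a caller holds on to the returned WebMonitor, treats null as "web frontend not configured", and stops it null-safely on shutdown, which is the pattern the application master runners in the other examples follow. It is assumed here that BootstrapTools lives in org.apache.flink.runtime.clusterframework, as in this Flink version.

import akka.actor.ActorRef;
import akka.actor.ActorSystem;

import org.apache.flink.configuration.Configuration;
import org.apache.flink.runtime.clusterframework.BootstrapTools;
import org.apache.flink.runtime.webmonitor.WebMonitor;

import org.slf4j.Logger;

public final class WebMonitorLifecycleExample {

    /**
     * Starts the web frontend if configured and logs its port;
     * a null return means no web frontend was requested.
     */
    static WebMonitor startOptionalWebMonitor(
            Configuration config, ActorSystem actorSystem, ActorRef jobManager, Logger log) throws Exception {
        WebMonitor webMonitor = BootstrapTools.startWebMonitorIfConfigured(config, actorSystem, jobManager, log);
        if (webMonitor != null) {
            log.info("Web frontend listening on port {}", webMonitor.getServerPort());
        }
        return webMonitor;
    }

    /** Null-safe shutdown, matching the cleanup blocks used by the runners above. */
    static void stopQuietly(WebMonitor webMonitor, Logger log) {
        if (webMonitor != null) {
            try {
                webMonitor.stop();
            } catch (Throwable t) {
                log.error("Failed to stop the web frontend", t);
            }
        }
    }

    private WebMonitorLifecycleExample() {
    }
}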

Example 3 with WebMonitor

use of org.apache.flink.runtime.webmonitor.WebMonitor in project flink by apache.

the class YarnApplicationMasterRunner method runApplicationMaster.

// ------------------------------------------------------------------------
//  Core work method
// ------------------------------------------------------------------------
/**
 * The main work method, must run as a privileged action.
 *
 * @return The return code for the Java process.
 */
protected int runApplicationMaster(Configuration config) {
    ActorSystem actorSystem = null;
    WebMonitor webMonitor = null;
    int numberProcessors = Hardware.getNumberCPUCores();
    final ScheduledExecutorService futureExecutor = Executors.newScheduledThreadPool(numberProcessors, new ExecutorThreadFactory("yarn-jobmanager-future"));
    final ExecutorService ioExecutor = Executors.newFixedThreadPool(numberProcessors, new ExecutorThreadFactory("yarn-jobmanager-io"));
    try {
        // ------- (1) load and parse / validate all configurations -------
        // loading all config values here has the advantage that the program fails fast, if any
        // configuration problem occurs
        final String currDir = ENV.get(Environment.PWD.key());
        require(currDir != null, "Current working directory variable (%s) not set", Environment.PWD.key());
        // Note that we use the "appMasterHostname" given by YARN here, to make sure
        // we use the hostnames given by YARN consistently throughout akka.
        // for akka "localhost" and "localhost.localdomain" are different actors.
        final String appMasterHostname = ENV.get(Environment.NM_HOST.key());
        require(appMasterHostname != null, "ApplicationMaster hostname variable %s not set", Environment.NM_HOST.key());
        LOG.info("YARN assigned hostname for application master: {}", appMasterHostname);
        //Update keytab and principal path to reflect YARN container path location
        final String remoteKeytabPath = ENV.get(YarnConfigKeys.KEYTAB_PATH);
        final String remoteKeytabPrincipal = ENV.get(YarnConfigKeys.KEYTAB_PRINCIPAL);
        String keytabPath = null;
        if (remoteKeytabPath != null) {
            File f = new File(currDir, Utils.KEYTAB_FILE_NAME);
            keytabPath = f.getAbsolutePath();
            LOG.info("keytabPath: {}", keytabPath);
        }
        if (keytabPath != null && remoteKeytabPrincipal != null) {
            config.setString(SecurityOptions.KERBEROS_LOGIN_KEYTAB, keytabPath);
            config.setString(SecurityOptions.KERBEROS_LOGIN_PRINCIPAL, remoteKeytabPrincipal);
        }
        // Hadoop/Yarn configuration (loads config data automatically from classpath files)
        final YarnConfiguration yarnConfig = new YarnConfiguration();
        final int taskManagerContainerMemory;
        final int numInitialTaskManagers;
        final int slotsPerTaskManager;
        try {
            taskManagerContainerMemory = Integer.parseInt(ENV.get(YarnConfigKeys.ENV_TM_MEMORY));
        } catch (NumberFormatException e) {
            throw new RuntimeException("Invalid value for " + YarnConfigKeys.ENV_TM_MEMORY + " : " + e.getMessage());
        }
        try {
            numInitialTaskManagers = Integer.parseInt(ENV.get(YarnConfigKeys.ENV_TM_COUNT));
        } catch (NumberFormatException e) {
            throw new RuntimeException("Invalid value for " + YarnConfigKeys.ENV_TM_COUNT + " : " + e.getMessage());
        }
        try {
            slotsPerTaskManager = Integer.parseInt(ENV.get(YarnConfigKeys.ENV_SLOTS));
        } catch (NumberFormatException e) {
            throw new RuntimeException("Invalid value for " + YarnConfigKeys.ENV_SLOTS + " : " + e.getMessage());
        }
        final ContaineredTaskManagerParameters taskManagerParameters = ContaineredTaskManagerParameters.create(config, taskManagerContainerMemory, slotsPerTaskManager);
        LOG.info("TaskManagers will be created with {} task slots", taskManagerParameters.numSlots());
        LOG.info("TaskManagers will be started with container size {} MB, JVM heap size {} MB, " + "JVM direct memory limit {} MB", taskManagerParameters.taskManagerTotalMemoryMB(), taskManagerParameters.taskManagerHeapSizeMB(), taskManagerParameters.taskManagerDirectMemoryLimitMB());
        // ----------------- (2) start the actor system -------------------
        // try to start the JobManager actor system
        // using the port range definition from the config.
        final String amPortRange = config.getString(ConfigConstants.YARN_APPLICATION_MASTER_PORT, ConfigConstants.DEFAULT_YARN_JOB_MANAGER_PORT);
        actorSystem = BootstrapTools.startActorSystem(config, appMasterHostname, amPortRange, LOG);
        final String akkaHostname = AkkaUtils.getAddress(actorSystem).host().get();
        final int akkaPort = (Integer) AkkaUtils.getAddress(actorSystem).port().get();
        LOG.info("Actor system bound to hostname {}.", akkaHostname);
        // ---- (3) Generate the configuration for the TaskManagers
        final Configuration taskManagerConfig = BootstrapTools.generateTaskManagerConfiguration(config, akkaHostname, akkaPort, slotsPerTaskManager, TASKMANAGER_REGISTRATION_TIMEOUT);
        LOG.debug("TaskManager configuration: {}", taskManagerConfig);
        final ContainerLaunchContext taskManagerContext = Utils.createTaskExecutorContext(config, yarnConfig, ENV, taskManagerParameters, taskManagerConfig, currDir, getTaskManagerClass(), LOG);
        // ---- (4) start the actors and components in this order:
        // 1) JobManager & Archive (in non-HA case, the leader service takes this)
        // 2) Web Monitor (we need its port to register)
        // 3) Resource Master for YARN
        // 4) Process reapers for the JobManager and Resource Master
        // 1: the JobManager
        LOG.debug("Starting JobManager actor");
        // we start the JobManager with its standard name
        ActorRef jobManager = JobManager.startJobManagerActors(config, actorSystem, futureExecutor, ioExecutor, new Some<>(JobManager.JOB_MANAGER_NAME()), Option.<String>empty(), getJobManagerClass(), getArchivistClass())._1();
        // 2: the web monitor
        LOG.debug("Starting Web Frontend");
        webMonitor = BootstrapTools.startWebMonitorIfConfigured(config, actorSystem, jobManager, LOG);
        String protocol = "http://";
        if (config.getBoolean(ConfigConstants.JOB_MANAGER_WEB_SSL_ENABLED, ConfigConstants.DEFAULT_JOB_MANAGER_WEB_SSL_ENABLED) && SSLUtils.getSSLEnabled(config)) {
            protocol = "https://";
        }
        final String webMonitorURL = webMonitor == null ? null : protocol + appMasterHostname + ":" + webMonitor.getServerPort();
        // 3: Flink's Yarn ResourceManager
        LOG.debug("Starting YARN Flink Resource Manager");
        // we need the leader retrieval service here to be informed of new leaders and session IDs
        LeaderRetrievalService leaderRetriever = LeaderRetrievalUtils.createLeaderRetrievalService(config, jobManager);
        Props resourceMasterProps = YarnFlinkResourceManager.createActorProps(getResourceManagerClass(), config, yarnConfig, leaderRetriever, appMasterHostname, webMonitorURL, taskManagerParameters, taskManagerContext, numInitialTaskManagers, LOG);
        ActorRef resourceMaster = actorSystem.actorOf(resourceMasterProps);
        // 4: Process reapers
        // The process reapers ensure that upon unexpected actor death, the process exits
        // and does not stay lingering around unresponsive
        LOG.debug("Starting process reapers for JobManager and YARN Application Master");
        actorSystem.actorOf(Props.create(ProcessReaper.class, resourceMaster, LOG, ACTOR_DIED_EXIT_CODE), "YARN_Resource_Master_Process_Reaper");
        actorSystem.actorOf(Props.create(ProcessReaper.class, jobManager, LOG, ACTOR_DIED_EXIT_CODE), "JobManager_Process_Reaper");
    } catch (Throwable t) {
        // make sure that whatever happened ends up in the log
        LOG.error("YARN Application Master initialization failed", t);
        if (webMonitor != null) {
            try {
                webMonitor.stop();
            } catch (Throwable ignored) {
                LOG.warn("Failed to stop the web frontend", t);
            }
        }
        if (actorSystem != null) {
            try {
                actorSystem.shutdown();
            } catch (Throwable tt) {
                LOG.error("Error shutting down actor system", tt);
            }
        }
        futureExecutor.shutdownNow();
        ioExecutor.shutdownNow();
        return INIT_ERROR_EXIT_CODE;
    }
    // everything started, we can wait until all is done or the process is killed
    LOG.info("YARN Application Master started");
    // wait until everything is done
    actorSystem.awaitTermination();
    // if we get here, everything worked out all right, and we even exited smoothly
    if (webMonitor != null) {
        try {
            webMonitor.stop();
        } catch (Throwable t) {
            LOG.error("Failed to stop the web frontend", t);
        }
    }
    org.apache.flink.runtime.concurrent.Executors.gracefulShutdown(AkkaUtils.getTimeout(config).toMillis(), TimeUnit.MILLISECONDS, futureExecutor, ioExecutor);
    return 0;
}
Also used : ActorSystem(akka.actor.ActorSystem) ScheduledExecutorService(java.util.concurrent.ScheduledExecutorService) YarnConfiguration(org.apache.hadoop.yarn.conf.YarnConfiguration) Configuration(org.apache.flink.configuration.Configuration) GlobalConfiguration(org.apache.flink.configuration.GlobalConfiguration) ProcessReaper(org.apache.flink.runtime.process.ProcessReaper) ActorRef(akka.actor.ActorRef) ContaineredTaskManagerParameters(org.apache.flink.runtime.clusterframework.ContaineredTaskManagerParameters) ContainerLaunchContext(org.apache.hadoop.yarn.api.records.ContainerLaunchContext) Props(akka.actor.Props) ExecutorThreadFactory(org.apache.flink.runtime.util.ExecutorThreadFactory) Some(scala.Some) YarnConfiguration(org.apache.hadoop.yarn.conf.YarnConfiguration) LeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService) WebMonitor(org.apache.flink.runtime.webmonitor.WebMonitor) ScheduledExecutorService(java.util.concurrent.ScheduledExecutorService) ExecutorService(java.util.concurrent.ExecutorService) File(java.io.File)
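
The YARN-specific twist on the WebMonitor here is that the advertised URL honors the SSL switch. Below is a minimal sketch of that protocol selection, assuming SSLUtils lives in org.apache.flink.runtime.net as in this Flink version; the buildMonitorUrl helper name is hypothetical.

import org.apache.flink.configuration.ConfigConstants;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.runtime.net.SSLUtils;
import org.apache.flink.runtime.webmonitor.WebMonitor;

public final class MonitorUrlExample {

    /**
     * Hypothetical helper mirroring the URL construction in runApplicationMaster above:
     * use https only when the web SSL switch and global SSL support are both enabled.
     */
    static String buildMonitorUrl(Configuration config, WebMonitor webMonitor, String hostname) {
        if (webMonitor == null) {
            // no web frontend configured, nothing to advertise
            return null;
        }
        String protocol = "http://";
        if (config.getBoolean(ConfigConstants.JOB_MANAGER_WEB_SSL_ENABLED,
                ConfigConstants.DEFAULT_JOB_MANAGER_WEB_SSL_ENABLED)
                && SSLUtils.getSSLEnabled(config)) {
            protocol = "https://";
        }
        return protocol + hostname + ":" + webMonitor.getServerPort();
    }

    private MonitorUrlExample() {
    }
}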

Aggregations

LeaderRetrievalService (org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService): 3
WebMonitor (org.apache.flink.runtime.webmonitor.WebMonitor): 3
ActorRef (akka.actor.ActorRef): 2
ActorSystem (akka.actor.ActorSystem): 2
Address (akka.actor.Address): 2
Props (akka.actor.Props): 2
ExecutorService (java.util.concurrent.ExecutorService): 2
ScheduledExecutorService (java.util.concurrent.ScheduledExecutorService): 2
Configuration (org.apache.flink.configuration.Configuration): 2
GlobalConfiguration (org.apache.flink.configuration.GlobalConfiguration): 2
ProcessReaper (org.apache.flink.runtime.process.ProcessReaper): 2
ExecutorThreadFactory (org.apache.flink.runtime.util.ExecutorThreadFactory): 2
File (java.io.File): 1
InetAddress (java.net.InetAddress): 1
URL (java.net.URL): 1
MesosServices (org.apache.flink.mesos.runtime.clusterframework.services.MesosServices): 1
MesosWorkerStore (org.apache.flink.mesos.runtime.clusterframework.store.MesosWorkerStore): 1
MesosArtifactServer (org.apache.flink.mesos.util.MesosArtifactServer): 1
MesosConfiguration (org.apache.flink.mesos.util.MesosConfiguration): 1
ContainerSpecification (org.apache.flink.runtime.clusterframework.ContainerSpecification): 1