Use of org.apache.flink.mesos.util.MesosConfiguration in project flink by Apache.
Class MesosApplicationMasterRunner, method runPrivileged.
// ------------------------------------------------------------------------
// Core work method
// ------------------------------------------------------------------------
/**
 * The main work method, must run as a privileged action.
 *
 * <p>Loads and validates the configuration, starts the actor system and the artifact
 * server, builds the TaskManager container specification, starts the JobManager, the
 * (optional) web monitor, the Mesos resource manager and the process reapers, and then
 * blocks until the actor system terminates.
 *
 * @param config The Flink configuration used for all components started here.
 * @param dynamicProperties The dynamic properties of this application master; they are
 *                          added to the dynamic configuration of the TaskManager container
 *                          specification.
 * @return The return code for the Java process: {@code INIT_ERROR_EXIT_CODE} if anything
 *         fails during startup, {@code 0} once the actor system has terminated.
 */
protected int runPrivileged(Configuration config, Configuration dynamicProperties) {
    // everything that has to be released again, both on a failed startup and on regular exit
    ActorSystem actorSystem = null;
    WebMonitor webMonitor = null;
    MesosArtifactServer artifactServer = null;
    ScheduledExecutorService futureExecutor = null;
    ExecutorService ioExecutor = null;
    MesosServices mesosServices = null;

    try {
        // ------- (1) load and parse / validate all configurations -------

        // Note that we use the "appMasterHostname" given by the system, to make sure
        // we use the hostnames consistently throughout akka.
        // for akka "localhost" and "localhost.localdomain" are different actors.
        final String appMasterHostname = InetAddress.getLocalHost().getHostName();

        // Mesos configuration
        final MesosConfiguration mesosConfig = createMesosConfig(config, appMasterHostname);

        // JM configuration
        int numberProcessors = Hardware.getNumberCPUCores();

        futureExecutor = Executors.newScheduledThreadPool(
            numberProcessors,
            new ExecutorThreadFactory("mesos-jobmanager-future"));

        ioExecutor = Executors.newFixedThreadPool(
            numberProcessors,
            new ExecutorThreadFactory("mesos-jobmanager-io"));

        mesosServices = MesosServicesUtils.createMesosServices(config);

        // TM configuration
        final MesosTaskManagerParameters taskManagerParameters = MesosTaskManagerParameters.create(config);

        LOG.info("TaskManagers will be created with {} task slots",
            taskManagerParameters.containeredParameters().numSlots());
        LOG.info("TaskManagers will be started with container size {} MB, JVM heap size {} MB, " + "JVM direct memory limit {} MB, {} cpus",
            taskManagerParameters.containeredParameters().taskManagerTotalMemoryMB(),
            taskManagerParameters.containeredParameters().taskManagerHeapSizeMB(),
            taskManagerParameters.containeredParameters().taskManagerDirectMemoryLimitMB(),
            taskManagerParameters.cpus());

        // JM endpoint, which should be explicitly configured based on acquired net resources
        final int listeningPort = config.getInteger(
            ConfigConstants.JOB_MANAGER_IPC_PORT_KEY,
            ConfigConstants.DEFAULT_JOB_MANAGER_IPC_PORT);
        // The highest valid port number is 65535. A failed check is handled by the
        // catch block below like any other initialization failure.
        checkState(listeningPort >= 0 && listeningPort <= 65535,
            "Config parameter \"" + ConfigConstants.JOB_MANAGER_IPC_PORT_KEY
                + "\" is invalid, it must be between 0 and 65535");

        // ----------------- (2) start the actor system -------------------

        // try to start the actor system, JobManager and JobManager actor system
        // using the configured address and ports
        actorSystem = BootstrapTools.startActorSystem(config, appMasterHostname, listeningPort, LOG);

        // read back the address the actor system is actually bound to
        Address address = AkkaUtils.getAddress(actorSystem);
        final String akkaHostname = address.host().get();
        final int akkaPort = (Integer) address.port().get();

        LOG.info("Actor system bound to hostname {}.", akkaHostname);

        // try to start the artifact server
        LOG.debug("Starting Artifact Server");
        final int artifactServerPort = config.getInteger(
            ConfigConstants.MESOS_ARTIFACT_SERVER_PORT_KEY,
            ConfigConstants.DEFAULT_MESOS_ARTIFACT_SERVER_PORT);
        final String artifactServerPrefix = UUID.randomUUID().toString();
        artifactServer = new MesosArtifactServer(artifactServerPrefix, akkaHostname, artifactServerPort, config);

        // ----------------- (3) Generate the configuration for the TaskManagers -------------------

        // generate a container spec which conveys the artifacts/vars needed to launch a TM
        ContainerSpecification taskManagerContainerSpec = new ContainerSpecification();

        // propagate the AM dynamic configuration to the TM
        taskManagerContainerSpec.getDynamicConfiguration().addAll(dynamicProperties);

        // propagate newly-generated configuration elements
        final Configuration taskManagerConfig = BootstrapTools.generateTaskManagerConfiguration(
            new Configuration(),
            akkaHostname,
            akkaPort,
            taskManagerParameters.containeredParameters().numSlots(),
            TASKMANAGER_REGISTRATION_TIMEOUT);
        taskManagerContainerSpec.getDynamicConfiguration().addAll(taskManagerConfig);

        // apply the overlays
        applyOverlays(config, taskManagerContainerSpec);

        // configure the artifact server to serve the specified artifacts
        configureArtifactServer(artifactServer, taskManagerContainerSpec);

        // ----------------- (4) start the actors -------------------

        // 1) JobManager & Archive (in non-HA case, the leader service takes this)
        // 2) Web Monitor (we need its port to register)
        // 3) Resource Master for Mesos
        // 4) Process reapers for the JobManager and Resource Master

        // 1: the JobManager
        LOG.debug("Starting JobManager actor");

        // we start the JobManager with its standard name
        ActorRef jobManager = JobManager.startJobManagerActors(
            config,
            actorSystem,
            futureExecutor,
            ioExecutor,
            new scala.Some<>(JobManager.JOB_MANAGER_NAME()),
            scala.Option.<String>empty(),
            getJobManagerClass(),
            getArchivistClass())._1();

        // 2: the web monitor
        LOG.debug("Starting Web Frontend");

        webMonitor = BootstrapTools.startWebMonitorIfConfigured(config, actorSystem, jobManager, LOG);
        if (webMonitor != null) {
            // publish the web frontend's URL through the Mesos framework info
            final URL webMonitorURL = new URL("http", appMasterHostname, webMonitor.getServerPort(), "/");
            mesosConfig.frameworkInfo().setWebuiUrl(webMonitorURL.toExternalForm());
        }

        // 3: Flink's Mesos ResourceManager
        LOG.debug("Starting Mesos Flink Resource Manager");

        // create the worker store to persist task information across restarts
        MesosWorkerStore workerStore = mesosServices.createMesosWorkerStore(config, ioExecutor);

        // we need the leader retrieval service here to be informed of new
        // leader session IDs, even though there can be only one leader ever
        LeaderRetrievalService leaderRetriever = LeaderRetrievalUtils.createLeaderRetrievalService(config, jobManager);

        Props resourceMasterProps = MesosFlinkResourceManager.createActorProps(
            getResourceManagerClass(),
            config,
            mesosConfig,
            workerStore,
            leaderRetriever,
            taskManagerParameters,
            taskManagerContainerSpec,
            artifactServer,
            LOG);

        ActorRef resourceMaster = actorSystem.actorOf(resourceMasterProps, "Mesos_Resource_Master");

        // 4: Process reapers
        // The process reapers ensure that upon unexpected actor death, the process exits
        // and does not stay lingering around unresponsive
        LOG.debug("Starting process reapers for JobManager");

        actorSystem.actorOf(
            Props.create(ProcessReaper.class, resourceMaster, LOG, ACTOR_DIED_EXIT_CODE),
            "Mesos_Resource_Master_Process_Reaper");

        actorSystem.actorOf(
            Props.create(ProcessReaper.class, jobManager, LOG, ACTOR_DIED_EXIT_CODE),
            "JobManager_Process_Reaper");
    } catch (Throwable t) {
        // make sure that everything whatever ends up in the log
        LOG.error("Mesos JobManager initialization failed", t);

        // Release whatever was started before the failure. Each step is guarded
        // separately so that one failing shutdown does not prevent the others.
        if (webMonitor != null) {
            try {
                webMonitor.stop();
            } catch (Throwable tt) {
                LOG.warn("Failed to stop the web frontend", tt);
            }
        }

        if (artifactServer != null) {
            try {
                artifactServer.stop();
            } catch (Throwable tt) {
                LOG.error("Failed to stop the artifact server", tt);
            }
        }

        if (actorSystem != null) {
            try {
                actorSystem.shutdown();
            } catch (Throwable tt) {
                LOG.error("Error shutting down actor system", tt);
            }
        }

        if (futureExecutor != null) {
            try {
                futureExecutor.shutdownNow();
            } catch (Throwable tt) {
                LOG.error("Error shutting down future executor", tt);
            }
        }

        if (ioExecutor != null) {
            try {
                ioExecutor.shutdownNow();
            } catch (Throwable tt) {
                LOG.error("Error shutting down io executor", tt);
            }
        }

        if (mesosServices != null) {
            try {
                mesosServices.close(false);
            } catch (Throwable tt) {
                LOG.error("Error closing the mesos services.", tt);
            }
        }

        return INIT_ERROR_EXIT_CODE;
    }

    // everything started, we can wait until all is done or the process is killed
    LOG.info("Mesos JobManager started");

    // wait until everything is done
    actorSystem.awaitTermination();

    // if we get here, everything work out jolly all right, and we even exited smoothly
    if (webMonitor != null) {
        try {
            webMonitor.stop();
        } catch (Throwable t) {
            LOG.error("Failed to stop the web frontend", t);
        }
    }

    // the artifact server, executors and Mesos services were all created in the try block above
    try {
        artifactServer.stop();
    } catch (Throwable t) {
        LOG.error("Failed to stop the artifact server", t);
    }

    org.apache.flink.runtime.concurrent.Executors.gracefulShutdown(
        AkkaUtils.getTimeout(config).toMillis(),
        TimeUnit.MILLISECONDS,
        futureExecutor,
        ioExecutor);

    try {
        mesosServices.close(true);
    } catch (Throwable t) {
        LOG.error("Failed to clean up and close MesosServices.", t);
    }

    return 0;
}
Use of org.apache.flink.mesos.util.MesosConfiguration in project flink by Apache.
Class MesosApplicationMasterRunner, method createMesosConfig.
/**
 * Builds the {@link MesosConfiguration} for the ResourceManager from the given Flink
 * configuration.
 *
 * @param flinkConfig The Flink configuration to read the Mesos settings from.
 * @param hostname The hostname to set on the Mesos framework info.
 * @return The Mesos configuration holding the master URL, the framework info and the
 *         optional credential.
 * @throws IllegalConfigurationException If the Mesos master URL is not configured.
 */
public static MesosConfiguration createMesosConfig(Configuration flinkConfig, String hostname) {
    final Protos.FrameworkInfo.Builder frameworkInfo = Protos.FrameworkInfo.newBuilder()
        .setHostname(hostname);

    // the master URL is the only mandatory setting
    if (!flinkConfig.containsKey(ConfigConstants.MESOS_MASTER_URL)) {
        throw new IllegalConfigurationException(ConfigConstants.MESOS_MASTER_URL + " must be configured.");
    }
    final String masterUrl = flinkConfig.getString(ConfigConstants.MESOS_MASTER_URL, null);

    // the failover timeout is configured in seconds
    final int failoverTimeoutSeconds = flinkConfig.getInteger(
        ConfigConstants.MESOS_FAILOVER_TIMEOUT_SECONDS,
        ConfigConstants.DEFAULT_MESOS_FAILOVER_TIMEOUT_SECS);
    frameworkInfo.setFailoverTimeout(
        FiniteDuration.apply(failoverTimeoutSeconds, TimeUnit.SECONDS).toSeconds());

    // framework identity, each falling back to its default
    final String frameworkName = flinkConfig.getString(
        ConfigConstants.MESOS_RESOURCEMANAGER_FRAMEWORK_NAME,
        ConfigConstants.DEFAULT_MESOS_RESOURCEMANAGER_FRAMEWORK_NAME);
    final String frameworkRole = flinkConfig.getString(
        ConfigConstants.MESOS_RESOURCEMANAGER_FRAMEWORK_ROLE,
        ConfigConstants.DEFAULT_MESOS_RESOURCEMANAGER_FRAMEWORK_ROLE);
    final String frameworkUser = flinkConfig.getString(
        ConfigConstants.MESOS_RESOURCEMANAGER_FRAMEWORK_USER,
        ConfigConstants.DEFAULT_MESOS_RESOURCEMANAGER_FRAMEWORK_USER);
    frameworkInfo.setName(frameworkName);
    frameworkInfo.setRole(frameworkRole);
    frameworkInfo.setUser(frameworkUser);

    // a credential is only created when a principal is configured
    Protos.Credential.Builder credential = null;
    if (flinkConfig.containsKey(ConfigConstants.MESOS_RESOURCEMANAGER_FRAMEWORK_PRINCIPAL)) {
        frameworkInfo.setPrincipal(
            flinkConfig.getString(ConfigConstants.MESOS_RESOURCEMANAGER_FRAMEWORK_PRINCIPAL, null));

        credential = Protos.Credential.newBuilder();
        credential.setPrincipal(frameworkInfo.getPrincipal());

        // the secret is optional: when it is not configured, the credential
        // carries the principal only
        if (flinkConfig.containsKey(ConfigConstants.MESOS_RESOURCEMANAGER_FRAMEWORK_SECRET)) {
            credential.setSecret(
                flinkConfig.getString(ConfigConstants.MESOS_RESOURCEMANAGER_FRAMEWORK_SECRET, null));
        }
    }

    // Option.apply turns a null credential into an empty option
    return new MesosConfiguration(masterUrl, frameworkInfo, scala.Option.apply(credential));
}
Use of org.apache.flink.mesos.util.MesosConfiguration in project flink by Apache.
Class MesosFlinkResourceManager, method initialize.
// ------------------------------------------------------------------------
// Mesos-specific behavior
// ------------------------------------------------------------------------
/**
 * Initializes the Mesos resource master: starts the worker store, creates the scheduler
 * driver (re-using a stored framework ID if there is one), creates the supporting
 * actors, recovers the workers and finally starts the connection monitor and the driver.
 *
 * @throws Exception If any of the initialization steps fails.
 */
@Override
protected void initialize() throws Exception {
    LOG.info("Initializing Mesos resource master");

    // the worker store is started before the framework ID is read from it
    workerStore.start();

    // the callback handler is given to the scheduler driver created below
    schedulerCallbackHandler = new SchedulerProxy(self());

    // work on a copy of the configured framework info, with checkpointing enabled
    final FrameworkInfo.Builder registrationInfo =
        mesosConfig.frameworkInfo().clone().setCheckpoint(true);

    // a stored framework ID means this is a re-registration
    final Option<Protos.FrameworkID> storedFrameworkId = workerStore.getFrameworkID();
    if (storedFrameworkId.isDefined()) {
        LOG.info("Recovery scenario: re-registering using framework ID {}.", storedFrameworkId.get().getValue());
        registrationInfo.setId(storedFrameworkId.get());
    } else {
        LOG.info("Registering as new framework.");
    }

    final MesosConfiguration effectiveMesosConfig = mesosConfig.withFrameworkInfo(registrationInfo);
    MesosConfiguration.logMesosConfig(LOG, effectiveMesosConfig);
    schedulerDriver = effectiveMesosConfig.createDriver(schedulerCallbackHandler, false);

    // create supporting actors
    connectionMonitor = createConnectionMonitor();
    launchCoordinator = createLaunchCoordinator();
    reconciliationCoordinator = createReconciliationCoordinator();
    taskRouter = createTaskRouter();

    recoverWorkers();

    // the connection monitor is told to start before the driver is started
    connectionMonitor.tell(new ConnectionMonitor.Start(), self());
    schedulerDriver.start();
}
Aggregations