use of org.apache.flink.runtime.util.ExecutorThreadFactory in project flink by apache.
the class MesosApplicationMasterRunner method runPrivileged.
// ------------------------------------------------------------------------
// Core work method
// ------------------------------------------------------------------------
/**
* The main work method, must run as a privileged action.
*
* @return The return code for the Java process.
*/
protected int runPrivileged(Configuration config, Configuration dynamicProperties) {
ActorSystem actorSystem = null;
WebMonitor webMonitor = null;
MesosArtifactServer artifactServer = null;
ScheduledExecutorService futureExecutor = null;
ExecutorService ioExecutor = null;
MesosServices mesosServices = null;
try {
// ------- (1) load and parse / validate all configurations -------
// Note that we use the "appMasterHostname" given by the system, to make sure
// we use the hostnames consistently throughout akka.
// for akka "localhost" and "localhost.localdomain" are different actors.
final String appMasterHostname = InetAddress.getLocalHost().getHostName();
// Mesos configuration
final MesosConfiguration mesosConfig = createMesosConfig(config, appMasterHostname);
// JM configuration
int numberProcessors = Hardware.getNumberCPUCores();
futureExecutor = Executors.newScheduledThreadPool(numberProcessors, new ExecutorThreadFactory("mesos-jobmanager-future"));
ioExecutor = Executors.newFixedThreadPool(numberProcessors, new ExecutorThreadFactory("mesos-jobmanager-io"));
mesosServices = MesosServicesUtils.createMesosServices(config);
// TM configuration
final MesosTaskManagerParameters taskManagerParameters = MesosTaskManagerParameters.create(config);
LOG.info("TaskManagers will be created with {} task slots", taskManagerParameters.containeredParameters().numSlots());
LOG.info("TaskManagers will be started with container size {} MB, JVM heap size {} MB, " + "JVM direct memory limit {} MB, {} cpus", taskManagerParameters.containeredParameters().taskManagerTotalMemoryMB(), taskManagerParameters.containeredParameters().taskManagerHeapSizeMB(), taskManagerParameters.containeredParameters().taskManagerDirectMemoryLimitMB(), taskManagerParameters.cpus());
// JM endpoint, which should be explicitly configured based on acquired net resources
final int listeningPort = config.getInteger(ConfigConstants.JOB_MANAGER_IPC_PORT_KEY, ConfigConstants.DEFAULT_JOB_MANAGER_IPC_PORT);
checkState(listeningPort >= 0 && listeningPort <= 65536, "Config parameter \"" + ConfigConstants.JOB_MANAGER_IPC_PORT_KEY + "\" is invalid, it must be between 0 and 65536");
// ----------------- (2) start the actor system -------------------
// try to start the actor system, JobManager and JobManager actor system
// using the configured address and ports
actorSystem = BootstrapTools.startActorSystem(config, appMasterHostname, listeningPort, LOG);
Address address = AkkaUtils.getAddress(actorSystem);
final String akkaHostname = address.host().get();
final int akkaPort = (Integer) address.port().get();
LOG.info("Actor system bound to hostname {}.", akkaHostname);
// try to start the artifact server
LOG.debug("Starting Artifact Server");
final int artifactServerPort = config.getInteger(ConfigConstants.MESOS_ARTIFACT_SERVER_PORT_KEY, ConfigConstants.DEFAULT_MESOS_ARTIFACT_SERVER_PORT);
final String artifactServerPrefix = UUID.randomUUID().toString();
artifactServer = new MesosArtifactServer(artifactServerPrefix, akkaHostname, artifactServerPort, config);
// ----------------- (3) Generate the configuration for the TaskManagers -------------------
// generate a container spec which conveys the artifacts/vars needed to launch a TM
ContainerSpecification taskManagerContainerSpec = new ContainerSpecification();
// propagate the AM dynamic configuration to the TM
taskManagerContainerSpec.getDynamicConfiguration().addAll(dynamicProperties);
// propagate newly-generated configuration elements
final Configuration taskManagerConfig = BootstrapTools.generateTaskManagerConfiguration(new Configuration(), akkaHostname, akkaPort, taskManagerParameters.containeredParameters().numSlots(), TASKMANAGER_REGISTRATION_TIMEOUT);
taskManagerContainerSpec.getDynamicConfiguration().addAll(taskManagerConfig);
// apply the overlays
applyOverlays(config, taskManagerContainerSpec);
// configure the artifact server to serve the specified artifacts
configureArtifactServer(artifactServer, taskManagerContainerSpec);
// ----------------- (4) start the actors -------------------
// 1) JobManager & Archive (in non-HA case, the leader service takes this)
// 2) Web Monitor (we need its port to register)
// 3) Resource Master for Mesos
// 4) Process reapers for the JobManager and Resource Master
// 1: the JobManager
LOG.debug("Starting JobManager actor");
// we start the JobManager with its standard name
ActorRef jobManager = JobManager.startJobManagerActors(config, actorSystem, futureExecutor, ioExecutor, new scala.Some<>(JobManager.JOB_MANAGER_NAME()), scala.Option.<String>empty(), getJobManagerClass(), getArchivistClass())._1();
// 2: the web monitor
LOG.debug("Starting Web Frontend");
webMonitor = BootstrapTools.startWebMonitorIfConfigured(config, actorSystem, jobManager, LOG);
if (webMonitor != null) {
final URL webMonitorURL = new URL("http", appMasterHostname, webMonitor.getServerPort(), "/");
mesosConfig.frameworkInfo().setWebuiUrl(webMonitorURL.toExternalForm());
}
// 3: Flink's Mesos ResourceManager
LOG.debug("Starting Mesos Flink Resource Manager");
// create the worker store to persist task information across restarts
MesosWorkerStore workerStore = mesosServices.createMesosWorkerStore(config, ioExecutor);
// we need the leader retrieval service here to be informed of new
// leader session IDs, even though there can be only one leader ever
LeaderRetrievalService leaderRetriever = LeaderRetrievalUtils.createLeaderRetrievalService(config, jobManager);
Props resourceMasterProps = MesosFlinkResourceManager.createActorProps(getResourceManagerClass(), config, mesosConfig, workerStore, leaderRetriever, taskManagerParameters, taskManagerContainerSpec, artifactServer, LOG);
ActorRef resourceMaster = actorSystem.actorOf(resourceMasterProps, "Mesos_Resource_Master");
// 4: Process reapers
// The process reapers ensure that upon unexpected actor death, the process exits
// and does not stay lingering around unresponsive
LOG.debug("Starting process reapers for JobManager");
actorSystem.actorOf(Props.create(ProcessReaper.class, resourceMaster, LOG, ACTOR_DIED_EXIT_CODE), "Mesos_Resource_Master_Process_Reaper");
actorSystem.actorOf(Props.create(ProcessReaper.class, jobManager, LOG, ACTOR_DIED_EXIT_CODE), "JobManager_Process_Reaper");
} catch (Throwable t) {
// make sure that everything whatever ends up in the log
LOG.error("Mesos JobManager initialization failed", t);
if (webMonitor != null) {
try {
webMonitor.stop();
} catch (Throwable ignored) {
LOG.warn("Failed to stop the web frontend", ignored);
}
}
if (artifactServer != null) {
try {
artifactServer.stop();
} catch (Throwable ignored) {
LOG.error("Failed to stop the artifact server", ignored);
}
}
if (actorSystem != null) {
try {
actorSystem.shutdown();
} catch (Throwable tt) {
LOG.error("Error shutting down actor system", tt);
}
}
if (futureExecutor != null) {
try {
futureExecutor.shutdownNow();
} catch (Throwable tt) {
LOG.error("Error shutting down future executor", tt);
}
}
if (ioExecutor != null) {
try {
ioExecutor.shutdownNow();
} catch (Throwable tt) {
LOG.error("Error shutting down io executor", tt);
}
}
if (mesosServices != null) {
try {
mesosServices.close(false);
} catch (Throwable tt) {
LOG.error("Error closing the mesos services.", tt);
}
}
return INIT_ERROR_EXIT_CODE;
}
// everything started, we can wait until all is done or the process is killed
LOG.info("Mesos JobManager started");
// wait until everything is done
actorSystem.awaitTermination();
// if we get here, everything work out jolly all right, and we even exited smoothly
if (webMonitor != null) {
try {
webMonitor.stop();
} catch (Throwable t) {
LOG.error("Failed to stop the web frontend", t);
}
}
try {
artifactServer.stop();
} catch (Throwable t) {
LOG.error("Failed to stop the artifact server", t);
}
org.apache.flink.runtime.concurrent.Executors.gracefulShutdown(AkkaUtils.getTimeout(config).toMillis(), TimeUnit.MILLISECONDS, futureExecutor, ioExecutor);
try {
mesosServices.close(true);
} catch (Throwable t) {
LOG.error("Failed to clean up and close MesosServices.", t);
}
return 0;
}
use of org.apache.flink.runtime.util.ExecutorThreadFactory in project flink by apache.
the class YarnApplicationMasterRunner method runApplicationMaster.
// ------------------------------------------------------------------------
// Core work method
// ------------------------------------------------------------------------
/**
* The main work method, must run as a privileged action.
*
* @return The return code for the Java process.
*/
protected int runApplicationMaster(Configuration config) {
ActorSystem actorSystem = null;
WebMonitor webMonitor = null;
int numberProcessors = Hardware.getNumberCPUCores();
final ScheduledExecutorService futureExecutor = Executors.newScheduledThreadPool(numberProcessors, new ExecutorThreadFactory("yarn-jobmanager-future"));
final ExecutorService ioExecutor = Executors.newFixedThreadPool(numberProcessors, new ExecutorThreadFactory("yarn-jobmanager-io"));
try {
// ------- (1) load and parse / validate all configurations -------
// loading all config values here has the advantage that the program fails fast, if any
// configuration problem occurs
final String currDir = ENV.get(Environment.PWD.key());
require(currDir != null, "Current working directory variable (%s) not set", Environment.PWD.key());
// Note that we use the "appMasterHostname" given by YARN here, to make sure
// we use the hostnames given by YARN consistently throughout akka.
// for akka "localhost" and "localhost.localdomain" are different actors.
final String appMasterHostname = ENV.get(Environment.NM_HOST.key());
require(appMasterHostname != null, "ApplicationMaster hostname variable %s not set", Environment.NM_HOST.key());
LOG.info("YARN assigned hostname for application master: {}", appMasterHostname);
//Update keytab and principal path to reflect YARN container path location
final String remoteKeytabPath = ENV.get(YarnConfigKeys.KEYTAB_PATH);
final String remoteKeytabPrincipal = ENV.get(YarnConfigKeys.KEYTAB_PRINCIPAL);
String keytabPath = null;
if (remoteKeytabPath != null) {
File f = new File(currDir, Utils.KEYTAB_FILE_NAME);
keytabPath = f.getAbsolutePath();
LOG.info("keytabPath: {}", keytabPath);
}
if (keytabPath != null && remoteKeytabPrincipal != null) {
config.setString(SecurityOptions.KERBEROS_LOGIN_KEYTAB, keytabPath);
config.setString(SecurityOptions.KERBEROS_LOGIN_PRINCIPAL, remoteKeytabPrincipal);
}
// Hadoop/Yarn configuration (loads config data automatically from classpath files)
final YarnConfiguration yarnConfig = new YarnConfiguration();
final int taskManagerContainerMemory;
final int numInitialTaskManagers;
final int slotsPerTaskManager;
try {
taskManagerContainerMemory = Integer.parseInt(ENV.get(YarnConfigKeys.ENV_TM_MEMORY));
} catch (NumberFormatException e) {
throw new RuntimeException("Invalid value for " + YarnConfigKeys.ENV_TM_MEMORY + " : " + e.getMessage());
}
try {
numInitialTaskManagers = Integer.parseInt(ENV.get(YarnConfigKeys.ENV_TM_COUNT));
} catch (NumberFormatException e) {
throw new RuntimeException("Invalid value for " + YarnConfigKeys.ENV_TM_COUNT + " : " + e.getMessage());
}
try {
slotsPerTaskManager = Integer.parseInt(ENV.get(YarnConfigKeys.ENV_SLOTS));
} catch (NumberFormatException e) {
throw new RuntimeException("Invalid value for " + YarnConfigKeys.ENV_SLOTS + " : " + e.getMessage());
}
final ContaineredTaskManagerParameters taskManagerParameters = ContaineredTaskManagerParameters.create(config, taskManagerContainerMemory, slotsPerTaskManager);
LOG.info("TaskManagers will be created with {} task slots", taskManagerParameters.numSlots());
LOG.info("TaskManagers will be started with container size {} MB, JVM heap size {} MB, " + "JVM direct memory limit {} MB", taskManagerParameters.taskManagerTotalMemoryMB(), taskManagerParameters.taskManagerHeapSizeMB(), taskManagerParameters.taskManagerDirectMemoryLimitMB());
// ----------------- (2) start the actor system -------------------
// try to start the actor system, JobManager and JobManager actor system
// using the port range definition from the config.
final String amPortRange = config.getString(ConfigConstants.YARN_APPLICATION_MASTER_PORT, ConfigConstants.DEFAULT_YARN_JOB_MANAGER_PORT);
actorSystem = BootstrapTools.startActorSystem(config, appMasterHostname, amPortRange, LOG);
final String akkaHostname = AkkaUtils.getAddress(actorSystem).host().get();
final int akkaPort = (Integer) AkkaUtils.getAddress(actorSystem).port().get();
LOG.info("Actor system bound to hostname {}.", akkaHostname);
// ---- (3) Generate the configuration for the TaskManagers
final Configuration taskManagerConfig = BootstrapTools.generateTaskManagerConfiguration(config, akkaHostname, akkaPort, slotsPerTaskManager, TASKMANAGER_REGISTRATION_TIMEOUT);
LOG.debug("TaskManager configuration: {}", taskManagerConfig);
final ContainerLaunchContext taskManagerContext = Utils.createTaskExecutorContext(config, yarnConfig, ENV, taskManagerParameters, taskManagerConfig, currDir, getTaskManagerClass(), LOG);
// ---- (4) start the actors and components in this order:
// 1) JobManager & Archive (in non-HA case, the leader service takes this)
// 2) Web Monitor (we need its port to register)
// 3) Resource Master for YARN
// 4) Process reapers for the JobManager and Resource Master
// 1: the JobManager
LOG.debug("Starting JobManager actor");
// we start the JobManager with its standard name
ActorRef jobManager = JobManager.startJobManagerActors(config, actorSystem, futureExecutor, ioExecutor, new Some<>(JobManager.JOB_MANAGER_NAME()), Option.<String>empty(), getJobManagerClass(), getArchivistClass())._1();
// 2: the web monitor
LOG.debug("Starting Web Frontend");
webMonitor = BootstrapTools.startWebMonitorIfConfigured(config, actorSystem, jobManager, LOG);
String protocol = "http://";
if (config.getBoolean(ConfigConstants.JOB_MANAGER_WEB_SSL_ENABLED, ConfigConstants.DEFAULT_JOB_MANAGER_WEB_SSL_ENABLED) && SSLUtils.getSSLEnabled(config)) {
protocol = "https://";
}
final String webMonitorURL = webMonitor == null ? null : protocol + appMasterHostname + ":" + webMonitor.getServerPort();
// 3: Flink's Yarn ResourceManager
LOG.debug("Starting YARN Flink Resource Manager");
// we need the leader retrieval service here to be informed of new leaders and session IDs
LeaderRetrievalService leaderRetriever = LeaderRetrievalUtils.createLeaderRetrievalService(config, jobManager);
Props resourceMasterProps = YarnFlinkResourceManager.createActorProps(getResourceManagerClass(), config, yarnConfig, leaderRetriever, appMasterHostname, webMonitorURL, taskManagerParameters, taskManagerContext, numInitialTaskManagers, LOG);
ActorRef resourceMaster = actorSystem.actorOf(resourceMasterProps);
// 4: Process reapers
// The process reapers ensure that upon unexpected actor death, the process exits
// and does not stay lingering around unresponsive
LOG.debug("Starting process reapers for JobManager and YARN Application Master");
actorSystem.actorOf(Props.create(ProcessReaper.class, resourceMaster, LOG, ACTOR_DIED_EXIT_CODE), "YARN_Resource_Master_Process_Reaper");
actorSystem.actorOf(Props.create(ProcessReaper.class, jobManager, LOG, ACTOR_DIED_EXIT_CODE), "JobManager_Process_Reaper");
} catch (Throwable t) {
// make sure that everything whatever ends up in the log
LOG.error("YARN Application Master initialization failed", t);
if (webMonitor != null) {
try {
webMonitor.stop();
} catch (Throwable ignored) {
LOG.warn("Failed to stop the web frontend", t);
}
}
if (actorSystem != null) {
try {
actorSystem.shutdown();
} catch (Throwable tt) {
LOG.error("Error shutting down actor system", tt);
}
}
futureExecutor.shutdownNow();
ioExecutor.shutdownNow();
return INIT_ERROR_EXIT_CODE;
}
// everything started, we can wait until all is done or the process is killed
LOG.info("YARN Application Master started");
// wait until everything is done
actorSystem.awaitTermination();
// if we get here, everything work out jolly all right, and we even exited smoothly
if (webMonitor != null) {
try {
webMonitor.stop();
} catch (Throwable t) {
LOG.error("Failed to stop the web frontend", t);
}
}
org.apache.flink.runtime.concurrent.Executors.gracefulShutdown(AkkaUtils.getTimeout(config).toMillis(), TimeUnit.MILLISECONDS, futureExecutor, ioExecutor);
return 0;
}
use of org.apache.flink.runtime.util.ExecutorThreadFactory in project flink by apache.
the class JobManagerServices method fromConfiguration.
// ------------------------------------------------------------------------
// Creating the components from a configuration
// ------------------------------------------------------------------------
public static JobManagerServices fromConfiguration(Configuration config, HighAvailabilityServices haServices) throws Exception {
final BlobServer blobServer = new BlobServer(config, haServices);
final long cleanupInterval = config.getLong(ConfigConstants.LIBRARY_CACHE_MANAGER_CLEANUP_INTERVAL, ConfigConstants.DEFAULT_LIBRARY_CACHE_MANAGER_CLEANUP_INTERVAL) * 1000;
final BlobLibraryCacheManager libraryCacheManager = new BlobLibraryCacheManager(blobServer, cleanupInterval);
final FiniteDuration timeout;
try {
timeout = AkkaUtils.getTimeout(config);
} catch (NumberFormatException e) {
throw new IllegalConfigurationException(AkkaUtils.formatDurationParingErrorMessage());
}
final ScheduledExecutorService futureExecutor = Executors.newScheduledThreadPool(Hardware.getNumberCPUCores(), new ExecutorThreadFactory("jobmanager-future"));
return new JobManagerServices(futureExecutor, libraryCacheManager, RestartStrategyFactory.createRestartStrategyFactory(config), Time.of(timeout.length(), timeout.unit()));
}
Aggregations