Use of org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService in project flink by apache.
The class JobManagerHACheckpointRecoveryITCase, method testCheckpointedStreamingSumProgram.
/**
 * Simple checkpointed streaming sum.
 *
 * <p>The sources (one per parallel subtask, Parallelism in total) count up to sequenceEnd.
 * The sink (parallelism 1) sums up all counts and returns the result to the main thread via
 * a static variable. We wait until some checkpoints have completed and sanity-check that the
 * sources recover with an updated state, to make sure that this test actually tests something.
 */
@Test
@RetryOnFailure(times = 1)
public void testCheckpointedStreamingSumProgram() throws Exception {
    // Config
    final int checkpointingInterval = 200;
    final int sequenceEnd = 5000;
    final long expectedSum = Parallelism * sequenceEnd * (sequenceEnd + 1) / 2;

    final StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironment();
    env.setParallelism(Parallelism);
    env.enableCheckpointing(checkpointingInterval);
    env.addSource(new CheckpointedSequenceSource(sequenceEnd)).addSink(new CountingSink()).setParallelism(1);

    JobGraph jobGraph = env.getStreamGraph().getJobGraph();

    Configuration config = ZooKeeperTestUtils.createZooKeeperHAConfig(
        ZooKeeper.getConnectString(), FileStateBackendBasePath.getAbsoluteFile().toURI().toString());
    config.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, Parallelism);

    ActorSystem testSystem = null;
    final JobManagerProcess[] jobManagerProcess = new JobManagerProcess[2];
    LeaderRetrievalService leaderRetrievalService = null;
    ActorSystem taskManagerSystem = null;

    try {
        final Deadline deadline = TestTimeOut.fromNow();

        // Test actor system
        testSystem = AkkaUtils.createActorSystem(new Configuration(), new Some<>(new Tuple2<String, Object>("localhost", 0)));

        // The job managers
        jobManagerProcess[0] = new JobManagerProcess(0, config);
        jobManagerProcess[1] = new JobManagerProcess(1, config);
        jobManagerProcess[0].startProcess();
        jobManagerProcess[1].startProcess();

        // Leader listener
        TestingListener leaderListener = new TestingListener();
        leaderRetrievalService = ZooKeeperUtils.createLeaderRetrievalService(config);
        leaderRetrievalService.start(leaderListener);

        // The task manager
        taskManagerSystem = AkkaUtils.createActorSystem(config, Option.apply(new Tuple2<String, Object>("localhost", 0)));
        TaskManager.startTaskManagerComponentsAndActor(
            config, ResourceID.generate(), taskManagerSystem, "localhost",
            Option.<String>empty(), Option.<LeaderRetrievalService>empty(), false, TaskManager.class);

        {
            // Initial submission
            leaderListener.waitForNewLeader(deadline.timeLeft().toMillis());
            String leaderAddress = leaderListener.getAddress();
            UUID leaderId = leaderListener.getLeaderSessionID();

            // Get the leader ref
            ActorRef leaderRef = AkkaUtils.getActorRef(leaderAddress, testSystem, deadline.timeLeft());
            ActorGateway leader = new AkkaActorGateway(leaderRef, leaderId);

            // Submit the job in detached mode
            leader.tell(new SubmitJob(jobGraph, ListeningBehaviour.DETACHED));
            JobManagerActorTestUtils.waitForJobStatus(jobGraph.getJobID(), JobStatus.RUNNING, leader, deadline.timeLeft());
        }

        // Who's the boss?
        JobManagerProcess leadingJobManagerProcess;
        if (jobManagerProcess[0].getJobManagerAkkaURL(deadline.timeLeft()).equals(leaderListener.getAddress())) {
            leadingJobManagerProcess = jobManagerProcess[0];
        } else {
            leadingJobManagerProcess = jobManagerProcess[1];
        }

        CompletedCheckpointsLatch.await();

        // Kill the leading job manager process
        leadingJobManagerProcess.destroy();

        {
            // Recovery by the standby JobManager
            leaderListener.waitForNewLeader(deadline.timeLeft().toMillis());
            String leaderAddress = leaderListener.getAddress();
            UUID leaderId = leaderListener.getLeaderSessionID();

            ActorRef leaderRef = AkkaUtils.getActorRef(leaderAddress, testSystem, deadline.timeLeft());
            ActorGateway leader = new AkkaActorGateway(leaderRef, leaderId);
            JobManagerActorTestUtils.waitForJobStatus(jobGraph.getJobID(), JobStatus.RUNNING, leader, deadline.timeLeft());
        }

        // Wait to finish
        FinalCountLatch.await();
        assertEquals(expectedSum, (long) FinalCount.get());
        for (int i = 0; i < Parallelism; i++) {
            assertNotEquals(0, RecoveredStates.get(i));
        }
    } catch (Throwable t) {
        // Reset all static state for test retries
        CompletedCheckpointsLatch = new CountDownLatch(2);
        RecoveredStates = new AtomicLongArray(Parallelism);
        FinalCountLatch = new CountDownLatch(1);
        FinalCount = new AtomicReference<>();
        LastElement = -1;

        // Print early (in some situations the process logs get too big
        // for Travis and the root problem is not shown)
        t.printStackTrace();

        // In case of an error, print the job manager process logs.
        if (jobManagerProcess[0] != null) {
            jobManagerProcess[0].printProcessLog();
        }
        if (jobManagerProcess[1] != null) {
            jobManagerProcess[1].printProcessLog();
        }

        throw t;
    } finally {
        if (jobManagerProcess[0] != null) {
            jobManagerProcess[0].destroy();
        }
        if (jobManagerProcess[1] != null) {
            jobManagerProcess[1].destroy();
        }
        if (leaderRetrievalService != null) {
            leaderRetrievalService.stop();
        }
        if (taskManagerSystem != null) {
            taskManagerSystem.shutdown();
        }
        if (testSystem != null) {
            testSystem.shutdown();
        }
    }
}
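For context, the CheckpointedSequenceSource and CountingSink used above are inner classes of the test and are not part of this snippet. A minimal sketch of what such a checkpointed source looks like, assuming the legacy Checkpointed interface of this Flink version (names and details are illustrative, not the test's actual code):

import org.apache.flink.streaming.api.checkpoint.Checkpointed;
import org.apache.flink.streaming.api.functions.source.RichParallelSourceFunction;

// Illustrative sketch only - not the actual CheckpointedSequenceSource from the test.
// It emits 0..end and snapshots its current position so a restarted task resumes mid-sequence.
public static class SketchSequenceSource
        extends RichParallelSourceFunction<Long>
        implements Checkpointed<Long> {

    private final long end;
    private volatile boolean running = true;
    private long current = 0;

    public SketchSequenceSource(long end) {
        this.end = end;
    }

    @Override
    public void run(SourceContext<Long> ctx) throws Exception {
        while (running && current <= end) {
            // emit and advance the state under the checkpoint lock, so a snapshot
            // never falls between "emit" and "advance"
            synchronized (ctx.getCheckpointLock()) {
                ctx.collect(current);
                current++;
            }
        }
    }

    @Override
    public void cancel() {
        running = false;
    }

    @Override
    public Long snapshotState(long checkpointId, long checkpointTimestamp) {
        return current;
    }

    @Override
    public void restoreState(Long state) {
        // a non-zero restored value is what the test's RecoveredStates assertion checks for
        current = state;
    }
}

A source restored from a checkpoint taken mid-sequence resumes with current > 0, which is why the test can assert that every recovered subtask reports a non-zero state.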
Use of org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService in project flink by apache.
The class MesosApplicationMasterRunner, method runPrivileged.
// ------------------------------------------------------------------------
//  Core work method
// ------------------------------------------------------------------------

/**
 * The main work method, must run as a privileged action.
 *
 * @return The return code for the Java process.
 */
protected int runPrivileged(Configuration config, Configuration dynamicProperties) {
    ActorSystem actorSystem = null;
    WebMonitor webMonitor = null;
    MesosArtifactServer artifactServer = null;
    ScheduledExecutorService futureExecutor = null;
    ExecutorService ioExecutor = null;
    MesosServices mesosServices = null;

    try {
        // ------- (1) load and parse / validate all configurations -------
        // Note that we use the "appMasterHostname" given by the system, to make sure
        // we use the hostnames consistently throughout Akka.
        // For Akka, "localhost" and "localhost.localdomain" are different actors.
        final String appMasterHostname = InetAddress.getLocalHost().getHostName();

        // Mesos configuration
        final MesosConfiguration mesosConfig = createMesosConfig(config, appMasterHostname);

        // JM configuration
        int numberProcessors = Hardware.getNumberCPUCores();
        futureExecutor = Executors.newScheduledThreadPool(numberProcessors, new ExecutorThreadFactory("mesos-jobmanager-future"));
        ioExecutor = Executors.newFixedThreadPool(numberProcessors, new ExecutorThreadFactory("mesos-jobmanager-io"));
        mesosServices = MesosServicesUtils.createMesosServices(config);

        // TM configuration
        final MesosTaskManagerParameters taskManagerParameters = MesosTaskManagerParameters.create(config);
        LOG.info("TaskManagers will be created with {} task slots", taskManagerParameters.containeredParameters().numSlots());
        LOG.info("TaskManagers will be started with container size {} MB, JVM heap size {} MB, " +
                "JVM direct memory limit {} MB, {} cpus",
            taskManagerParameters.containeredParameters().taskManagerTotalMemoryMB(),
            taskManagerParameters.containeredParameters().taskManagerHeapSizeMB(),
            taskManagerParameters.containeredParameters().taskManagerDirectMemoryLimitMB(),
            taskManagerParameters.cpus());

        // JM endpoint, which should be explicitly configured based on acquired net resources
        final int listeningPort = config.getInteger(ConfigConstants.JOB_MANAGER_IPC_PORT_KEY, ConfigConstants.DEFAULT_JOB_MANAGER_IPC_PORT);
        checkState(listeningPort >= 0 && listeningPort <= 65535,
            "Config parameter \"" + ConfigConstants.JOB_MANAGER_IPC_PORT_KEY + "\" is invalid, it must be between 0 and 65535");

        // ----------------- (2) start the actor system -------------------
        // try to start the actor system, JobManager and JobManager actor system
        // using the configured address and ports
        actorSystem = BootstrapTools.startActorSystem(config, appMasterHostname, listeningPort, LOG);

        Address address = AkkaUtils.getAddress(actorSystem);
        final String akkaHostname = address.host().get();
        final int akkaPort = (Integer) address.port().get();
        LOG.info("Actor system bound to hostname {}.", akkaHostname);

        // try to start the artifact server
        LOG.debug("Starting Artifact Server");
        final int artifactServerPort = config.getInteger(ConfigConstants.MESOS_ARTIFACT_SERVER_PORT_KEY, ConfigConstants.DEFAULT_MESOS_ARTIFACT_SERVER_PORT);
        final String artifactServerPrefix = UUID.randomUUID().toString();
        artifactServer = new MesosArtifactServer(artifactServerPrefix, akkaHostname, artifactServerPort, config);

        // ----------------- (3) Generate the configuration for the TaskManagers -------------------
        // generate a container spec which conveys the artifacts/vars needed to launch a TM
        ContainerSpecification taskManagerContainerSpec = new ContainerSpecification();

        // propagate the AM dynamic configuration to the TM
        taskManagerContainerSpec.getDynamicConfiguration().addAll(dynamicProperties);

        // propagate newly-generated configuration elements
        final Configuration taskManagerConfig = BootstrapTools.generateTaskManagerConfiguration(
            new Configuration(), akkaHostname, akkaPort,
            taskManagerParameters.containeredParameters().numSlots(), TASKMANAGER_REGISTRATION_TIMEOUT);
        taskManagerContainerSpec.getDynamicConfiguration().addAll(taskManagerConfig);

        // apply the overlays
        applyOverlays(config, taskManagerContainerSpec);

        // configure the artifact server to serve the specified artifacts
        configureArtifactServer(artifactServer, taskManagerContainerSpec);

        // ----------------- (4) start the actors -------------------
        // 1) JobManager & Archive (in non-HA case, the leader service takes this)
        // 2) Web Monitor (we need its port to register)
        // 3) Resource Master for Mesos
        // 4) Process reapers for the JobManager and Resource Master

        // 1: the JobManager
        LOG.debug("Starting JobManager actor");

        // we start the JobManager with its standard name
        ActorRef jobManager = JobManager.startJobManagerActors(
            config, actorSystem, futureExecutor, ioExecutor,
            new scala.Some<>(JobManager.JOB_MANAGER_NAME()), scala.Option.<String>empty(),
            getJobManagerClass(), getArchivistClass())._1();

        // 2: the web monitor
        LOG.debug("Starting Web Frontend");
        webMonitor = BootstrapTools.startWebMonitorIfConfigured(config, actorSystem, jobManager, LOG);
        if (webMonitor != null) {
            final URL webMonitorURL = new URL("http", appMasterHostname, webMonitor.getServerPort(), "/");
            mesosConfig.frameworkInfo().setWebuiUrl(webMonitorURL.toExternalForm());
        }

        // 3: Flink's Mesos ResourceManager
        LOG.debug("Starting Mesos Flink Resource Manager");

        // create the worker store to persist task information across restarts
        MesosWorkerStore workerStore = mesosServices.createMesosWorkerStore(config, ioExecutor);

        // we need the leader retrieval service here to be informed of new
        // leader session IDs, even though there can be only one leader ever
        LeaderRetrievalService leaderRetriever = LeaderRetrievalUtils.createLeaderRetrievalService(config, jobManager);

        Props resourceMasterProps = MesosFlinkResourceManager.createActorProps(
            getResourceManagerClass(), config, mesosConfig, workerStore, leaderRetriever,
            taskManagerParameters, taskManagerContainerSpec, artifactServer, LOG);
        ActorRef resourceMaster = actorSystem.actorOf(resourceMasterProps, "Mesos_Resource_Master");

        // 4: Process reapers
        // The process reapers ensure that upon unexpected actor death, the process exits
        // and does not stay lingering around unresponsive
        LOG.debug("Starting process reapers for JobManager");
        actorSystem.actorOf(Props.create(ProcessReaper.class, resourceMaster, LOG, ACTOR_DIED_EXIT_CODE), "Mesos_Resource_Master_Process_Reaper");
        actorSystem.actorOf(Props.create(ProcessReaper.class, jobManager, LOG, ACTOR_DIED_EXIT_CODE), "JobManager_Process_Reaper");
    } catch (Throwable t) {
        // make sure that whatever goes wrong ends up in the log
        LOG.error("Mesos JobManager initialization failed", t);

        if (webMonitor != null) {
            try {
                webMonitor.stop();
            } catch (Throwable ignored) {
                LOG.warn("Failed to stop the web frontend", ignored);
            }
        }
        if (artifactServer != null) {
            try {
                artifactServer.stop();
            } catch (Throwable ignored) {
                LOG.error("Failed to stop the artifact server", ignored);
            }
        }
        if (actorSystem != null) {
            try {
                actorSystem.shutdown();
            } catch (Throwable tt) {
                LOG.error("Error shutting down actor system", tt);
            }
        }
        if (futureExecutor != null) {
            try {
                futureExecutor.shutdownNow();
            } catch (Throwable tt) {
                LOG.error("Error shutting down future executor", tt);
            }
        }
        if (ioExecutor != null) {
            try {
                ioExecutor.shutdownNow();
            } catch (Throwable tt) {
                LOG.error("Error shutting down io executor", tt);
            }
        }
        if (mesosServices != null) {
            try {
                mesosServices.close(false);
            } catch (Throwable tt) {
                LOG.error("Error closing the mesos services.", tt);
            }
        }
        return INIT_ERROR_EXIT_CODE;
    }

    // everything started, we can wait until all is done or the process is killed
    LOG.info("Mesos JobManager started");

    // wait until everything is done
    actorSystem.awaitTermination();

    // if we get here, everything worked out and we exited smoothly
    if (webMonitor != null) {
        try {
            webMonitor.stop();
        } catch (Throwable t) {
            LOG.error("Failed to stop the web frontend", t);
        }
    }
    try {
        artifactServer.stop();
    } catch (Throwable t) {
        LOG.error("Failed to stop the artifact server", t);
    }

    org.apache.flink.runtime.concurrent.Executors.gracefulShutdown(
        AkkaUtils.getTimeout(config).toMillis(), TimeUnit.MILLISECONDS, futureExecutor, ioExecutor);

    try {
        mesosServices.close(true);
    } catch (Throwable t) {
        LOG.error("Failed to clean up and close MesosServices.", t);
    }
    return 0;
}
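The process reapers started in step 4 follow a simple Akka pattern: watch a critical actor and take the whole JVM down if that actor dies. A minimal sketch of the pattern, assuming Akka's classic Java API of this era (illustrative only; Flink's actual ProcessReaper also logs diagnostics before exiting):

import akka.actor.ActorRef;
import akka.actor.Terminated;
import akka.actor.UntypedActor;

// Watches one actor; if it terminates, exits the process with the given code.
public class SimpleProcessReaper extends UntypedActor {

    private final ActorRef watched;
    private final int exitCode;

    public SimpleProcessReaper(ActorRef watched, int exitCode) {
        this.watched = watched;
        this.exitCode = exitCode;
    }

    @Override
    public void preStart() {
        // register for a Terminated message when the watched actor dies
        getContext().watch(watched);
    }

    @Override
    public void onReceive(Object message) {
        if (message instanceof Terminated) {
            // the watched actor died unexpectedly; take the process down with it
            System.exit(exitCode);
        } else {
            unhandled(message);
        }
    }
}

This is why the JobManager and the Mesos resource master each get their own reaper above: the container framework then restarts a cleanly dead process instead of keeping an unresponsive one alive.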
Use of org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService in project flink by apache.
The class TaskManagerMetricsTest, method testMetricRegistryLifeCycle.
/**
 * Tests the metric registry life cycle on JobManager re-connects.
 */
@Test
public void testMetricRegistryLifeCycle() throws Exception {
    ActorSystem actorSystem = null;
    try {
        actorSystem = AkkaUtils.createLocalActorSystem(new Configuration());

        // ================================================================
        // Start JobManager
        // ================================================================
        final ActorRef jobManager = JobManager.startJobManagerActors(
            new Configuration(), actorSystem, TestingUtils.defaultExecutor(), TestingUtils.defaultExecutor(),
            JobManager.class, MemoryArchivist.class)._1();

        LeaderRetrievalService leaderRetrievalService = new StandaloneLeaderRetrievalService(jobManager.path().toString());

        // ================================================================
        // Start TaskManager
        // ================================================================
        final Configuration config = new Configuration();
        final ResourceID tmResourceID = ResourceID.generate();

        TaskManagerServicesConfiguration taskManagerServicesConfiguration =
            TaskManagerServicesConfiguration.fromConfiguration(config, InetAddress.getLocalHost(), false);
        TaskManagerConfiguration taskManagerConfiguration = TaskManagerConfiguration.fromConfiguration(config);
        TaskManagerServices taskManagerServices =
            TaskManagerServices.fromConfiguration(taskManagerServicesConfiguration, tmResourceID);

        final MetricRegistry tmRegistry = taskManagerServices.getMetricRegistry();

        // create the task manager
        final Props tmProps = TaskManager.getTaskManagerProps(
            TaskManager.class, taskManagerConfiguration, tmResourceID,
            taskManagerServices.getTaskManagerLocation(), taskManagerServices.getMemoryManager(),
            taskManagerServices.getIOManager(), taskManagerServices.getNetworkEnvironment(),
            leaderRetrievalService, tmRegistry);

        final ActorRef taskManager = actorSystem.actorOf(tmProps);

        new JavaTestKit(actorSystem) {
            {
                new Within(new FiniteDuration(5000, TimeUnit.SECONDS)) {
                    @Override
                    protected void run() {
                        taskManager.tell(TaskManagerMessages.getNotifyWhenRegisteredAtJobManagerMessage(), getTestActor());

                        // wait for the TM to be registered
                        expectMsgEquals(TaskManagerMessages.getRegisteredAtJobManagerMessage());

                        // trigger re-registration of TM; this should include a disconnect from the current JM
                        taskManager.tell(new TaskManagerMessages.JobManagerLeaderAddress(jobManager.path().toString(), null), jobManager);

                        // wait for re-registration to be completed
                        taskManager.tell(TaskManagerMessages.getNotifyWhenRegisteredAtJobManagerMessage(), getTestActor());
                        expectMsgEquals(TaskManagerMessages.getRegisteredAtJobManagerMessage());
                    }
                };
            }
        };

        // verify that the registry was not shut down due to the disconnect
        Assert.assertFalse(tmRegistry.isShutdown());

        // shut down the actors and the actor system
        actorSystem.shutdown();
        actorSystem.awaitTermination();
    } finally {
        if (actorSystem != null) {
            actorSystem.shutdown();
        }
    }
}
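Unlike the ZooKeeper-backed service in the first snippet, a StandaloneLeaderRetrievalService wraps a fixed leader address and notifies its listener exactly once on start(). A minimal sketch of the listener side, using a hypothetical CountDownLatch-based listener (not a Flink class):

import java.util.UUID;
import java.util.concurrent.CountDownLatch;
import org.apache.flink.runtime.leaderretrieval.LeaderRetrievalListener;

// Blocks until the retrieval service reports a leader address (or an error).
public class OneShotLeaderListener implements LeaderRetrievalListener {

    private final CountDownLatch notified = new CountDownLatch(1);
    private volatile String leaderAddress;

    @Override
    public void notifyLeaderAddress(String address, UUID leaderSessionId) {
        leaderAddress = address;
        notified.countDown();
    }

    @Override
    public void handleError(Exception exception) {
        // unblock the waiter; leaderAddress stays null to signal failure
        notified.countDown();
    }

    public String waitForLeader() throws InterruptedException {
        notified.await();
        return leaderAddress;
    }
}

Usage would mirror the test: new StandaloneLeaderRetrievalService(jobManager.path().toString()).start(new OneShotLeaderListener());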
Use of org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService in project flink by apache.
The class YARNHighAvailabilityITCase, method testMultipleAMKill.
/**
 * Tests that the application master can be killed multiple times and that the surviving
 * TaskManager successfully reconnects to the newly started JobManager.
 *
 * @throws Exception if anything goes wrong during the test
 */
@Test
public void testMultipleAMKill() throws Exception {
    final int numberKillingAttempts = numberApplicationAttempts - 1;

    TestingYarnClusterDescriptor flinkYarnClient = new TestingYarnClusterDescriptor();
    Assert.assertNotNull("unable to get yarn client", flinkYarnClient);
    flinkYarnClient.setTaskManagerCount(1);
    flinkYarnClient.setJobManagerMemory(768);
    flinkYarnClient.setTaskManagerMemory(1024);
    flinkYarnClient.setLocalJarPath(new Path(flinkUberjar.getAbsolutePath()));
    flinkYarnClient.addShipFiles(Arrays.asList(flinkLibFolder.listFiles()));

    String confDirPath = System.getenv(ConfigConstants.ENV_FLINK_CONF_DIR);
    flinkYarnClient.setConfigurationDirectory(confDirPath);

    String fsStateHandlePath = temp.getRoot().getPath();

    // load the configuration
    File configDirectory = new File(confDirPath);
    GlobalConfiguration.loadConfiguration(configDirectory.getAbsolutePath());

    flinkYarnClient.setFlinkConfiguration(GlobalConfiguration.loadConfiguration());
    flinkYarnClient.setDynamicPropertiesEncoded(
        "recovery.mode=zookeeper@@recovery.zookeeper.quorum=" + zkServer.getConnectString() +
        "@@yarn.application-attempts=" + numberApplicationAttempts +
        "@@" + CoreOptions.STATE_BACKEND + "=FILESYSTEM" +
        "@@" + FsStateBackendFactory.CHECKPOINT_DIRECTORY_URI_CONF_KEY + "=" + fsStateHandlePath + "/checkpoints" +
        "@@" + HighAvailabilityOptions.HA_STORAGE_PATH.key() + "=" + fsStateHandlePath + "/recovery");
    flinkYarnClient.setConfigurationFilePath(new Path(confDirPath + File.separator + "flink-conf.yaml"));

    ClusterClient yarnCluster = null;
    final FiniteDuration timeout = new FiniteDuration(2, TimeUnit.MINUTES);
    try {
        yarnCluster = flinkYarnClient.deploy();
        final Configuration config = yarnCluster.getFlinkConfiguration();

        new JavaTestKit(actorSystem) {
            {
                for (int attempt = 0; attempt < numberKillingAttempts; attempt++) {
                    new Within(timeout) {
                        @Override
                        protected void run() {
                            try {
                                LeaderRetrievalService lrs = LeaderRetrievalUtils.createLeaderRetrievalService(config);
                                ActorGateway gateway = LeaderRetrievalUtils.retrieveLeaderGateway(lrs, actorSystem, timeout);
                                ActorGateway selfGateway = new AkkaActorGateway(getRef(), gateway.leaderSessionID());
                                gateway.tell(new TestingJobManagerMessages.NotifyWhenAtLeastNumTaskManagerAreRegistered(1), selfGateway);
                                expectMsgEquals(Acknowledge.get());

                                // kill the current leading JobManager
                                gateway.tell(PoisonPill.getInstance());
                            } catch (Exception e) {
                                throw new AssertionError("Could not complete test.", e);
                            }
                        }
                    };
                }

                new Within(timeout) {
                    @Override
                    protected void run() {
                        try {
                            LeaderRetrievalService lrs = LeaderRetrievalUtils.createLeaderRetrievalService(config);
                            ActorGateway gateway2 = LeaderRetrievalUtils.retrieveLeaderGateway(lrs, actorSystem, timeout);
                            ActorGateway selfGateway = new AkkaActorGateway(getRef(), gateway2.leaderSessionID());
                            gateway2.tell(new TestingJobManagerMessages.NotifyWhenAtLeastNumTaskManagerAreRegistered(1), selfGateway);
                            expectMsgEquals(Acknowledge.get());
                        } catch (Exception e) {
                            throw new AssertionError("Could not complete test.", e);
                        }
                    }
                };
            }
        };
    } finally {
        if (yarnCluster != null) {
            yarnCluster.shutdown();
        }
    }
}
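The dynamic properties passed to setDynamicPropertiesEncoded above are "key=value" pairs joined with the "@@" separator. A hypothetical helper (not part of Flink's public API) illustrating that encoding scheme:

import java.util.LinkedHashMap;
import java.util.Map;
import java.util.StringJoiner;

// Encodes and decodes "key=value@@key=value" dynamic-property strings.
public final class DynamicPropertiesCodec {

    private static final String SEPARATOR = "@@";

    private DynamicPropertiesCodec() {}

    public static String encode(Map<String, String> props) {
        StringJoiner joiner = new StringJoiner(SEPARATOR);
        for (Map.Entry<String, String> e : props.entrySet()) {
            joiner.add(e.getKey() + "=" + e.getValue());
        }
        return joiner.toString();
    }

    public static Map<String, String> decode(String encoded) {
        Map<String, String> props = new LinkedHashMap<>();
        for (String pair : encoded.split(SEPARATOR)) {
            int idx = pair.indexOf('=');
            if (idx > 0) {
                // split on the first '=' only, so values may themselves contain '='
                props.put(pair.substring(0, idx), pair.substring(idx + 1));
            }
        }
        return props;
    }
}

Encoding the properties as one string lets the YARN client hand per-deployment overrides (recovery mode, ZooKeeper quorum, state backend paths) to the application master without shipping a modified flink-conf.yaml.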
Use of org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService in project flink by apache.
The class SingleLeaderElectionServiceTest, method testShutdown.
@Test
public void testShutdown() throws Exception {
    final UUID uuid = UUID.randomUUID();
    final SingleLeaderElectionService service = new SingleLeaderElectionService(executor, uuid);

    // create a leader contender and let it grab leadership
    final LeaderContender contender = mockContender(service);
    service.start(contender);
    verify(contender, times(1)).grantLeadership(uuid);

    // some leader listeners
    final LeaderRetrievalListener listener1 = mock(LeaderRetrievalListener.class);
    final LeaderRetrievalListener listener2 = mock(LeaderRetrievalListener.class);

    LeaderRetrievalService listenerService1 = service.createLeaderRetrievalService();
    LeaderRetrievalService listenerService2 = service.createLeaderRetrievalService();

    listenerService1.start(listener1);
    listenerService2.start(listener2);

    // one listener stops
    listenerService1.stop();

    // shut down the service
    service.shutdown();

    // the leader contender and running listener should get error notifications
    verify(contender, times(1)).handleError(any(Exception.class));
    verify(listener2, times(1)).handleError(any(Exception.class));

    // the stopped listener gets no notification
    verify(listener1, times(0)).handleError(any(Exception.class));

    // should not be possible to start again after shutdown
    try {
        service.start(contender);
        fail("should fail with an exception");
    } catch (IllegalStateException e) {
        // expected
    }

    // no additional leadership grant
    verify(contender, times(1)).grantLeadership(any(UUID.class));
}
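The mockContender(...) call above refers to a private helper of the test class that is not shown in this snippet. A plausible sketch of such a helper with Mockito, assuming the contender must confirm the leader session ID it is granted (names and details are illustrative, not the exact original):

// Illustrative sketch of a mockContender helper. The mocked contender confirms
// whatever leader session ID it is granted, which is what lets
// service.start(contender) complete the leadership handshake.
private static LeaderContender mockContender(final SingleLeaderElectionService service) {
    final LeaderContender mockContender = mock(LeaderContender.class);

    when(mockContender.getAddress()).thenReturn("akka://contender");

    doAnswer(new Answer<Void>() {
        @Override
        public Void answer(InvocationOnMock invocation) throws Throwable {
            // confirm the session ID passed to grantLeadership(UUID)
            final UUID leaderSessionId = (UUID) invocation.getArguments()[0];
            service.confirmLeaderSessionID(leaderSessionId);
            return null;
        }
    }).when(mockContender).grantLeadership(any(UUID.class));

    return mockContender;
}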