Use of org.apache.flink.runtime.leaderelection.TestingListener in project flink by apache.
The class JobManagerHACheckpointRecoveryITCase, method testCheckpointedStreamingSumProgram.
/**
* Simple checkpointed streaming sum.
*
* <p>The sources (Parallelism) count until sequenceEnd. The sink (1) sums up all counts and
* returns it to the main thread via a static variable. We wait until some checkpoints are
* completed and sanity check that the sources recover with an updated state to make sure that
* this test actually tests something.
*/
@Test
@RetryOnFailure(times = 1)
public void testCheckpointedStreamingSumProgram() throws Exception {
// Config
final int checkpointingInterval = 200;
final int sequenceEnd = 5000;
final long expectedSum = Parallelism * sequenceEnd * (sequenceEnd + 1) / 2;
final StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironment();
env.setParallelism(Parallelism);
env.enableCheckpointing(checkpointingInterval);
env.addSource(new CheckpointedSequenceSource(sequenceEnd)).addSink(new CountingSink()).setParallelism(1);
JobGraph jobGraph = env.getStreamGraph().getJobGraph();
Configuration config = ZooKeeperTestUtils.createZooKeeperHAConfig(ZooKeeper.getConnectString(), FileStateBackendBasePath.getAbsoluteFile().toURI().toString());
config.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, Parallelism);
ActorSystem testSystem = null;
final JobManagerProcess[] jobManagerProcess = new JobManagerProcess[2];
LeaderRetrievalService leaderRetrievalService = null;
ActorSystem taskManagerSystem = null;
try {
final Deadline deadline = TestTimeOut.fromNow();
// Test actor system
testSystem = AkkaUtils.createActorSystem(new Configuration(), new Some<>(new Tuple2<String, Object>("localhost", 0)));
// The job managers
jobManagerProcess[0] = new JobManagerProcess(0, config);
jobManagerProcess[1] = new JobManagerProcess(1, config);
jobManagerProcess[0].startProcess();
jobManagerProcess[1].startProcess();
// Leader listener
TestingListener leaderListener = new TestingListener();
leaderRetrievalService = ZooKeeperUtils.createLeaderRetrievalService(config);
leaderRetrievalService.start(leaderListener);
// The task manager
taskManagerSystem = AkkaUtils.createActorSystem(config, Option.apply(new Tuple2<String, Object>("localhost", 0)));
TaskManager.startTaskManagerComponentsAndActor(config, ResourceID.generate(), taskManagerSystem, "localhost", Option.<String>empty(), Option.<LeaderRetrievalService>empty(), false, TaskManager.class);
{
// Initial submission
leaderListener.waitForNewLeader(deadline.timeLeft().toMillis());
String leaderAddress = leaderListener.getAddress();
UUID leaderId = leaderListener.getLeaderSessionID();
// Get the leader ref
ActorRef leaderRef = AkkaUtils.getActorRef(leaderAddress, testSystem, deadline.timeLeft());
ActorGateway leader = new AkkaActorGateway(leaderRef, leaderId);
// Submit the job in detached mode
leader.tell(new SubmitJob(jobGraph, ListeningBehaviour.DETACHED));
JobManagerActorTestUtils.waitForJobStatus(jobGraph.getJobID(), JobStatus.RUNNING, leader, deadline.timeLeft());
}
// Who's the boss?
JobManagerProcess leadingJobManagerProcess;
if (jobManagerProcess[0].getJobManagerAkkaURL(deadline.timeLeft()).equals(leaderListener.getAddress())) {
leadingJobManagerProcess = jobManagerProcess[0];
} else {
leadingJobManagerProcess = jobManagerProcess[1];
}
CompletedCheckpointsLatch.await();
// Kill the leading job manager process
leadingJobManagerProcess.destroy();
{
// Recovery by the standby JobManager
leaderListener.waitForNewLeader(deadline.timeLeft().toMillis());
String leaderAddress = leaderListener.getAddress();
UUID leaderId = leaderListener.getLeaderSessionID();
ActorRef leaderRef = AkkaUtils.getActorRef(leaderAddress, testSystem, deadline.timeLeft());
ActorGateway leader = new AkkaActorGateway(leaderRef, leaderId);
JobManagerActorTestUtils.waitForJobStatus(jobGraph.getJobID(), JobStatus.RUNNING, leader, deadline.timeLeft());
}
// Wait to finish
FinalCountLatch.await();
assertEquals(expectedSum, (long) FinalCount.get());
for (int i = 0; i < Parallelism; i++) {
assertNotEquals(0, RecoveredStates.get(i));
}
} catch (Throwable t) {
// Reset all static state for test retries
CompletedCheckpointsLatch = new CountDownLatch(2);
RecoveredStates = new AtomicLongArray(Parallelism);
FinalCountLatch = new CountDownLatch(1);
FinalCount = new AtomicReference<>();
LastElement = -1;
// Print early (in some situations the process logs get too big
// for Travis and the root problem is not shown)
t.printStackTrace();
// In case of an error, print the job manager process logs.
if (jobManagerProcess[0] != null) {
jobManagerProcess[0].printProcessLog();
}
if (jobManagerProcess[1] != null) {
jobManagerProcess[1].printProcessLog();
}
throw t;
} finally {
if (jobManagerProcess[0] != null) {
jobManagerProcess[0].destroy();
}
if (jobManagerProcess[1] != null) {
jobManagerProcess[1].destroy();
}
if (leaderRetrievalService != null) {
leaderRetrievalService.stop();
}
if (taskManagerSystem != null) {
taskManagerSystem.shutdown();
}
if (testSystem != null) {
testSystem.shutdown();
}
}
}
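The CheckpointedSequenceSource and CountingSink used by this test are defined elsewhere in JobManagerHACheckpointRecoveryITCase and are not part of this snippet. To illustrate the mechanism the javadoc describes (sources that recover with an updated state), here is a minimal, hypothetical sketch of such a checkpointed source written against Flink's legacy ListCheckpointed contract; the class and field names are illustrative and are not the actual test classes.
// Illustrative sketch only -- not the CheckpointedSequenceSource used by the test above.
// Assumes the legacy ListCheckpointed contract: snapshot the current position and
// restore it after a failover.
import java.util.Collections;
import java.util.List;
import org.apache.flink.streaming.api.checkpoint.ListCheckpointed;
import org.apache.flink.streaming.api.functions.source.RichParallelSourceFunction;
import org.apache.flink.streaming.api.functions.source.SourceFunction.SourceContext;
public class SequenceSourceSketch extends RichParallelSourceFunction<Long> implements ListCheckpointed<Long> {
    private final long sequenceEnd;
    private volatile boolean running = true;
    private long current = 0;
    public SequenceSourceSketch(long sequenceEnd) {
        this.sequenceEnd = sequenceEnd;
    }
    @Override
    public void run(SourceContext<Long> ctx) throws Exception {
        while (running && current <= sequenceEnd) {
            // Emit and advance under the checkpoint lock so snapshots are consistent.
            synchronized (ctx.getCheckpointLock()) {
                ctx.collect(current);
                current++;
            }
        }
    }
    @Override
    public void cancel() {
        running = false;
    }
    @Override
    public List<Long> snapshotState(long checkpointId, long timestamp) {
        return Collections.singletonList(current);
    }
    @Override
    public void restoreState(List<Long> state) {
        // A non-zero restored position is what the test's RecoveredStates assertion looks for.
        current = state.get(0);
    }
}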
Use of org.apache.flink.runtime.leaderelection.TestingListener in project flink by apache.
The class KubernetesMultipleComponentLeaderElectionDriverTest, method testPublishLeaderInformation.
@Test
public void testPublishLeaderInformation() throws Exception {
new TestFixture() {
{
runTest(() -> {
leaderCallbackGrantLeadership();
leaderElectionListener.await(LeaderElectionEvent.IsLeaderEvent.class);
final LeaderInformation leaderInformation = LeaderInformation.known(UUID.randomUUID(), "localhost");
final String componentId = "componentId";
final DefaultLeaderRetrievalService leaderRetrievalService = new DefaultLeaderRetrievalService(new KubernetesMultipleComponentLeaderRetrievalDriverFactory(getFlinkKubeClient(), getConfigMapSharedWatcher(), testExecutorExtension.getExecutor(), LEADER_CONFIGMAP_NAME, componentId));
final TestingListener leaderRetrievalListener = new TestingListener();
leaderRetrievalService.start(leaderRetrievalListener);
leaderElectionDriver.publishLeaderInformation(componentId, leaderInformation);
notifyLeaderRetrievalWatchOnModifiedConfigMap();
leaderRetrievalListener.waitForNewLeader(10_000L);
assertThat(leaderRetrievalListener.getLeader()).isEqualTo(leaderInformation);
});
}
};
}
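Across all of these examples, TestingListener plays the same role: it is a test implementation of a leader retrieval listener that remembers the most recently announced leader and lets the test thread block in waitForNewLeader until one is published. A simplified, self-contained stand-in for that behaviour (not the actual Flink class, whose interface differs) could look like this:
// Simplified stand-in for TestingListener: remembers the latest leader announcement
// and lets a test thread block until a leader has been published.
import java.util.UUID;
import java.util.concurrent.TimeoutException;
public class WaitingLeaderListener {
    private final Object lock = new Object();
    private String leaderAddress;
    private UUID leaderSessionId;
    // Called by the leader retrieval service when a new leader is published.
    public void notifyLeaderAddress(String address, UUID sessionId) {
        synchronized (lock) {
            this.leaderAddress = address;
            this.leaderSessionId = sessionId;
            lock.notifyAll();
        }
    }
    // Blocks until a leader address is known or the timeout expires.
    public String waitForNewLeader(long timeoutMillis) throws InterruptedException, TimeoutException {
        final long deadline = System.currentTimeMillis() + timeoutMillis;
        synchronized (lock) {
            while (leaderAddress == null) {
                long remaining = deadline - System.currentTimeMillis();
                if (remaining <= 0) {
                    throw new TimeoutException("No leader elected within " + timeoutMillis + " ms");
                }
                lock.wait(remaining);
            }
            return leaderAddress;
        }
    }
    public UUID getLeaderSessionID() {
        synchronized (lock) {
            return leaderSessionId;
        }
    }
}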
Use of org.apache.flink.runtime.leaderelection.TestingListener in project flink by apache.
The class JobManagerHAProcessFailureRecoveryITCase, method testDispatcherProcessFailure.
@Test
public void testDispatcherProcessFailure() throws Exception {
final Time timeout = Time.seconds(30L);
final File zookeeperStoragePath = temporaryFolder.newFolder();
// Config
final int numberOfJobManagers = 2;
final int numberOfTaskManagers = 2;
final int numberOfSlotsPerTaskManager = 2;
assertEquals(PARALLELISM, numberOfTaskManagers * numberOfSlotsPerTaskManager);
// Job managers
final DispatcherProcess[] dispatcherProcesses = new DispatcherProcess[numberOfJobManagers];
// Task managers
TaskManagerRunner[] taskManagerRunners = new TaskManagerRunner[numberOfTaskManagers];
HighAvailabilityServices highAvailabilityServices = null;
LeaderRetrievalService leaderRetrievalService = null;
// Coordination between the processes goes through a directory
File coordinateTempDir = null;
// Cluster config
Configuration config = ZooKeeperTestUtils.createZooKeeperHAConfig(zooKeeper.getConnectString(), zookeeperStoragePath.getPath());
// Task manager configuration
config.set(TaskManagerOptions.MANAGED_MEMORY_SIZE, MemorySize.parse("4m"));
config.set(TaskManagerOptions.NETWORK_MEMORY_MIN, MemorySize.parse("3200k"));
config.set(TaskManagerOptions.NETWORK_MEMORY_MAX, MemorySize.parse("3200k"));
config.set(NettyShuffleEnvironmentOptions.NETWORK_SORT_SHUFFLE_MIN_BUFFERS, 16);
config.setInteger(TaskManagerOptions.NUM_TASK_SLOTS, 2);
config.set(TaskManagerOptions.TASK_HEAP_MEMORY, MemorySize.parse("128m"));
config.set(TaskManagerOptions.CPU_CORES, 1.0);
TaskExecutorResourceUtils.adjustForLocalExecution(config);
final RpcService rpcService = RpcSystem.load().remoteServiceBuilder(config, "localhost", "0").createAndStart();
try {
final Deadline deadline = Deadline.fromNow(TEST_TIMEOUT);
// Coordination directory
coordinateTempDir = temporaryFolder.newFolder();
// Start first process
dispatcherProcesses[0] = new DispatcherProcess(0, config);
dispatcherProcesses[0].startProcess();
highAvailabilityServices = HighAvailabilityServicesUtils.createAvailableOrEmbeddedServices(config, TestingUtils.defaultExecutor(), NoOpFatalErrorHandler.INSTANCE);
final PluginManager pluginManager = PluginUtils.createPluginManagerFromRootFolder(config);
// Start the task manager process
for (int i = 0; i < numberOfTaskManagers; i++) {
taskManagerRunners[i] = new TaskManagerRunner(config, pluginManager, TaskManagerRunner::createTaskExecutorService);
taskManagerRunners[i].start();
}
// Leader listener
TestingListener leaderListener = new TestingListener();
leaderRetrievalService = highAvailabilityServices.getDispatcherLeaderRetriever();
leaderRetrievalService.start(leaderListener);
// Initial submission
leaderListener.waitForNewLeader(deadline.timeLeft().toMillis());
String leaderAddress = leaderListener.getAddress();
UUID leaderId = leaderListener.getLeaderSessionID();
final CompletableFuture<DispatcherGateway> dispatcherGatewayFuture = rpcService.connect(leaderAddress, DispatcherId.fromUuid(leaderId), DispatcherGateway.class);
final DispatcherGateway dispatcherGateway = dispatcherGatewayFuture.get();
// Wait for all task managers to connect to the leading job manager
waitForTaskManagers(numberOfTaskManagers, dispatcherGateway, deadline.timeLeft());
final File coordinateDirClosure = coordinateTempDir;
final Throwable[] errorRef = new Throwable[1];
// we trigger program execution in a separate thread
Thread programTrigger = new Thread("Program Trigger") {
@Override
public void run() {
try {
testJobManagerFailure(zooKeeper.getConnectString(), coordinateDirClosure, zookeeperStoragePath);
} catch (Throwable t) {
t.printStackTrace();
errorRef[0] = t;
}
}
};
// start the test program
programTrigger.start();
// wait until all marker files are in place, indicating that all tasks have started
AbstractTaskManagerProcessFailureRecoveryTest.waitForMarkerFiles(coordinateTempDir, READY_MARKER_FILE_PREFIX, PARALLELISM, deadline.timeLeft().toMillis());
// Kill one of the job managers and trigger recovery
dispatcherProcesses[0].destroy();
dispatcherProcesses[1] = new DispatcherProcess(1, config);
dispatcherProcesses[1].startProcess();
// we create the marker file which signals the program's tasks that they can
// complete
AbstractTaskManagerProcessFailureRecoveryTest.touchFile(new File(coordinateTempDir, PROCEED_MARKER_FILE));
programTrigger.join(deadline.timeLeft().toMillis());
// We wait for the finish marker file. We don't wait for the program trigger, because
// we submit in detached mode.
AbstractTaskManagerProcessFailureRecoveryTest.waitForMarkerFiles(coordinateTempDir, FINISH_MARKER_FILE_PREFIX, 1, deadline.timeLeft().toMillis());
// check that the program really finished
assertFalse("The program did not finish in time", programTrigger.isAlive());
// check whether the program encountered an error
if (errorRef[0] != null) {
Throwable error = errorRef[0];
error.printStackTrace();
fail("The program encountered a " + error.getClass().getSimpleName() + " : " + error.getMessage());
}
} catch (Throwable t) {
// Print early (in some situations the process logs get too big
// for Travis and the root problem is not shown)
t.printStackTrace();
for (DispatcherProcess p : dispatcherProcesses) {
if (p != null) {
p.printProcessLog();
}
}
throw t;
} finally {
for (int i = 0; i < numberOfTaskManagers; i++) {
if (taskManagerRunners[i] != null) {
taskManagerRunners[i].close();
}
}
if (leaderRetrievalService != null) {
leaderRetrievalService.stop();
}
for (DispatcherProcess dispatcherProcess : dispatcherProcesses) {
if (dispatcherProcess != null) {
dispatcherProcess.destroy();
}
}
if (highAvailabilityServices != null) {
highAvailabilityServices.closeAndCleanupAllData();
}
RpcUtils.terminateRpcService(rpcService, timeout);
// Delete coordination directory
if (coordinateTempDir != null) {
try {
FileUtils.deleteDirectory(coordinateTempDir);
} catch (Throwable ignored) {
}
}
}
}
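The test above coordinates with the tasks of the submitted job exclusively through marker files in a shared directory: each task touches a READY marker once it is running, the test touches the PROCEED marker when the tasks may finish, and a FINISH marker signals completion. The real helpers live in AbstractTaskManagerProcessFailureRecoveryTest; the following is only a rough sketch of the pattern, with hypothetical names.
// Minimal sketch of the marker-file coordination used above (illustrative, not the real helpers).
import java.io.File;
import java.io.IOException;
import java.util.concurrent.TimeoutException;
public final class MarkerFiles {
    private MarkerFiles() {}
    // Create (or refresh) a marker file, e.g. a per-task READY marker or the PROCEED marker.
    public static void touch(File dir, String name) throws IOException {
        File marker = new File(dir, name);
        if (!marker.createNewFile()) {
            // already exists -- that is fine for a marker
            marker.setLastModified(System.currentTimeMillis());
        }
    }
    // Poll until expectedCount files with the given prefix exist, or the timeout expires.
    public static void waitForMarkers(File dir, String prefix, int expectedCount, long timeoutMillis)
            throws InterruptedException, TimeoutException {
        final long deadline = System.currentTimeMillis() + timeoutMillis;
        while (System.currentTimeMillis() < deadline) {
            File[] files = dir.listFiles((d, name) -> name.startsWith(prefix));
            if (files != null && files.length >= expectedCount) {
                return;
            }
            Thread.sleep(100);
        }
        throw new TimeoutException("Expected " + expectedCount + " '" + prefix + "' markers in " + dir);
    }
}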
Use of org.apache.flink.runtime.leaderelection.TestingListener in project flink by apache.
The class WebRuntimeMonitorITCase, method testRedirectToLeader.
/**
* Tests that the web monitor associated with a non-leading (follower) job manager redirects to the leader.
*/
@Test
public void testRedirectToLeader() throws Exception {
final Deadline deadline = TestTimeout.fromNow();
ActorSystem[] jobManagerSystem = new ActorSystem[2];
WebRuntimeMonitor[] webMonitor = new WebRuntimeMonitor[2];
List<LeaderRetrievalService> leaderRetrievalServices = new ArrayList<>();
try (TestingServer zooKeeper = new TestingServer()) {
final Configuration config = ZooKeeperTestUtils.createZooKeeperHAConfig(zooKeeper.getConnectString(), temporaryFolder.getRoot().getPath());
File logDir = temporaryFolder.newFolder();
Path logFile = Files.createFile(new File(logDir, "jobmanager.log").toPath());
Files.createFile(new File(logDir, "jobmanager.out").toPath());
config.setInteger(ConfigConstants.JOB_MANAGER_WEB_PORT_KEY, 0);
config.setString(ConfigConstants.JOB_MANAGER_WEB_LOG_PATH_KEY, logFile.toString());
for (int i = 0; i < jobManagerSystem.length; i++) {
jobManagerSystem[i] = AkkaUtils.createActorSystem(new Configuration(), new Some<>(new Tuple2<String, Object>("localhost", 0)));
}
for (int i = 0; i < webMonitor.length; i++) {
LeaderRetrievalService lrs = ZooKeeperUtils.createLeaderRetrievalService(config);
leaderRetrievalServices.add(lrs);
webMonitor[i] = new WebRuntimeMonitor(config, lrs, jobManagerSystem[i]);
}
ActorRef[] jobManager = new ActorRef[2];
String[] jobManagerAddress = new String[2];
for (int i = 0; i < jobManager.length; i++) {
Configuration jmConfig = config.clone();
jmConfig.setInteger(ConfigConstants.JOB_MANAGER_WEB_PORT_KEY, webMonitor[i].getServerPort());
jobManager[i] = JobManager.startJobManagerActors(jmConfig, jobManagerSystem[i], TestingUtils.defaultExecutor(), TestingUtils.defaultExecutor(), JobManager.class, MemoryArchivist.class)._1();
jobManagerAddress[i] = AkkaUtils.getAkkaURL(jobManagerSystem[i], jobManager[i]);
webMonitor[i].start(jobManagerAddress[i]);
}
LeaderRetrievalService lrs = ZooKeeperUtils.createLeaderRetrievalService(config);
leaderRetrievalServices.add(lrs);
TestingListener leaderListener = new TestingListener();
lrs.start(leaderListener);
leaderListener.waitForNewLeader(deadline.timeLeft().toMillis());
String leaderAddress = leaderListener.getAddress();
int leaderIndex = leaderAddress.equals(jobManagerAddress[0]) ? 0 : 1;
int followerIndex = (leaderIndex + 1) % 2;
ActorSystem leadingSystem = jobManagerSystem[leaderIndex];
ActorSystem followerSystem = jobManagerSystem[followerIndex];
WebMonitor leadingWebMonitor = webMonitor[leaderIndex];
WebMonitor followerWebMonitor = webMonitor[followerIndex];
// For test stability reasons we have to wait until we are sure that both leader
// listeners have been notified.
JobManagerRetriever leadingRetriever = Whitebox.getInternalState(leadingWebMonitor, "retriever");
JobManagerRetriever followerRetriever = Whitebox.getInternalState(followerWebMonitor, "retriever");
// Wait for the initial notifications
waitForLeaderNotification(leadingSystem, jobManager[leaderIndex], leadingRetriever, deadline);
waitForLeaderNotification(leadingSystem, jobManager[leaderIndex], followerRetriever, deadline);
try (HttpTestClient leaderClient = new HttpTestClient("localhost", leadingWebMonitor.getServerPort());
HttpTestClient followingClient = new HttpTestClient("localhost", followerWebMonitor.getServerPort())) {
String expected = new Scanner(new File(MAIN_RESOURCES_PATH + "/index.html")).useDelimiter("\\A").next();
// Request the file from the leading web server
leaderClient.sendGetRequest("index.html", deadline.timeLeft());
HttpTestClient.SimpleHttpResponse response = leaderClient.getNextResponse(deadline.timeLeft());
assertEquals(HttpResponseStatus.OK, response.getStatus());
assertEquals(response.getType(), MimeTypes.getMimeTypeForExtension("html"));
assertEquals(expected, response.getContent());
// Request the file from the following web server
followingClient.sendGetRequest("index.html", deadline.timeLeft());
response = followingClient.getNextResponse(deadline.timeLeft());
assertEquals(HttpResponseStatus.TEMPORARY_REDIRECT, response.getStatus());
assertTrue(response.getLocation().contains(String.valueOf(leadingWebMonitor.getServerPort())));
// Kill the leader
leadingSystem.shutdown();
// Wait for the notification of the follower
waitForLeaderNotification(followerSystem, jobManager[followerIndex], followerRetriever, deadline);
// Same request to the new leader
followingClient.sendGetRequest("index.html", deadline.timeLeft());
response = followingClient.getNextResponse(deadline.timeLeft());
assertEquals(HttpResponseStatus.OK, response.getStatus());
assertEquals(response.getType(), MimeTypes.getMimeTypeForExtension("html"));
assertEquals(expected, response.getContent());
// Simple overview request
followingClient.sendGetRequest("/overview", deadline.timeLeft());
response = followingClient.getNextResponse(deadline.timeLeft());
assertEquals(HttpResponseStatus.OK, response.getStatus());
assertEquals(response.getType(), MimeTypes.getMimeTypeForExtension("json"));
assertTrue(response.getContent().contains("\"taskmanagers\":1") || response.getContent().contains("\"taskmanagers\":0"));
}
} finally {
for (ActorSystem system : jobManagerSystem) {
if (system != null) {
system.shutdown();
}
}
for (WebMonitor monitor : webMonitor) {
monitor.stop();
}
for (LeaderRetrievalService lrs : leaderRetrievalServices) {
lrs.stop();
}
}
}
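The redirect assertion above relies on Flink's HttpTestClient test utility. The same check can also be expressed with plain java.net.HttpURLConnection; the helper below is a hypothetical standalone variant of it (class and method names are invented for illustration).
// Hedged sketch: verifying the follower's 307 redirect with plain HttpURLConnection
// instead of Flink's HttpTestClient (illustrative helper, not part of the test above).
import java.net.HttpURLConnection;
import java.net.URL;
public final class RedirectCheck {
    private RedirectCheck() {}
    public static void assertRedirectsToLeader(int followerPort, int leaderPort) throws Exception {
        URL url = new URL("http://localhost:" + followerPort + "/index.html");
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        // Do not follow redirects automatically; we want to inspect the Location header ourselves.
        conn.setInstanceFollowRedirects(false);
        try {
            int status = conn.getResponseCode();
            if (status != 307) {
                throw new AssertionError("Expected 307 TEMPORARY_REDIRECT but got " + status);
            }
            String location = conn.getHeaderField("Location");
            if (location == null || !location.contains(String.valueOf(leaderPort))) {
                throw new AssertionError("Redirect does not point at the leader: " + location);
            }
        } finally {
            conn.disconnect();
        }
    }
}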
Use of org.apache.flink.runtime.leaderelection.TestingListener in project flink by apache.
The class JobManagerHACheckpointRecoveryITCase, method testCheckpointRecoveryFailure.
/**
* Tests that the JobManager logs failures during recovery properly.
*
* @see <a href="https://issues.apache.org/jira/browse/FLINK-3185">FLINK-3185</a>
*/
@Test
@RetryOnFailure(times = 1)
public void testCheckpointRecoveryFailure() throws Exception {
final Deadline testDeadline = TestTimeOut.fromNow();
final String zooKeeperQuorum = ZooKeeper.getConnectString();
final String fileStateBackendPath = FileStateBackendBasePath.getAbsoluteFile().toString();
Configuration config = ZooKeeperTestUtils.createZooKeeperHAConfig(zooKeeperQuorum, fileStateBackendPath);
config.setInteger(ConfigConstants.LOCAL_NUMBER_JOB_MANAGER, 2);
JobManagerProcess[] jobManagerProcess = new JobManagerProcess[2];
LeaderRetrievalService leaderRetrievalService = null;
ActorSystem taskManagerSystem = null;
ActorSystem testActorSystem = null;
try {
// Test actor system
testActorSystem = AkkaUtils.createActorSystem(new Configuration(), new Some<>(new Tuple2<String, Object>("localhost", 0)));
// The job managers
jobManagerProcess[0] = new JobManagerProcess(0, config);
jobManagerProcess[1] = new JobManagerProcess(1, config);
jobManagerProcess[0].startProcess();
jobManagerProcess[1].startProcess();
// Leader listener
TestingListener leaderListener = new TestingListener();
leaderRetrievalService = ZooKeeperUtils.createLeaderRetrievalService(config);
leaderRetrievalService.start(leaderListener);
// The task manager
taskManagerSystem = AkkaUtils.createActorSystem(config, Option.apply(new Tuple2<String, Object>("localhost", 0)));
TaskManager.startTaskManagerComponentsAndActor(config, ResourceID.generate(), taskManagerSystem, "localhost", Option.<String>empty(), Option.<LeaderRetrievalService>empty(), false, TaskManager.class);
// Get the leader
leaderListener.waitForNewLeader(testDeadline.timeLeft().toMillis());
String leaderAddress = leaderListener.getAddress();
UUID leaderId = leaderListener.getLeaderSessionID();
// Get the leader ref
ActorRef leaderRef = AkkaUtils.getActorRef(leaderAddress, testActorSystem, testDeadline.timeLeft());
ActorGateway leader = new AkkaActorGateway(leaderRef, leaderId);
// Who's the boss?
JobManagerProcess leadingJobManagerProcess;
JobManagerProcess nonLeadingJobManagerProcess;
if (jobManagerProcess[0].getJobManagerAkkaURL(testDeadline.timeLeft()).equals(leaderListener.getAddress())) {
leadingJobManagerProcess = jobManagerProcess[0];
nonLeadingJobManagerProcess = jobManagerProcess[1];
} else {
leadingJobManagerProcess = jobManagerProcess[1];
nonLeadingJobManagerProcess = jobManagerProcess[0];
}
// Blocking JobGraph
JobVertex blockingVertex = new JobVertex("Blocking vertex");
blockingVertex.setInvokableClass(BlockingNoOpInvokable.class);
JobGraph jobGraph = new JobGraph(blockingVertex);
// Submit the job in detached mode
leader.tell(new SubmitJob(jobGraph, ListeningBehaviour.DETACHED));
// Wait for the job to be running
JobManagerActorTestUtils.waitForJobStatus(jobGraph.getJobID(), JobStatus.RUNNING, leader, testDeadline.timeLeft());
// Remove all files
FileUtils.deleteDirectory(FileStateBackendBasePath);
// Kill the leader
leadingJobManagerProcess.destroy();
// Verify that the job manager logs the failed recovery. We cannot
// do more at this point. :(
boolean success = false;
while (testDeadline.hasTimeLeft()) {
String output = nonLeadingJobManagerProcess.getProcessOutput();
if (output != null) {
if (output.contains("Failed to recover job") && output.contains("java.io.FileNotFoundException")) {
success = true;
break;
}
} else {
log.warn("No process output available.");
}
Thread.sleep(500);
}
assertTrue("Did not find expected output in logs.", success);
} catch (Throwable t) {
// Print early (in some situations the process logs get too big
// for Travis and the root problem is not shown)
t.printStackTrace();
// In case of an error, print the job manager process logs.
if (jobManagerProcess[0] != null) {
jobManagerProcess[0].printProcessLog();
}
if (jobManagerProcess[1] != null) {
jobManagerProcess[1].printProcessLog();
}
throw t;
} finally {
if (jobManagerProcess[0] != null) {
jobManagerProcess[0].destroy();
}
if (jobManagerProcess[1] != null) {
jobManagerProcess[1].destroy();
}
if (leaderRetrievalService != null) {
leaderRetrievalService.stop();
}
if (taskManagerSystem != null) {
taskManagerSystem.shutdown();
}
if (testActorSystem != null) {
testActorSystem.shutdown();
}
}
}
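The log-scanning loop at the end of this test is an instance of a pattern that recurs in these ITCases: poll a condition until it holds or a deadline expires. Extracted into a small helper (a sketch, not an existing Flink utility), it might read:
// Sketch of a deadline-bounded polling helper, generalizing the log-scanning loop above.
import java.util.function.Supplier;
public final class Poll {
    private Poll() {}
    // Returns true as soon as the condition holds, false if the deadline expires first.
    public static boolean until(Supplier<Boolean> condition, long timeoutMillis, long pollIntervalMillis)
            throws InterruptedException {
        final long deadline = System.currentTimeMillis() + timeoutMillis;
        while (System.currentTimeMillis() < deadline) {
            if (Boolean.TRUE.equals(condition.get())) {
                return true;
            }
            Thread.sleep(pollIntervalMillis);
        }
        return false;
    }
}
With such a helper, the loop above reduces to a single call whose condition checks the non-leading JobManager's process output for the expected "Failed to recover job" and FileNotFoundException messages.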