use of org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService in project flink by apache.
the class JobManagerHAProcessFailureBatchRecoveryITCase method testJobManagerProcessFailure.
@Test
public void testJobManagerProcessFailure() throws Exception {
// Config
final int numberOfJobManagers = 2;
final int numberOfTaskManagers = 2;
final int numberOfSlotsPerTaskManager = 2;
assertEquals(PARALLELISM, numberOfTaskManagers * numberOfSlotsPerTaskManager);
// Setup
// Test actor system
ActorSystem testActorSystem;
// Job managers
final JobManagerProcess[] jmProcess = new JobManagerProcess[numberOfJobManagers];
// Task managers
final ActorSystem[] tmActorSystem = new ActorSystem[numberOfTaskManagers];
// Leader election service
LeaderRetrievalService leaderRetrievalService = null;
// Coordination between the processes goes through a directory
File coordinateTempDir = null;
try {
final Deadline deadline = TestTimeOut.fromNow();
// Coordination directory
coordinateTempDir = createTempDirectory();
// Job Managers
Configuration config = ZooKeeperTestUtils.createZooKeeperHAConfig(ZooKeeper.getConnectString(), FileStateBackendBasePath.getPath());
// Start first process
jmProcess[0] = new JobManagerProcess(0, config);
jmProcess[0].startProcess();
// Task manager configuration
config.setInteger(ConfigConstants.TASK_MANAGER_MEMORY_SIZE_KEY, 4);
config.setInteger(ConfigConstants.TASK_MANAGER_NETWORK_NUM_BUFFERS_KEY, 100);
config.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, 2);
// Start the task manager process
for (int i = 0; i < numberOfTaskManagers; i++) {
tmActorSystem[i] = AkkaUtils.createActorSystem(AkkaUtils.getDefaultAkkaConfig());
TaskManager.startTaskManagerComponentsAndActor(config, ResourceID.generate(), tmActorSystem[i], "localhost", Option.<String>empty(), Option.<LeaderRetrievalService>empty(), false, TaskManager.class);
}
// Test actor system
testActorSystem = AkkaUtils.createActorSystem(AkkaUtils.getDefaultAkkaConfig());
jmProcess[0].getActorRef(testActorSystem, deadline.timeLeft());
// Leader listener
TestingListener leaderListener = new TestingListener();
leaderRetrievalService = ZooKeeperUtils.createLeaderRetrievalService(config);
leaderRetrievalService.start(leaderListener);
// Initial submission
leaderListener.waitForNewLeader(deadline.timeLeft().toMillis());
String leaderAddress = leaderListener.getAddress();
UUID leaderId = leaderListener.getLeaderSessionID();
// Get the leader ref
ActorRef leaderRef = AkkaUtils.getActorRef(leaderAddress, testActorSystem, deadline.timeLeft());
ActorGateway leaderGateway = new AkkaActorGateway(leaderRef, leaderId);
// Wait for all task managers to connect to the leading job manager
JobManagerActorTestUtils.waitForTaskManagers(numberOfTaskManagers, leaderGateway, deadline.timeLeft());
final File coordinateDirClosure = coordinateTempDir;
final Throwable[] errorRef = new Throwable[1];
// we trigger program execution in a separate thread
Thread programTrigger = new Thread("Program Trigger") {
@Override
public void run() {
try {
testJobManagerFailure(ZooKeeper.getConnectString(), coordinateDirClosure);
} catch (Throwable t) {
t.printStackTrace();
errorRef[0] = t;
}
}
};
//start the test program
programTrigger.start();
// wait until all marker files are in place, indicating that all tasks have started
AbstractTaskManagerProcessFailureRecoveryTest.waitForMarkerFiles(coordinateTempDir, READY_MARKER_FILE_PREFIX, PARALLELISM, deadline.timeLeft().toMillis());
// Kill one of the job managers and trigger recovery
jmProcess[0].destroy();
jmProcess[1] = new JobManagerProcess(1, config);
jmProcess[1].startProcess();
jmProcess[1].getActorRef(testActorSystem, deadline.timeLeft());
// we create the marker file which signals the program functions tasks that they can complete
AbstractTaskManagerProcessFailureRecoveryTest.touchFile(new File(coordinateTempDir, PROCEED_MARKER_FILE));
programTrigger.join(deadline.timeLeft().toMillis());
// We wait for the finish marker file. We don't wait for the program trigger, because
// we submit in detached mode.
AbstractTaskManagerProcessFailureRecoveryTest.waitForMarkerFiles(coordinateTempDir, FINISH_MARKER_FILE_PREFIX, 1, deadline.timeLeft().toMillis());
// check that the program really finished
assertFalse("The program did not finish in time", programTrigger.isAlive());
// check whether the program encountered an error
if (errorRef[0] != null) {
Throwable error = errorRef[0];
error.printStackTrace();
fail("The program encountered a " + error.getClass().getSimpleName() + " : " + error.getMessage());
}
} catch (Throwable t) {
// Print early (in some situations the process logs get too big
// for Travis and the root problem is not shown)
t.printStackTrace();
for (JobManagerProcess p : jmProcess) {
if (p != null) {
p.printProcessLog();
}
}
throw t;
} finally {
for (int i = 0; i < numberOfTaskManagers; i++) {
if (tmActorSystem[i] != null) {
tmActorSystem[i].shutdown();
}
}
if (leaderRetrievalService != null) {
leaderRetrievalService.stop();
}
for (JobManagerProcess jmProces : jmProcess) {
if (jmProces != null) {
jmProces.destroy();
}
}
// Delete coordination directory
if (coordinateTempDir != null) {
try {
FileUtils.deleteDirectory(coordinateTempDir);
} catch (Throwable ignored) {
}
}
}
}
use of org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService in project flink by apache.
the class ChaosMonkeyITCase method testChaosMonkey.
@Test
public void testChaosMonkey() throws Exception {
// Test config
final int numberOfJobManagers = 3;
final int numberOfTaskManagers = 3;
final int numberOfSlotsPerTaskManager = 2;
// The final count each source is counting to: 1...n
final int n = 5000;
// Parallelism for the program
final int parallelism = numberOfTaskManagers * numberOfSlotsPerTaskManager;
// The test should not run longer than this
final FiniteDuration testDuration = new FiniteDuration(10, TimeUnit.MINUTES);
// Every x seconds a random job or task manager is killed
//
// The job will will be running for $killEvery seconds and then a random Job/TaskManager
// will be killed. On recovery (which takes some time to bring up the new process etc.),
// this test will wait for task managers to reconnect before starting the next count down.
// Therefore the delay between retries is not important in this setup.
final FiniteDuration killEvery = new FiniteDuration(5, TimeUnit.SECONDS);
// Trigger a checkpoint every
final int checkpointingIntervalMs = 1000;
// Total number of kills
final int totalNumberOfKills = 10;
// -----------------------------------------------------------------------------------------
// Setup
Configuration config = ZooKeeperTestUtils.createZooKeeperHAConfig(ZooKeeper.getConnectString(), FileStateBackendBasePath.toURI().toString());
// Akka and restart timeouts
config.setString(ConfigConstants.AKKA_WATCH_HEARTBEAT_INTERVAL, "1000 ms");
config.setString(ConfigConstants.AKKA_WATCH_HEARTBEAT_PAUSE, "6 s");
config.setInteger(ConfigConstants.AKKA_WATCH_THRESHOLD, 9);
if (checkpointingIntervalMs >= killEvery.toMillis()) {
throw new IllegalArgumentException("Relax! You want to kill processes every " + killEvery + ", but the checkpointing interval is " + checkpointingIntervalMs / 1000 + " seconds. Either decrease the interval or " + "increase the kill interval. Otherwise, the program will not complete any " + "checkpoint.");
}
// Task manager
config.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, numberOfSlotsPerTaskManager);
ActorSystem testActorSystem = null;
LeaderRetrievalService leaderRetrievalService = null;
List<JobManagerProcess> jobManagerProcesses = new ArrayList<>();
List<TaskManagerProcess> taskManagerProcesses = new ArrayList<>();
try {
// Initial state
for (int i = 0; i < numberOfJobManagers; i++) {
jobManagerProcesses.add(createAndStartJobManagerProcess(config));
}
for (int i = 0; i < numberOfTaskManagers; i++) {
taskManagerProcesses.add(createAndStartTaskManagerProcess(config));
}
testActorSystem = AkkaUtils.createDefaultActorSystem();
// Leader listener
leaderRetrievalService = ZooKeeperUtils.createLeaderRetrievalService(config);
TestingListener leaderListener = new TestingListener();
leaderRetrievalService.start(leaderListener);
Deadline deadline = testDuration.fromNow();
// Wait for the new leader
int leaderIndex = waitForNewLeader(leaderListener, jobManagerProcesses, deadline.timeLeft());
// Wait for the task managers to connect
waitForTaskManagers(numberOfTaskManagers, jobManagerProcesses.get(leaderIndex), testActorSystem, deadline.timeLeft());
// The job
JobGraph jobGraph = createJobGraph(n, CheckpointCompletedCoordination.getPath(), ProceedCoordination.getPath(), parallelism, checkpointingIntervalMs);
LOG.info("Submitting job {}", jobGraph.getJobID());
submitJobGraph(jobGraph, jobManagerProcesses.get(leaderIndex), leaderListener, testActorSystem, deadline.timeLeft());
LOG.info("Waiting for a checkpoint to complete before kicking off chaos");
// Wait for a checkpoint to complete
TestJvmProcess.waitForMarkerFiles(FileStateBackendBasePath, COMPLETED_PREFIX, parallelism, deadline.timeLeft().toMillis());
LOG.info("Checkpoint completed... ready for chaos");
int currentKillNumber = 1;
int currentJobManagerKills = 0;
int currentTaskManagerKills = 0;
for (int i = 0; i < totalNumberOfKills; i++) {
LOG.info("Waiting for {} before next kill ({}/{})", killEvery, currentKillNumber++, totalNumberOfKills);
Thread.sleep(killEvery.toMillis());
LOG.info("Checking job status...");
JobStatus jobStatus = requestJobStatus(jobGraph.getJobID(), jobManagerProcesses.get(leaderIndex), testActorSystem, deadline.timeLeft());
if (jobStatus != JobStatus.RUNNING && jobStatus != JobStatus.FINISHED) {
// Wait for it to run
LOG.info("Waiting for job status {}", JobStatus.RUNNING);
waitForJobRunning(jobGraph.getJobID(), jobManagerProcesses.get(leaderIndex), testActorSystem, deadline.timeLeft());
} else if (jobStatus == JobStatus.FINISHED) {
// Early finish
LOG.info("Job finished");
return;
} else {
LOG.info("Job status is {}", jobStatus);
}
if (rand.nextBoolean()) {
LOG.info("Killing the leading JobManager");
JobManagerProcess newJobManager = createAndStartJobManagerProcess(config);
JobManagerProcess leader = jobManagerProcesses.remove(leaderIndex);
leader.destroy();
currentJobManagerKills++;
LOG.info("Killed {}", leader);
// Make sure to add the new job manager before looking for a new leader
jobManagerProcesses.add(newJobManager);
// Wait for the new leader
leaderIndex = waitForNewLeader(leaderListener, jobManagerProcesses, deadline.timeLeft());
// Wait for the task managers to connect
waitForTaskManagers(numberOfTaskManagers, jobManagerProcesses.get(leaderIndex), testActorSystem, deadline.timeLeft());
} else {
LOG.info("Killing a random TaskManager");
TaskManagerProcess newTaskManager = createAndStartTaskManagerProcess(config);
// Wait for this new task manager to be connected
waitForTaskManagers(numberOfTaskManagers + 1, jobManagerProcesses.get(leaderIndex), testActorSystem, deadline.timeLeft());
// Now it's safe to kill a process
int next = rand.nextInt(numberOfTaskManagers);
TaskManagerProcess taskManager = taskManagerProcesses.remove(next);
LOG.info("{} has been chosen. Killing process...", taskManager);
taskManager.destroy();
currentTaskManagerKills++;
// Add the new task manager after killing an old one
taskManagerProcesses.add(newTaskManager);
}
}
LOG.info("Chaos is over. Total kills: {} ({} job manager + {} task managers). " + "Checking job status...", totalNumberOfKills, currentJobManagerKills, currentTaskManagerKills);
// Signal the job to speed up (if it is not done yet)
TestJvmProcess.touchFile(ProceedCoordination);
// Wait for the job to finish
LOG.info("Waiting for job status {}", JobStatus.FINISHED);
waitForJobFinished(jobGraph.getJobID(), jobManagerProcesses.get(leaderIndex), testActorSystem, deadline.timeLeft());
LOG.info("Job finished");
LOG.info("Waiting for job removal");
waitForJobRemoved(jobGraph.getJobID(), jobManagerProcesses.get(leaderIndex), testActorSystem, deadline.timeLeft());
LOG.info("Job removed");
LOG.info("Checking clean recovery state...");
checkCleanRecoveryState(config);
LOG.info("Recovery state clean");
} catch (Throwable t) {
// Print early (in some situations the process logs get too big
// for Travis and the root problem is not shown)
t.printStackTrace();
System.out.println("#################################################");
System.out.println(" TASK MANAGERS");
System.out.println("#################################################");
for (TaskManagerProcess taskManagerProcess : taskManagerProcesses) {
taskManagerProcess.printProcessLog();
}
System.out.println("#################################################");
System.out.println(" JOB MANAGERS");
System.out.println("#################################################");
for (JobManagerProcess jobManagerProcess : jobManagerProcesses) {
jobManagerProcess.printProcessLog();
}
throw t;
} finally {
for (JobManagerProcess jobManagerProcess : jobManagerProcesses) {
if (jobManagerProcess != null) {
jobManagerProcess.destroy();
}
}
if (leaderRetrievalService != null) {
leaderRetrievalService.stop();
}
if (testActorSystem != null) {
testActorSystem.shutdown();
}
}
}
Aggregations