use of org.apache.flink.runtime.testutils.TaskManagerProcess in project flink by apache.
the class ChaosMonkeyITCase method createAndStartTaskManagerProcess.
private TaskManagerProcess createAndStartTaskManagerProcess(Configuration config) throws Exception {
TaskManagerProcess taskManager = new TaskManagerProcess(taskManagerPid++, config);
taskManager.startProcess();
LOG.info("Created and started {}.", taskManager);
return taskManager;
}
use of org.apache.flink.runtime.testutils.TaskManagerProcess in project flink by apache.
the class ChaosMonkeyITCase method testChaosMonkey.
@Test
public void testChaosMonkey() throws Exception {
// Test config
final int numberOfJobManagers = 3;
final int numberOfTaskManagers = 3;
final int numberOfSlotsPerTaskManager = 2;
// The final count each source is counting to: 1...n
final int n = 5000;
// Parallelism for the program
final int parallelism = numberOfTaskManagers * numberOfSlotsPerTaskManager;
// The test should not run longer than this
final FiniteDuration testDuration = new FiniteDuration(10, TimeUnit.MINUTES);
// Every x seconds a random job or task manager is killed
//
// The job will will be running for $killEvery seconds and then a random Job/TaskManager
// will be killed. On recovery (which takes some time to bring up the new process etc.),
// this test will wait for task managers to reconnect before starting the next count down.
// Therefore the delay between retries is not important in this setup.
final FiniteDuration killEvery = new FiniteDuration(5, TimeUnit.SECONDS);
// Trigger a checkpoint every
final int checkpointingIntervalMs = 1000;
// Total number of kills
final int totalNumberOfKills = 10;
// -----------------------------------------------------------------------------------------
// Setup
Configuration config = ZooKeeperTestUtils.createZooKeeperHAConfig(ZooKeeper.getConnectString(), FileStateBackendBasePath.toURI().toString());
// Akka and restart timeouts
config.setString(ConfigConstants.AKKA_WATCH_HEARTBEAT_INTERVAL, "1000 ms");
config.setString(ConfigConstants.AKKA_WATCH_HEARTBEAT_PAUSE, "6 s");
config.setInteger(ConfigConstants.AKKA_WATCH_THRESHOLD, 9);
if (checkpointingIntervalMs >= killEvery.toMillis()) {
throw new IllegalArgumentException("Relax! You want to kill processes every " + killEvery + ", but the checkpointing interval is " + checkpointingIntervalMs / 1000 + " seconds. Either decrease the interval or " + "increase the kill interval. Otherwise, the program will not complete any " + "checkpoint.");
}
// Task manager
config.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, numberOfSlotsPerTaskManager);
ActorSystem testActorSystem = null;
LeaderRetrievalService leaderRetrievalService = null;
List<JobManagerProcess> jobManagerProcesses = new ArrayList<>();
List<TaskManagerProcess> taskManagerProcesses = new ArrayList<>();
try {
// Initial state
for (int i = 0; i < numberOfJobManagers; i++) {
jobManagerProcesses.add(createAndStartJobManagerProcess(config));
}
for (int i = 0; i < numberOfTaskManagers; i++) {
taskManagerProcesses.add(createAndStartTaskManagerProcess(config));
}
testActorSystem = AkkaUtils.createDefaultActorSystem();
// Leader listener
leaderRetrievalService = ZooKeeperUtils.createLeaderRetrievalService(config);
TestingListener leaderListener = new TestingListener();
leaderRetrievalService.start(leaderListener);
Deadline deadline = testDuration.fromNow();
// Wait for the new leader
int leaderIndex = waitForNewLeader(leaderListener, jobManagerProcesses, deadline.timeLeft());
// Wait for the task managers to connect
waitForTaskManagers(numberOfTaskManagers, jobManagerProcesses.get(leaderIndex), testActorSystem, deadline.timeLeft());
// The job
JobGraph jobGraph = createJobGraph(n, CheckpointCompletedCoordination.getPath(), ProceedCoordination.getPath(), parallelism, checkpointingIntervalMs);
LOG.info("Submitting job {}", jobGraph.getJobID());
submitJobGraph(jobGraph, jobManagerProcesses.get(leaderIndex), leaderListener, testActorSystem, deadline.timeLeft());
LOG.info("Waiting for a checkpoint to complete before kicking off chaos");
// Wait for a checkpoint to complete
TestJvmProcess.waitForMarkerFiles(FileStateBackendBasePath, COMPLETED_PREFIX, parallelism, deadline.timeLeft().toMillis());
LOG.info("Checkpoint completed... ready for chaos");
int currentKillNumber = 1;
int currentJobManagerKills = 0;
int currentTaskManagerKills = 0;
for (int i = 0; i < totalNumberOfKills; i++) {
LOG.info("Waiting for {} before next kill ({}/{})", killEvery, currentKillNumber++, totalNumberOfKills);
Thread.sleep(killEvery.toMillis());
LOG.info("Checking job status...");
JobStatus jobStatus = requestJobStatus(jobGraph.getJobID(), jobManagerProcesses.get(leaderIndex), testActorSystem, deadline.timeLeft());
if (jobStatus != JobStatus.RUNNING && jobStatus != JobStatus.FINISHED) {
// Wait for it to run
LOG.info("Waiting for job status {}", JobStatus.RUNNING);
waitForJobRunning(jobGraph.getJobID(), jobManagerProcesses.get(leaderIndex), testActorSystem, deadline.timeLeft());
} else if (jobStatus == JobStatus.FINISHED) {
// Early finish
LOG.info("Job finished");
return;
} else {
LOG.info("Job status is {}", jobStatus);
}
if (rand.nextBoolean()) {
LOG.info("Killing the leading JobManager");
JobManagerProcess newJobManager = createAndStartJobManagerProcess(config);
JobManagerProcess leader = jobManagerProcesses.remove(leaderIndex);
leader.destroy();
currentJobManagerKills++;
LOG.info("Killed {}", leader);
// Make sure to add the new job manager before looking for a new leader
jobManagerProcesses.add(newJobManager);
// Wait for the new leader
leaderIndex = waitForNewLeader(leaderListener, jobManagerProcesses, deadline.timeLeft());
// Wait for the task managers to connect
waitForTaskManagers(numberOfTaskManagers, jobManagerProcesses.get(leaderIndex), testActorSystem, deadline.timeLeft());
} else {
LOG.info("Killing a random TaskManager");
TaskManagerProcess newTaskManager = createAndStartTaskManagerProcess(config);
// Wait for this new task manager to be connected
waitForTaskManagers(numberOfTaskManagers + 1, jobManagerProcesses.get(leaderIndex), testActorSystem, deadline.timeLeft());
// Now it's safe to kill a process
int next = rand.nextInt(numberOfTaskManagers);
TaskManagerProcess taskManager = taskManagerProcesses.remove(next);
LOG.info("{} has been chosen. Killing process...", taskManager);
taskManager.destroy();
currentTaskManagerKills++;
// Add the new task manager after killing an old one
taskManagerProcesses.add(newTaskManager);
}
}
LOG.info("Chaos is over. Total kills: {} ({} job manager + {} task managers). " + "Checking job status...", totalNumberOfKills, currentJobManagerKills, currentTaskManagerKills);
// Signal the job to speed up (if it is not done yet)
TestJvmProcess.touchFile(ProceedCoordination);
// Wait for the job to finish
LOG.info("Waiting for job status {}", JobStatus.FINISHED);
waitForJobFinished(jobGraph.getJobID(), jobManagerProcesses.get(leaderIndex), testActorSystem, deadline.timeLeft());
LOG.info("Job finished");
LOG.info("Waiting for job removal");
waitForJobRemoved(jobGraph.getJobID(), jobManagerProcesses.get(leaderIndex), testActorSystem, deadline.timeLeft());
LOG.info("Job removed");
LOG.info("Checking clean recovery state...");
checkCleanRecoveryState(config);
LOG.info("Recovery state clean");
} catch (Throwable t) {
// Print early (in some situations the process logs get too big
// for Travis and the root problem is not shown)
t.printStackTrace();
System.out.println("#################################################");
System.out.println(" TASK MANAGERS");
System.out.println("#################################################");
for (TaskManagerProcess taskManagerProcess : taskManagerProcesses) {
taskManagerProcess.printProcessLog();
}
System.out.println("#################################################");
System.out.println(" JOB MANAGERS");
System.out.println("#################################################");
for (JobManagerProcess jobManagerProcess : jobManagerProcesses) {
jobManagerProcess.printProcessLog();
}
throw t;
} finally {
for (JobManagerProcess jobManagerProcess : jobManagerProcesses) {
if (jobManagerProcess != null) {
jobManagerProcess.destroy();
}
}
if (leaderRetrievalService != null) {
leaderRetrievalService.stop();
}
if (testActorSystem != null) {
testActorSystem.shutdown();
}
}
}
Aggregations