use of akka.actor.ActorSystem in project flink by apache.
the class JobManagerHACheckpointRecoveryITCase method testCheckpointRecoveryFailure.
/**
* Tests that the JobManager logs failures during recovery properly.
*
* <p>Two JobManager processes and one TaskManager are started, a blocking job is submitted to
* the leader in detached mode, the file state backend directory is deleted, and the leading
* JobManager process is killed. The test then polls the output of the other JobManager process
* until it contains both "Failed to recover job" and "java.io.FileNotFoundException", and fails
* if that output does not show up before the test deadline.
*
* @throws Exception any failure in the test body; it is rethrown after the stack trace and the
*     JobManager process logs have been printed
* @see <a href="https://issues.apache.org/jira/browse/FLINK-3185">FLINK-3185</a>
*/
@Test
@RetryOnFailure(times = 1)
public void testCheckpointRecoveryFailure() throws Exception {
// A single deadline bounds every blocking wait in this test.
final Deadline testDeadline = TestTimeOut.fromNow();
final String zooKeeperQuorum = ZooKeeper.getConnectString();
final String fileStateBackendPath = FileStateBackendBasePath.getAbsoluteFile().toString();
Configuration config = ZooKeeperTestUtils.createZooKeeperHAConfig(zooKeeperQuorum, fileStateBackendPath);
// Matches the two JobManager processes started below.
config.setInteger(ConfigConstants.LOCAL_NUMBER_JOB_MANAGER, 2);
// Everything that needs cleanup is declared outside the try block so that the finally
// block can release whatever was created before a failure.
JobManagerProcess[] jobManagerProcess = new JobManagerProcess[2];
LeaderRetrievalService leaderRetrievalService = null;
ActorSystem taskManagerSystem = null;
ActorSystem testActorSystem = null;
try {
// Test actor system
// NOTE(review): port 0 presumably lets the system pick a free port - confirm.
testActorSystem = AkkaUtils.createActorSystem(new Configuration(), new Some<>(new Tuple2<String, Object>("localhost", 0)));
// The job managers
jobManagerProcess[0] = new JobManagerProcess(0, config);
jobManagerProcess[1] = new JobManagerProcess(1, config);
jobManagerProcess[0].startProcess();
jobManagerProcess[1].startProcess();
// Leader listener
TestingListener leaderListener = new TestingListener();
leaderRetrievalService = ZooKeeperUtils.createLeaderRetrievalService(config);
leaderRetrievalService.start(leaderListener);
// The task manager
taskManagerSystem = AkkaUtils.createActorSystem(config, Option.apply(new Tuple2<String, Object>("localhost", 0)));
TaskManager.startTaskManagerComponentsAndActor(config, ResourceID.generate(), taskManagerSystem, "localhost", Option.<String>empty(), Option.<LeaderRetrievalService>empty(), false, TaskManager.class);
// Get the leader
leaderListener.waitForNewLeader(testDeadline.timeLeft().toMillis());
String leaderAddress = leaderListener.getAddress();
UUID leaderId = leaderListener.getLeaderSessionID();
// Get the leader ref
ActorRef leaderRef = AkkaUtils.getActorRef(leaderAddress, testActorSystem, testDeadline.timeLeft());
ActorGateway leader = new AkkaActorGateway(leaderRef, leaderId);
// Who's the boss?
// The process whose Akka URL equals the announced leader address is the leader; the
// other one is the process whose output is inspected after the kill.
JobManagerProcess leadingJobManagerProcess;
JobManagerProcess nonLeadingJobManagerProcess;
if (jobManagerProcess[0].getJobManagerAkkaURL(testDeadline.timeLeft()).equals(leaderListener.getAddress())) {
leadingJobManagerProcess = jobManagerProcess[0];
nonLeadingJobManagerProcess = jobManagerProcess[1];
} else {
leadingJobManagerProcess = jobManagerProcess[1];
nonLeadingJobManagerProcess = jobManagerProcess[0];
}
// Blocking JobGraph
JobVertex blockingVertex = new JobVertex("Blocking vertex");
blockingVertex.setInvokableClass(BlockingNoOpInvokable.class);
JobGraph jobGraph = new JobGraph(blockingVertex);
// Submit the job in detached mode
leader.tell(new SubmitJob(jobGraph, ListeningBehaviour.DETACHED));
// Wait for the job to be running
JobManagerActorTestUtils.waitForJobStatus(jobGraph.getJobID(), JobStatus.RUNNING, leader, testDeadline.timeLeft());
// Remove all files
// This deletes the directory that was handed to the HA configuration as the file state
// backend path; the log check below expects a FileNotFoundException as a consequence.
FileUtils.deleteDirectory(FileStateBackendBasePath);
// Kill the leader
leadingJobManagerProcess.destroy();
// Verify that the job manager logs the failed recovery. We can not
// do more at this point. :(
boolean success = false;
// Poll the other process's output every 500 ms until both markers appear or the
// deadline runs out.
while (testDeadline.hasTimeLeft()) {
String output = nonLeadingJobManagerProcess.getProcessOutput();
if (output != null) {
if (output.contains("Failed to recover job") && output.contains("java.io.FileNotFoundException")) {
success = true;
break;
}
} else {
log.warn("No process output available.");
}
Thread.sleep(500);
}
assertTrue("Did not find expected output in logs.", success);
} catch (Throwable t) {
// Print early (in some situations the process logs get too big
// for Travis and the root problem is not shown)
t.printStackTrace();
// In case of an error, print the job manager process logs.
if (jobManagerProcess[0] != null) {
jobManagerProcess[0].printProcessLog();
}
if (jobManagerProcess[1] != null) {
jobManagerProcess[1].printProcessLog();
}
throw t;
} finally {
// Release everything that was started; entries are null if the test failed before
// creating them.
if (jobManagerProcess[0] != null) {
jobManagerProcess[0].destroy();
}
if (jobManagerProcess[1] != null) {
jobManagerProcess[1].destroy();
}
if (leaderRetrievalService != null) {
leaderRetrievalService.stop();
}
if (taskManagerSystem != null) {
taskManagerSystem.shutdown();
}
if (testActorSystem != null) {
testActorSystem.shutdown();
}
}
}
use of akka.actor.ActorSystem in project flink by apache.
the class JobManagerHAJobGraphRecoveryITCase method testClientNonDetachedListeningBehaviour.
/**
* Tests that clients receive updates after recovery by a new leader.
*
* <p>A blocking job is submitted in non-detached mode (execution result and state changes) by a
* recording test client. Once the job is running, the leading JobManager process is killed. The
* test waits for the new leader to report the job as running again, cancels the job, waits for
* the client to receive the job result, and finally counts the {@code JobSubmitSuccess}
* messages the client recorded.
*
* @throws Exception any failure in the test body; it is rethrown after the stack trace and the
*     JobManager process logs have been printed
*/
@Test
public void testClientNonDetachedListeningBehaviour() throws Exception {
Configuration config = ZooKeeperTestUtils.createZooKeeperHAConfig(ZooKeeper.getConnectString(), FileStateBackendBasePath.getPath());
// Test actor system
ActorSystem testSystem = null;
// JobManager setup. Start the job managers as separate processes in order to not run the
// actors postStop, which cleans up all running jobs.
JobManagerProcess[] jobManagerProcess = new JobManagerProcess[2];
LeaderRetrievalService leaderRetrievalService = null;
ActorSystem taskManagerSystem = null;
try {
// A single deadline bounds every blocking wait in this test.
final Deadline deadline = TestTimeOut.fromNow();
// Test actor system
testSystem = AkkaUtils.createActorSystem(new Configuration(), new Some<>(new Tuple2<String, Object>("localhost", 0)));
// The job managers
jobManagerProcess[0] = new JobManagerProcess(0, config);
jobManagerProcess[1] = new JobManagerProcess(1, config);
jobManagerProcess[0].startProcess();
jobManagerProcess[1].startProcess();
// Leader listener
TestingListener leaderListener = new TestingListener();
leaderRetrievalService = ZooKeeperUtils.createLeaderRetrievalService(config);
leaderRetrievalService.start(leaderListener);
// The task manager
taskManagerSystem = AkkaUtils.createActorSystem(AkkaUtils.getDefaultAkkaConfig());
TaskManager.startTaskManagerComponentsAndActor(config, ResourceID.generate(), taskManagerSystem, "localhost", Option.<String>empty(), Option.<LeaderRetrievalService>empty(), false, TaskManager.class);
// Client test actor
// It lives in the test actor system, which is not shut down when a JobManager process
// is killed, so the same client is used before and after the leader change.
TestActorRef<RecordingTestClient> clientRef = TestActorRef.create(testSystem, Props.create(RecordingTestClient.class));
JobGraph jobGraph = createBlockingJobGraph();
{
// Initial submission
leaderListener.waitForNewLeader(deadline.timeLeft().toMillis());
String leaderAddress = leaderListener.getAddress();
UUID leaderId = leaderListener.getLeaderSessionID();
// The client
AkkaActorGateway client = new AkkaActorGateway(clientRef, leaderId);
// Get the leader ref
ActorRef leaderRef = AkkaUtils.getActorRef(leaderAddress, testSystem, deadline.timeLeft());
ActorGateway leader = new AkkaActorGateway(leaderRef, leaderId);
// Keep asking the leader until it reports a non-zero number of slots. There is no
// pause between requests; each request is bounded by the remaining deadline.
int numSlots = 0;
while (numSlots == 0) {
Future<?> slotsFuture = leader.ask(JobManagerMessages.getRequestTotalNumberOfSlots(), deadline.timeLeft());
numSlots = (Integer) Await.result(slotsFuture, deadline.timeLeft());
}
// Submit the job in non-detached mode
leader.tell(new SubmitJob(jobGraph, ListeningBehaviour.EXECUTION_RESULT_AND_STATE_CHANGES), client);
JobManagerActorTestUtils.waitForJobStatus(jobGraph.getJobID(), JobStatus.RUNNING, leader, deadline.timeLeft());
}
// Who's the boss?
// The process whose Akka URL equals the announced leader address is the leader.
JobManagerProcess leadingJobManagerProcess;
if (jobManagerProcess[0].getJobManagerAkkaURL(deadline.timeLeft()).equals(leaderListener.getAddress())) {
leadingJobManagerProcess = jobManagerProcess[0];
} else {
leadingJobManagerProcess = jobManagerProcess[1];
}
// Kill the leading job manager process
leadingJobManagerProcess.destroy();
{
// Recovery by the standby JobManager
leaderListener.waitForNewLeader(deadline.timeLeft().toMillis());
String leaderAddress = leaderListener.getAddress();
UUID leaderId = leaderListener.getLeaderSessionID();
ActorRef leaderRef = AkkaUtils.getActorRef(leaderAddress, testSystem, deadline.timeLeft());
ActorGateway leader = new AkkaActorGateway(leaderRef, leaderId);
// The job was not resubmitted by this test, so reaching RUNNING on the new leader
// means the new leader picked it up on its own.
JobManagerActorTestUtils.waitForJobStatus(jobGraph.getJobID(), JobStatus.RUNNING, leader, deadline.timeLeft());
// Cancel the job
leader.tell(new JobManagerMessages.CancelJob(jobGraph.getJobID()));
}
// Wait for the execution result
clientRef.underlyingActor().awaitJobResult(deadline.timeLeft().toMillis());
int jobSubmitSuccessMessages = 0;
for (Object msg : clientRef.underlyingActor().getMessages()) {
if (msg instanceof JobManagerMessages.JobSubmitSuccess) {
jobSubmitSuccessMessages++;
}
}
// At least two submissions should be ack-ed (initial and recovery). This is quite
// conservative, but it is still possible that these messages are overtaken by the
// final message.
// NOTE(review): the comment above says "at least two", but the assertion below requires
// exactly two - confirm which one is intended.
assertEquals(2, jobSubmitSuccessMessages);
} catch (Throwable t) {
// Print early (in some situations the process logs get too big
// for Travis and the root problem is not shown)
t.printStackTrace();
// In case of an error, print the job manager process logs.
if (jobManagerProcess[0] != null) {
jobManagerProcess[0].printProcessLog();
}
if (jobManagerProcess[1] != null) {
jobManagerProcess[1].printProcessLog();
}
throw t;
} finally {
// Release everything that was started; entries are null if the test failed before
// creating them.
if (jobManagerProcess[0] != null) {
jobManagerProcess[0].destroy();
}
if (jobManagerProcess[1] != null) {
jobManagerProcess[1].destroy();
}
if (leaderRetrievalService != null) {
leaderRetrievalService.stop();
}
if (taskManagerSystem != null) {
taskManagerSystem.shutdown();
}
if (testSystem != null) {
testSystem.shutdown();
}
}
}
use of akka.actor.ActorSystem in project flink by apache.
the class JobManagerHAProcessFailureBatchRecoveryITCase method testJobManagerProcessFailure.
/**
 * Tests that a program keeps running and finishes when the JobManager process it was submitted
 * to is killed and a second JobManager process is started afterwards.
 *
 * <p>The test starts one JobManager process and two TaskManagers, triggers the test program in
 * a separate thread, waits for the ready marker files of all parallel tasks, kills the first
 * JobManager process, starts a second one, and then creates the proceed marker file so that the
 * tasks may complete. Coordination with the tasks goes through marker files in a temporary
 * directory.
 *
 * @throws Exception any failure in the test body; it is rethrown after the stack trace and the
 *     JobManager process logs have been printed
 */
@Test
public void testJobManagerProcessFailure() throws Exception {
    // Config
    final int numberOfJobManagers = 2;
    final int numberOfTaskManagers = 2;
    final int numberOfSlotsPerTaskManager = 2;
    assertEquals(PARALLELISM, numberOfTaskManagers * numberOfSlotsPerTaskManager);

    // Setup
    // Test actor system. Initialised to null so that the finally block can tell whether it
    // was created and shut it down; it used to be leaked.
    ActorSystem testActorSystem = null;
    // Job managers
    final JobManagerProcess[] jmProcess = new JobManagerProcess[numberOfJobManagers];
    // Task managers
    final ActorSystem[] tmActorSystem = new ActorSystem[numberOfTaskManagers];
    // Leader election service
    LeaderRetrievalService leaderRetrievalService = null;
    // Coordination between the processes goes through a directory
    File coordinateTempDir = null;

    try {
        // A single deadline bounds every blocking wait in this test.
        final Deadline deadline = TestTimeOut.fromNow();

        // Coordination directory
        coordinateTempDir = createTempDirectory();

        // Job Managers
        Configuration config = ZooKeeperTestUtils.createZooKeeperHAConfig(
                ZooKeeper.getConnectString(), FileStateBackendBasePath.getPath());

        // Start first process
        jmProcess[0] = new JobManagerProcess(0, config);
        jmProcess[0].startProcess();

        // Task manager configuration
        config.setInteger(ConfigConstants.TASK_MANAGER_MEMORY_SIZE_KEY, 4);
        config.setInteger(ConfigConstants.TASK_MANAGER_NETWORK_NUM_BUFFERS_KEY, 100);
        config.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, 2);

        // Start the task managers, each in its own actor system
        for (int i = 0; i < numberOfTaskManagers; i++) {
            tmActorSystem[i] = AkkaUtils.createActorSystem(AkkaUtils.getDefaultAkkaConfig());
            TaskManager.startTaskManagerComponentsAndActor(
                    config,
                    ResourceID.generate(),
                    tmActorSystem[i],
                    "localhost",
                    Option.<String>empty(),
                    Option.<LeaderRetrievalService>empty(),
                    false,
                    TaskManager.class);
        }

        // Test actor system
        testActorSystem = AkkaUtils.createActorSystem(AkkaUtils.getDefaultAkkaConfig());

        // The returned reference is not used; the call is made for its waiting effect only.
        jmProcess[0].getActorRef(testActorSystem, deadline.timeLeft());

        // Leader listener
        TestingListener leaderListener = new TestingListener();
        leaderRetrievalService = ZooKeeperUtils.createLeaderRetrievalService(config);
        leaderRetrievalService.start(leaderListener);

        // Initial submission
        leaderListener.waitForNewLeader(deadline.timeLeft().toMillis());
        String leaderAddress = leaderListener.getAddress();
        UUID leaderId = leaderListener.getLeaderSessionID();

        // Get the leader ref
        ActorRef leaderRef = AkkaUtils.getActorRef(leaderAddress, testActorSystem, deadline.timeLeft());
        ActorGateway leaderGateway = new AkkaActorGateway(leaderRef, leaderId);

        // Wait for all task managers to connect to the leading job manager
        JobManagerActorTestUtils.waitForTaskManagers(numberOfTaskManagers, leaderGateway, deadline.timeLeft());

        final File coordinateDirClosure = coordinateTempDir;
        // Single-element array so that the trigger thread can hand an error back to this thread.
        final Throwable[] errorRef = new Throwable[1];

        // we trigger program execution in a separate thread
        Thread programTrigger = new Thread("Program Trigger") {
            @Override
            public void run() {
                try {
                    testJobManagerFailure(ZooKeeper.getConnectString(), coordinateDirClosure);
                } catch (Throwable t) {
                    t.printStackTrace();
                    errorRef[0] = t;
                }
            }
        };

        // start the test program
        programTrigger.start();

        // wait until all marker files are in place, indicating that all tasks have started
        AbstractTaskManagerProcessFailureRecoveryTest.waitForMarkerFiles(
                coordinateTempDir, READY_MARKER_FILE_PREFIX, PARALLELISM, deadline.timeLeft().toMillis());

        // Kill one of the job managers and trigger recovery
        jmProcess[0].destroy();

        jmProcess[1] = new JobManagerProcess(1, config);
        jmProcess[1].startProcess();
        jmProcess[1].getActorRef(testActorSystem, deadline.timeLeft());

        // we create the marker file which signals the program functions tasks that they can complete
        AbstractTaskManagerProcessFailureRecoveryTest.touchFile(new File(coordinateTempDir, PROCEED_MARKER_FILE));

        programTrigger.join(deadline.timeLeft().toMillis());

        // We wait for the finish marker file. We don't wait for the program trigger, because
        // we submit in detached mode.
        AbstractTaskManagerProcessFailureRecoveryTest.waitForMarkerFiles(
                coordinateTempDir, FINISH_MARKER_FILE_PREFIX, 1, deadline.timeLeft().toMillis());

        // check that the program really finished
        assertFalse("The program did not finish in time", programTrigger.isAlive());

        // check whether the program encountered an error
        if (errorRef[0] != null) {
            Throwable error = errorRef[0];
            error.printStackTrace();
            fail("The program encountered a " + error.getClass().getSimpleName() + " : " + error.getMessage());
        }
    } catch (Throwable t) {
        // Print early (in some situations the process logs get too big
        // for Travis and the root problem is not shown)
        t.printStackTrace();

        for (JobManagerProcess p : jmProcess) {
            if (p != null) {
                p.printProcessLog();
            }
        }

        throw t;
    } finally {
        for (int i = 0; i < numberOfTaskManagers; i++) {
            if (tmActorSystem[i] != null) {
                tmActorSystem[i].shutdown();
            }
        }

        // Shut down the test actor system as well, like the sibling HA tests do.
        if (testActorSystem != null) {
            testActorSystem.shutdown();
        }

        if (leaderRetrievalService != null) {
            leaderRetrievalService.stop();
        }

        for (JobManagerProcess jmProces : jmProcess) {
            if (jmProces != null) {
                jmProces.destroy();
            }
        }

        // Delete coordination directory
        if (coordinateTempDir != null) {
            try {
                FileUtils.deleteDirectory(coordinateTempDir);
            } catch (Throwable ignored) {
                // Best-effort cleanup: a leftover temp directory must not fail the test.
            }
        }
    }
}
use of akka.actor.ActorSystem in project flink by apache.
the class ChaosMonkeyITCase method testChaosMonkey.
/**
 * Runs a checkpointed job while repeatedly killing either the leading JobManager process or a
 * random TaskManager process, and checks that the job still finishes, is removed, and leaves a
 * clean recovery state.
 *
 * <p>Before every kill a replacement process is started, so the number of JobManager and
 * TaskManager processes stays constant over the test. If the job is already finished when its
 * status is checked before a kill, the test returns early.
 *
 * @throws Exception any failure in the test body; it is rethrown after the stack trace and the
 *     TaskManager and JobManager process logs have been printed
 */
@Test
public void testChaosMonkey() throws Exception {
    // Test config
    final int numberOfJobManagers = 3;
    final int numberOfTaskManagers = 3;
    final int numberOfSlotsPerTaskManager = 2;

    // The final count each source is counting to: 1...n
    final int n = 5000;

    // Parallelism for the program
    final int parallelism = numberOfTaskManagers * numberOfSlotsPerTaskManager;

    // The test should not run longer than this
    final FiniteDuration testDuration = new FiniteDuration(10, TimeUnit.MINUTES);

    // Every x seconds a random job or task manager is killed
    //
    // The job will be running for $killEvery seconds and then a random Job/TaskManager
    // will be killed. On recovery (which takes some time to bring up the new process etc.),
    // this test will wait for task managers to reconnect before starting the next count down.
    // Therefore the delay between retries is not important in this setup.
    final FiniteDuration killEvery = new FiniteDuration(5, TimeUnit.SECONDS);

    // Trigger a checkpoint every
    final int checkpointingIntervalMs = 1000;

    // Total number of kills
    final int totalNumberOfKills = 10;

    // -----------------------------------------------------------------------------------------

    // Setup
    Configuration config = ZooKeeperTestUtils.createZooKeeperHAConfig(
            ZooKeeper.getConnectString(), FileStateBackendBasePath.toURI().toString());

    // Akka and restart timeouts
    config.setString(ConfigConstants.AKKA_WATCH_HEARTBEAT_INTERVAL, "1000 ms");
    config.setString(ConfigConstants.AKKA_WATCH_HEARTBEAT_PAUSE, "6 s");
    config.setInteger(ConfigConstants.AKKA_WATCH_THRESHOLD, 9);

    // Sanity check of the constants above: a kill interval that is not longer than the
    // checkpointing interval would never let a checkpoint complete.
    if (checkpointingIntervalMs >= killEvery.toMillis()) {
        throw new IllegalArgumentException("Relax! You want to kill processes every " + killEvery + ", but the checkpointing interval is " + checkpointingIntervalMs / 1000 + " seconds. Either decrease the interval or " + "increase the kill interval. Otherwise, the program will not complete any " + "checkpoint.");
    }

    // Task manager
    config.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, numberOfSlotsPerTaskManager);

    ActorSystem testActorSystem = null;
    LeaderRetrievalService leaderRetrievalService = null;
    List<JobManagerProcess> jobManagerProcesses = new ArrayList<>();
    List<TaskManagerProcess> taskManagerProcesses = new ArrayList<>();

    try {
        // Initial state
        for (int i = 0; i < numberOfJobManagers; i++) {
            jobManagerProcesses.add(createAndStartJobManagerProcess(config));
        }

        for (int i = 0; i < numberOfTaskManagers; i++) {
            taskManagerProcesses.add(createAndStartTaskManagerProcess(config));
        }

        testActorSystem = AkkaUtils.createDefaultActorSystem();

        // Leader listener
        leaderRetrievalService = ZooKeeperUtils.createLeaderRetrievalService(config);
        TestingListener leaderListener = new TestingListener();
        leaderRetrievalService.start(leaderListener);

        // A single deadline bounds every blocking wait from here on.
        Deadline deadline = testDuration.fromNow();

        // Wait for the new leader
        int leaderIndex = waitForNewLeader(leaderListener, jobManagerProcesses, deadline.timeLeft());

        // Wait for the task managers to connect
        waitForTaskManagers(numberOfTaskManagers, jobManagerProcesses.get(leaderIndex), testActorSystem, deadline.timeLeft());

        // The job
        JobGraph jobGraph = createJobGraph(n, CheckpointCompletedCoordination.getPath(), ProceedCoordination.getPath(), parallelism, checkpointingIntervalMs);

        LOG.info("Submitting job {}", jobGraph.getJobID());
        submitJobGraph(jobGraph, jobManagerProcesses.get(leaderIndex), leaderListener, testActorSystem, deadline.timeLeft());

        LOG.info("Waiting for a checkpoint to complete before kicking off chaos");

        // Wait for a checkpoint to complete
        TestJvmProcess.waitForMarkerFiles(FileStateBackendBasePath, COMPLETED_PREFIX, parallelism, deadline.timeLeft().toMillis());

        LOG.info("Checkpoint completed... ready for chaos");

        int currentKillNumber = 1;
        int currentJobManagerKills = 0;
        int currentTaskManagerKills = 0;

        for (int i = 0; i < totalNumberOfKills; i++) {
            LOG.info("Waiting for {} before next kill ({}/{})", killEvery, currentKillNumber++, totalNumberOfKills);
            Thread.sleep(killEvery.toMillis());

            LOG.info("Checking job status...");

            JobStatus jobStatus = requestJobStatus(jobGraph.getJobID(), jobManagerProcesses.get(leaderIndex), testActorSystem, deadline.timeLeft());

            if (jobStatus != JobStatus.RUNNING && jobStatus != JobStatus.FINISHED) {
                // Wait for it to run
                LOG.info("Waiting for job status {}", JobStatus.RUNNING);
                waitForJobRunning(jobGraph.getJobID(), jobManagerProcesses.get(leaderIndex), testActorSystem, deadline.timeLeft());
            } else if (jobStatus == JobStatus.FINISHED) {
                // Early finish. The finally block still runs and cleans up all processes.
                LOG.info("Job finished");
                return;
            } else {
                LOG.info("Job status is {}", jobStatus);
            }

            if (rand.nextBoolean()) {
                LOG.info("Killing the leading JobManager");

                // Start the replacement first so that a standby is available.
                JobManagerProcess newJobManager = createAndStartJobManagerProcess(config);

                JobManagerProcess leader = jobManagerProcesses.remove(leaderIndex);
                leader.destroy();
                currentJobManagerKills++;

                LOG.info("Killed {}", leader);

                // Make sure to add the new job manager before looking for a new leader
                jobManagerProcesses.add(newJobManager);

                // Wait for the new leader
                leaderIndex = waitForNewLeader(leaderListener, jobManagerProcesses, deadline.timeLeft());

                // Wait for the task managers to connect
                waitForTaskManagers(numberOfTaskManagers, jobManagerProcesses.get(leaderIndex), testActorSystem, deadline.timeLeft());
            } else {
                LOG.info("Killing a random TaskManager");

                TaskManagerProcess newTaskManager = createAndStartTaskManagerProcess(config);

                // Track the replacement right away so that it is destroyed (and its log is
                // printed) even if waiting for its registration fails. It is appended at index
                // numberOfTaskManagers, so the random pick below, which is limited to
                // [0, numberOfTaskManagers), still only selects one of the older processes.
                taskManagerProcesses.add(newTaskManager);

                // Wait for this new task manager to be connected
                waitForTaskManagers(numberOfTaskManagers + 1, jobManagerProcesses.get(leaderIndex), testActorSystem, deadline.timeLeft());

                // Now it's safe to kill a process
                int next = rand.nextInt(numberOfTaskManagers);
                TaskManagerProcess taskManager = taskManagerProcesses.remove(next);

                LOG.info("{} has been chosen. Killing process...", taskManager);

                taskManager.destroy();
                currentTaskManagerKills++;
            }
        }

        LOG.info("Chaos is over. Total kills: {} ({} job manager + {} task managers). " + "Checking job status...", totalNumberOfKills, currentJobManagerKills, currentTaskManagerKills);

        // Signal the job to speed up (if it is not done yet)
        TestJvmProcess.touchFile(ProceedCoordination);

        // Wait for the job to finish
        LOG.info("Waiting for job status {}", JobStatus.FINISHED);
        waitForJobFinished(jobGraph.getJobID(), jobManagerProcesses.get(leaderIndex), testActorSystem, deadline.timeLeft());

        LOG.info("Job finished");

        LOG.info("Waiting for job removal");
        waitForJobRemoved(jobGraph.getJobID(), jobManagerProcesses.get(leaderIndex), testActorSystem, deadline.timeLeft());
        LOG.info("Job removed");

        LOG.info("Checking clean recovery state...");
        checkCleanRecoveryState(config);
        LOG.info("Recovery state clean");
    } catch (Throwable t) {
        // Print early (in some situations the process logs get too big
        // for Travis and the root problem is not shown)
        t.printStackTrace();

        System.out.println("#################################################");
        System.out.println(" TASK MANAGERS");
        System.out.println("#################################################");

        for (TaskManagerProcess taskManagerProcess : taskManagerProcesses) {
            taskManagerProcess.printProcessLog();
        }

        System.out.println("#################################################");
        System.out.println(" JOB MANAGERS");
        System.out.println("#################################################");

        for (JobManagerProcess jobManagerProcess : jobManagerProcesses) {
            jobManagerProcess.printProcessLog();
        }

        throw t;
    } finally {
        for (JobManagerProcess jobManagerProcess : jobManagerProcesses) {
            if (jobManagerProcess != null) {
                jobManagerProcess.destroy();
            }
        }

        // The task manager processes are started by this test as well and have to be
        // destroyed here; otherwise they outlive the test.
        for (TaskManagerProcess taskManagerProcess : taskManagerProcesses) {
            if (taskManagerProcess != null) {
                taskManagerProcess.destroy();
            }
        }

        if (leaderRetrievalService != null) {
            leaderRetrievalService.stop();
        }

        if (testActorSystem != null) {
            testActorSystem.shutdown();
        }
    }
}
use of akka.actor.ActorSystem in project flink by apache.
the class ZooKeeperLeaderElectionITCase method testJobExecutionOnClusterWithLeaderReelection.
/**
 * Tests that a job can be executed after a new leader has been elected. For all except for the
 * last leader, the job is blocking. The JobManager will be terminated while executing the
 * blocking job. Once only one JobManager is left, it is checked that a non-blocking job can be
 * successfully executed.
 *
 * @throws Exception if the cluster cannot be started, a wait exceeds the deadline, or the job
 *     submission fails
 */
@Test
public void testJobExecutionOnClusterWithLeaderReelection() throws Exception {
    int numJMs = 10;
    int numTMs = 2;
    int numSlotsPerTM = 3;
    int parallelism = numTMs * numSlotsPerTM;

    File rootFolder = tempFolder.getRoot();

    Configuration configuration = ZooKeeperTestUtils.createZooKeeperHAConfig(
            zkServer.getConnectString(), rootFolder.getPath());

    configuration.setInteger(ConfigConstants.LOCAL_NUMBER_JOB_MANAGER, numJMs);
    configuration.setInteger(ConfigConstants.LOCAL_NUMBER_TASK_MANAGER, numTMs);
    configuration.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, numSlotsPerTM);

    // we "effectively" disable the automatic RecoverAllJobs message and send it manually to make
    // sure that all TMs have registered to the JM prior to issuing the RecoverAllJobs message
    configuration.setString(ConfigConstants.AKKA_ASK_TIMEOUT, AkkaUtils.INF_TIMEOUT().toString());

    // Receivers block until the flag is switched off further below.
    Tasks.BlockingOnceReceiver$.MODULE$.blocking_$eq(true);

    JobVertex sender = new JobVertex("sender");
    JobVertex receiver = new JobVertex("receiver");

    sender.setInvokableClass(Tasks.Sender.class);
    receiver.setInvokableClass(Tasks.BlockingOnceReceiver.class);

    sender.setParallelism(parallelism);
    receiver.setParallelism(parallelism);

    receiver.connectNewDataSetAsInput(sender, DistributionPattern.POINTWISE, ResultPartitionType.PIPELINED);

    SlotSharingGroup slotSharingGroup = new SlotSharingGroup();
    sender.setSlotSharingGroup(slotSharingGroup);
    receiver.setSlotSharingGroup(slotSharingGroup);

    final JobGraph graph = new JobGraph("Blocking test job", sender, receiver);

    final TestingCluster cluster = new TestingCluster(configuration);

    ActorSystem clientActorSystem = null;
    Thread thread = null;
    JobSubmitterRunnable jobSubmission = null;

    try {
        cluster.start();

        clientActorSystem = cluster.startJobClientActorSystem(graph.getJobID());

        final ActorSystem clientAS = clientActorSystem;

        // The job is submitted from a separate thread while this thread kills the leaders.
        jobSubmission = new JobSubmitterRunnable(clientAS, cluster, graph);

        thread = new Thread(jobSubmission);
        thread.start();

        Deadline deadline = timeout.$times(3).fromNow();

        // Visit every leader in turn and kill all of them except for the last one
        for (int i = 0; i < numJMs; i++) {
            ActorGateway jm = cluster.getLeaderGateway(deadline.timeLeft());

            cluster.waitForTaskManagersToBeRegisteredAtJobManager(jm.actor());

            // recover all jobs, sent manually
            log.info("Sent recover all jobs manually to job manager {}.", jm.path());
            jm.tell(JobManagerMessages.getRecoverAllJobs());

            if (i < numJMs - 1) {
                Future<Object> future = jm.ask(new WaitForAllVerticesToBeRunningOrFinished(graph.getJobID()), deadline.timeLeft());

                Await.ready(future, deadline.timeLeft());

                cluster.clearLeader();

                // Switch blocking off just before the second-to-last leader is killed, so that
                // the job is no longer blocking under the last leader.
                if (i == numJMs - 2) {
                    Tasks.BlockingOnceReceiver$.MODULE$.blocking_$eq(false);
                }

                log.info("Kill job manager {}.", jm.path());

                jm.tell(TestingJobManagerMessages.getDisablePostStop());
                jm.tell(Kill.getInstance());
            }
        }

        log.info("Waiting for submitter thread to terminate.");

        thread.join(deadline.timeLeft().toMillis());

        log.info("Submitter thread has terminated.");

        if (thread.isAlive()) {
            // The two halves of this message used to be concatenated without a separating
            // space and without the closing parenthesis.
            fail("The job submission thread did not stop (meaning it did not succeed in "
                    + "executing the test job).");
        }

        Await.result(jobSubmission.resultPromise.future(), deadline.timeLeft());
    } finally {
        if (clientActorSystem != null) {
            cluster.shutdownJobClientActorSystem(clientActorSystem);
        }

        // jobSubmission is assigned before thread, so it is non-null whenever thread is.
        if (thread != null && thread.isAlive()) {
            jobSubmission.finished = true;
        }

        cluster.stop();
    }
}
Aggregations