use of org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.WaitForAllVerticesToBeRunningOrFinished in project flink by apache.
the class LeaderChangeStateCleanupTest method testReelectionOfSameJobManager.
/**
 * Tests that the same JobManager can be reelected as the leader. Even though the same JM
 * is elected as the next leader, all currently running jobs should be canceled properly and
 * all TMs should disconnect from the leader and then reconnect to it.
 */
@Test
public void testReelectionOfSameJobManager() throws Exception {
    UUID leaderSessionID = UUID.randomUUID();
    UUID newLeaderSessionID = UUID.randomUUID();
    FiniteDuration shortTimeout = new FiniteDuration(10, TimeUnit.SECONDS);

    cluster.grantLeadership(0, leaderSessionID);
    cluster.notifyRetrievalListeners(0, leaderSessionID);
    cluster.waitForTaskManagersToBeRegistered(timeout);

    // submit blocking job
    cluster.submitJobDetached(job);

    ActorGateway jm = cluster.getLeaderGateway(timeout);

    Future<Object> wait = jm.ask(new WaitForAllVerticesToBeRunningOrFinished(job.getJobID()), timeout);
    Await.ready(wait, timeout);

    Future<Object> jobRemoval = jm.ask(new NotifyWhenJobRemoved(job.getJobID()), timeout);

    LOG.info("Make JM(0) again the leader. This should first revoke the leadership.");

    // make JM(0) again the leader --> this implies first a leadership revocation
    cluster.grantLeadership(0, newLeaderSessionID);

    Await.ready(jobRemoval, timeout);

    LOG.info("Job removed.");

    // the TMs should not be able to reconnect yet, because they still use the old leader
    // session ID
    try {
        cluster.waitForTaskManagersToBeRegistered(shortTimeout);
        fail("TaskManager should not be able to register at JobManager.");
    } catch (TimeoutException e) {
        // expected exception since the TMs still have the old leader session ID
    }

    LOG.info("Notify TMs about the new (old) leader.");

    // notify the TMs about the new (old) leader
    cluster.notifyRetrievalListeners(0, newLeaderSessionID);
    cluster.waitForTaskManagersToBeRegistered(timeout);

    ActorGateway leaderGateway = cluster.getLeaderGateway(timeout);

    // resubmit the now non-blocking job; it should complete successfully
    Tasks.BlockingOnceReceiver$.MODULE$.blocking_$eq(false);
    cluster.submitJobAndWait(job, false, timeout, new TestingLeaderRetrievalService(leaderGateway.path(), leaderGateway.leaderSessionID()));
}
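All of the examples on this page use the same ask/Await pattern around WaitForAllVerticesToBeRunningOrFinished. Below is a minimal sketch of that pattern pulled out into a helper; the helper name is illustrative and not part of the Flink tests, and it assumes the usual testing imports (org.apache.flink.runtime.instance.ActorGateway, org.apache.flink.api.common.JobID, scala.concurrent.Await, scala.concurrent.Future, scala.concurrent.duration.FiniteDuration).

// Illustrative helper, not part of the tests above.
private static void waitUntilAllVerticesAreRunningOrFinished(
        ActorGateway jobManager, JobID jobId, FiniteDuration timeout) throws Exception {
    // ask the testing JobManager to reply once every vertex of the job is RUNNING or FINISHED
    Future<Object> allRunning = jobManager.ask(
            new TestingJobManagerMessages.WaitForAllVerticesToBeRunningOrFinished(jobId), timeout);
    // block the test thread until that acknowledgement arrives (or the timeout expires)
    Await.ready(allRunning, timeout);
}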
use of org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.WaitForAllVerticesToBeRunningOrFinished in project flink by apache.
the class LeaderChangeStateCleanupTest method testStateCleanupAfterNewLeaderElectionAndListenerNotification.
/**
 * Tests that a job is properly canceled in the case of a leader change. In such an event all
 * TaskManagers have to disconnect from the previous leader and connect to the newly elected
 * leader.
 */
@Test
public void testStateCleanupAfterNewLeaderElectionAndListenerNotification() throws Exception {
    UUID leaderSessionID1 = UUID.randomUUID();
    UUID leaderSessionID2 = UUID.randomUUID();

    // first make JM(0) the leader
    cluster.grantLeadership(0, leaderSessionID1);
    // notify all listeners
    cluster.notifyRetrievalListeners(0, leaderSessionID1);
    cluster.waitForTaskManagersToBeRegistered(timeout);

    // submit a blocking job so that it is not finished when we cancel it
    cluster.submitJobDetached(job);

    ActorGateway jm = cluster.getLeaderGateway(timeout);

    Future<Object> wait = jm.ask(new WaitForAllVerticesToBeRunningOrFinished(job.getJobID()), timeout);
    Await.ready(wait, timeout);

    Future<Object> jobRemoval = jm.ask(new NotifyWhenJobRemoved(job.getJobID()), timeout);

    // make JM(1) the new leader
    cluster.grantLeadership(1, leaderSessionID2);
    // notify all listeners about the event
    cluster.notifyRetrievalListeners(1, leaderSessionID2);

    Await.ready(jobRemoval, timeout);

    cluster.waitForTaskManagersToBeRegistered(timeout);

    ActorGateway jm2 = cluster.getLeaderGateway(timeout);

    Future<Object> futureNumberSlots = jm2.ask(JobManagerMessages.getRequestTotalNumberOfSlots(), timeout);

    // check that all TMs have registered at the new leader
    int numberSlots = (Integer) Await.result(futureNumberSlots, timeout);
    assertEquals(parallelism, numberSlots);

    // resubmit the now non-blocking job; it should complete successfully
    Tasks.BlockingOnceReceiver$.MODULE$.blocking_$eq(false);
    cluster.submitJobAndWait(job, false, timeout, new TestingLeaderRetrievalService(jm2.path(), jm2.leaderSessionID()));
}
use of org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.WaitForAllVerticesToBeRunningOrFinished in project flink by apache.
the class ZooKeeperLeaderElectionITCase method testJobExecutionOnClusterWithLeaderReelection.
/**
 * Tests that a job can be executed after a new leader has been elected. For every leader except
 * the last one, the job is blocking, and the JobManager is terminated while executing the
 * blocking job. Once only one JobManager is left, it is checked that a non-blocking job can be
 * successfully executed.
 */
@Test
public void testJobExecutionOnClusterWithLeaderReelection() throws Exception {
    int numJMs = 10;
    int numTMs = 2;
    int numSlotsPerTM = 3;
    int parallelism = numTMs * numSlotsPerTM;

    File rootFolder = tempFolder.getRoot();

    Configuration configuration = ZooKeeperTestUtils.createZooKeeperHAConfig(zkServer.getConnectString(), rootFolder.getPath());

    configuration.setInteger(ConfigConstants.LOCAL_NUMBER_JOB_MANAGER, numJMs);
    configuration.setInteger(ConfigConstants.LOCAL_NUMBER_TASK_MANAGER, numTMs);
    configuration.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, numSlotsPerTM);

    // we "effectively" disable the automatic RecoverAllJobs message and send it manually to make
    // sure that all TMs have registered with the JM prior to issuing the RecoverAllJobs message
    configuration.setString(ConfigConstants.AKKA_ASK_TIMEOUT, AkkaUtils.INF_TIMEOUT().toString());

    Tasks.BlockingOnceReceiver$.MODULE$.blocking_$eq(true);

    JobVertex sender = new JobVertex("sender");
    JobVertex receiver = new JobVertex("receiver");

    sender.setInvokableClass(Tasks.Sender.class);
    receiver.setInvokableClass(Tasks.BlockingOnceReceiver.class);

    sender.setParallelism(parallelism);
    receiver.setParallelism(parallelism);

    receiver.connectNewDataSetAsInput(sender, DistributionPattern.POINTWISE, ResultPartitionType.PIPELINED);

    SlotSharingGroup slotSharingGroup = new SlotSharingGroup();
    sender.setSlotSharingGroup(slotSharingGroup);
    receiver.setSlotSharingGroup(slotSharingGroup);

    final JobGraph graph = new JobGraph("Blocking test job", sender, receiver);

    final TestingCluster cluster = new TestingCluster(configuration);

    ActorSystem clientActorSystem = null;
    Thread thread = null;
    JobSubmitterRunnable jobSubmission = null;

    try {
        cluster.start();

        clientActorSystem = cluster.startJobClientActorSystem(graph.getJobID());

        final ActorSystem clientAS = clientActorSystem;

        jobSubmission = new JobSubmitterRunnable(clientAS, cluster, graph);

        thread = new Thread(jobSubmission);
        thread.start();

        Deadline deadline = timeout.$times(3).fromNow();

        // kill all JobManagers except for the last one
        for (int i = 0; i < numJMs; i++) {
            ActorGateway jm = cluster.getLeaderGateway(deadline.timeLeft());

            cluster.waitForTaskManagersToBeRegisteredAtJobManager(jm.actor());

            // recover all jobs manually (the automatic RecoverAllJobs message is effectively disabled above)
            log.info("Sent recover all jobs manually to job manager {}.", jm.path());
            jm.tell(JobManagerMessages.getRecoverAllJobs());

            if (i < numJMs - 1) {
                Future<Object> future = jm.ask(new WaitForAllVerticesToBeRunningOrFinished(graph.getJobID()), deadline.timeLeft());
                Await.ready(future, deadline.timeLeft());

                cluster.clearLeader();

                if (i == numJMs - 2) {
                    Tasks.BlockingOnceReceiver$.MODULE$.blocking_$eq(false);
                }

                log.info("Kill job manager {}.", jm.path());

                jm.tell(TestingJobManagerMessages.getDisablePostStop());
                jm.tell(Kill.getInstance());
            }
        }

        log.info("Waiting for submitter thread to terminate.");

        thread.join(deadline.timeLeft().toMillis());

        log.info("Submitter thread has terminated.");

        if (thread.isAlive()) {
            fail("The job submission thread did not stop (meaning it did not succeed in " + "executing the test job).");
        }

        Await.result(jobSubmission.resultPromise.future(), deadline.timeLeft());
    } finally {
        if (clientActorSystem != null) {
            cluster.shutdownJobClientActorSystem(clientActorSystem);
        }

        if (thread != null && thread.isAlive()) {
            jobSubmission.finished = true;
        }

        cluster.stop();
    }
}
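All of these tests toggle the blocking behavior of the test receiver through Tasks.BlockingOnceReceiver$.MODULE$.blocking_$eq(...). That is simply how a mutable var on a Scala companion object surfaces in Java. A short sketch of the idiom follows; the field name blocking comes from the calls above, while the described compilation scheme is the standard Scala-to-Java mapping rather than anything specific to Flink.

// Scala's companion `object BlockingOnceReceiver { var blocking = ... }` (the class itself is the
// invokable passed to receiver.setInvokableClass above) compiles to a singleton class
// BlockingOnceReceiver$ with a static MODULE$ instance; the var becomes a blocking() getter
// and a blocking_$eq(...) setter on that instance.
Tasks.BlockingOnceReceiver$.MODULE$.blocking_$eq(true);   // first invocation of each receiver blocks
boolean stillBlocking = Tasks.BlockingOnceReceiver$.MODULE$.blocking();
Tasks.BlockingOnceReceiver$.MODULE$.blocking_$eq(false);  // later runs of the same job can finish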