Search in sources :

Example 1 with NotifyWhenJobRemoved

use of org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.NotifyWhenJobRemoved in project flink by apache.

the class LeaderChangeStateCleanupTest method testStateCleanupAfterNewLeaderElection.

/**
 * Verifies that a running job is removed after a leader change of which only the JMs are
 * informed, while the TMs still consider the old leader to hold leadership.
 */
@Test
public void testStateCleanupAfterNewLeaderElection() throws Exception {
    final UUID initialSessionId = UUID.randomUUID();
    final UUID nextSessionId = UUID.randomUUID();

    // make JM(0) the leader and tell the retrieval listeners about it
    cluster.grantLeadership(0, initialSessionId);
    cluster.notifyRetrievalListeners(0, initialSessionId);
    cluster.waitForTaskManagersToBeRegistered(timeout);

    // a blocking job is submitted so that there is something to clean up
    cluster.submitJobDetached(job);

    final ActorGateway leader = cluster.getLeaderGateway(timeout);

    // block until every vertex of the job is running or finished
    Await.ready(
        leader.ask(new WaitForAllVerticesToBeRunningOrFinished(job.getJobID()), timeout),
        timeout);

    // register for the removal notification before triggering the leader change
    final Future<Object> removalFuture =
        leader.ask(new NotifyWhenJobRemoved(job.getJobID()), timeout);

    // leadership goes to JM(1); the retrieval listeners are deliberately not notified
    cluster.grantLeadership(1, nextSessionId);

    // the job is expected to be removed nonetheless
    Await.ready(removalFuture, timeout);
}
Also used : WaitForAllVerticesToBeRunningOrFinished(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.WaitForAllVerticesToBeRunningOrFinished) NotifyWhenJobRemoved(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.NotifyWhenJobRemoved) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) UUID(java.util.UUID) Test(org.junit.Test)

Example 2 with NotifyWhenJobRemoved

use of org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.NotifyWhenJobRemoved in project flink by apache.

the class LeaderChangeStateCleanupTest method testStateCleanupAfterListenerNotification.

/**
 * Verifies that a running job is removed after a leader change of which only the TMs
 * (the retrieval listeners) are informed. The notification alone is expected to be
 * sufficient for the currently running job to be canceled.
 */
@Test
public void testStateCleanupAfterListenerNotification() throws Exception {
    final UUID initialSessionId = UUID.randomUUID();
    final UUID nextSessionId = UUID.randomUUID();

    // make JM(0) the leader and tell the retrieval listeners about it
    cluster.grantLeadership(0, initialSessionId);
    cluster.notifyRetrievalListeners(0, initialSessionId);
    cluster.waitForTaskManagersToBeRegistered(timeout);

    // the submitted job is a blocking one
    cluster.submitJobDetached(job);

    final ActorGateway leader = cluster.getLeaderGateway(timeout);

    // block until every vertex of the job is running or finished
    Await.ready(
        leader.ask(new WaitForAllVerticesToBeRunningOrFinished(job.getJobID()), timeout),
        timeout);

    // register for the removal notification before triggering the leader change
    final Future<Object> removalFuture =
        leader.ask(new NotifyWhenJobRemoved(job.getJobID()), timeout);

    // only the listeners (TMs) learn about the leader change; no leadership is granted
    cluster.notifyRetrievalListeners(1, nextSessionId);

    Await.ready(removalFuture, timeout);
}
Also used : WaitForAllVerticesToBeRunningOrFinished(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.WaitForAllVerticesToBeRunningOrFinished) NotifyWhenJobRemoved(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.NotifyWhenJobRemoved) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) UUID(java.util.UUID) Test(org.junit.Test)

Example 3 with NotifyWhenJobRemoved

use of org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.NotifyWhenJobRemoved in project flink by apache.

the class LeaderChangeStateCleanupTest method testReelectionOfSameJobManager.

/**
 * Verifies that a JobManager can be elected leader again right after having been the leader.
 * Although the leader stays the same JM, the running jobs are expected to be canceled and the
 * TMs are expected to disconnect from the leader and reconnect to it afterwards.
 */
@Test
public void testReelectionOfSameJobManager() throws Exception {
    final UUID initialSessionId = UUID.randomUUID();
    final UUID reelectionSessionId = UUID.randomUUID();

    // make JM(0) the leader and tell the retrieval listeners about it
    cluster.grantLeadership(0, initialSessionId);
    cluster.notifyRetrievalListeners(0, initialSessionId);
    cluster.waitForTaskManagersToBeRegistered(timeout);

    // the submitted job is a blocking one
    cluster.submitJobDetached(job);

    final ActorGateway leader = cluster.getLeaderGateway(timeout);

    // block until every vertex of the job is running or finished
    Await.ready(
        leader.ask(new WaitForAllVerticesToBeRunningOrFinished(job.getJobID()), timeout),
        timeout);

    // register for the removal notification before triggering the re-election
    final Future<Object> removalFuture =
        leader.ask(new NotifyWhenJobRemoved(job.getJobID()), timeout);

    LOG.info("Make JM(0) again the leader. This should first revoke the leadership.");

    // granting leadership to JM(0) a second time implies a revocation first
    cluster.grantLeadership(0, reelectionSessionId);

    Await.ready(removalFuture, timeout);

    LOG.info("Job removed.");

    // the TMs have not been told about the new leader session ID yet, so waiting for
    // their registration is expected to run into the (short) timeout
    final FiniteDuration registrationTimeout = new FiniteDuration(10, TimeUnit.SECONDS);
    try {
        cluster.waitForTaskManagersToBeRegistered(registrationTimeout);
        fail("TaskManager should not be able to register at JobManager.");
    } catch (TimeoutException ignored) {
        // expected: the TMs still hold the old leader session ID
    }

    LOG.info("Notify TMs about the new (old) leader.");

    // now let the TMs know about the new (old) leader
    cluster.notifyRetrievalListeners(0, reelectionSessionId);
    cluster.waitForTaskManagersToBeRegistered(timeout);

    final ActorGateway reelectedLeader = cluster.getLeaderGateway(timeout);

    // switch the job to non-blocking and resubmit it; it should complete successfully
    Tasks.BlockingOnceReceiver$.MODULE$.blocking_$eq(false);
    final TestingLeaderRetrievalService retrievalService =
        new TestingLeaderRetrievalService(reelectedLeader.path(), reelectedLeader.leaderSessionID());
    cluster.submitJobAndWait(job, false, timeout, retrievalService);
}
Also used : WaitForAllVerticesToBeRunningOrFinished(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.WaitForAllVerticesToBeRunningOrFinished) NotifyWhenJobRemoved(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.NotifyWhenJobRemoved) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) FiniteDuration(scala.concurrent.duration.FiniteDuration) UUID(java.util.UUID) TimeoutException(java.util.concurrent.TimeoutException) Test(org.junit.Test)

Example 4 with NotifyWhenJobRemoved

use of org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.NotifyWhenJobRemoved in project flink by apache.

the class LeaderChangeStateCleanupTest method testStateCleanupAfterNewLeaderElectionAndListenerNotification.

/**
 * Verifies that a running job is removed after a leader change of which both the JMs and the
 * listeners are informed. In that situation every TaskManager has to disconnect from the
 * previous leader and connect to the newly elected one.
 */
@Test
public void testStateCleanupAfterNewLeaderElectionAndListenerNotification() throws Exception {
    final UUID firstSessionId = UUID.randomUUID();
    final UUID secondSessionId = UUID.randomUUID();

    // JM(0) becomes the first leader and all listeners are told about it
    cluster.grantLeadership(0, firstSessionId);
    cluster.notifyRetrievalListeners(0, firstSessionId);
    cluster.waitForTaskManagersToBeRegistered(timeout);

    // a blocking job is submitted so that it has not finished when it gets canceled
    cluster.submitJobDetached(job);

    final ActorGateway firstLeader = cluster.getLeaderGateway(timeout);

    // block until every vertex of the job is running or finished
    Await.ready(
        firstLeader.ask(new WaitForAllVerticesToBeRunningOrFinished(job.getJobID()), timeout),
        timeout);

    // register for the removal notification before triggering the leader change
    final Future<Object> removalFuture =
        firstLeader.ask(new NotifyWhenJobRemoved(job.getJobID()), timeout);

    // JM(1) becomes the new leader and all listeners are told about it
    cluster.grantLeadership(1, secondSessionId);
    cluster.notifyRetrievalListeners(1, secondSessionId);

    Await.ready(removalFuture, timeout);

    cluster.waitForTaskManagersToBeRegistered(timeout);

    final ActorGateway secondLeader = cluster.getLeaderGateway(timeout);

    // the total slot count reported by the new leader must equal the parallelism,
    // i.e. all TMs have registered at the new leader
    final Future<Object> slotsFuture =
        secondLeader.ask(JobManagerMessages.getRequestTotalNumberOfSlots(), timeout);
    int registeredSlots = (Integer) Await.result(slotsFuture, timeout);
    assertEquals(parallelism, registeredSlots);

    // switch the job to non-blocking and resubmit it; it should complete successfully
    Tasks.BlockingOnceReceiver$.MODULE$.blocking_$eq(false);
    final TestingLeaderRetrievalService retrievalService =
        new TestingLeaderRetrievalService(secondLeader.path(), secondLeader.leaderSessionID());
    cluster.submitJobAndWait(job, false, timeout, retrievalService);
}
Also used : WaitForAllVerticesToBeRunningOrFinished(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.WaitForAllVerticesToBeRunningOrFinished) NotifyWhenJobRemoved(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.NotifyWhenJobRemoved) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) UUID(java.util.UUID) Test(org.junit.Test)

Aggregations

UUID (java.util.UUID)4 ActorGateway (org.apache.flink.runtime.instance.ActorGateway)4 NotifyWhenJobRemoved (org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.NotifyWhenJobRemoved)4 WaitForAllVerticesToBeRunningOrFinished (org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.WaitForAllVerticesToBeRunningOrFinished)4 Test (org.junit.Test)4 TimeoutException (java.util.concurrent.TimeoutException)1 FiniteDuration (scala.concurrent.duration.FiniteDuration)1