Use of org.apache.flink.runtime.instance.ActorGateway in project flink by apache.
The class JobManagerHAJobGraphRecoveryITCase, method testJobPersistencyWhenJobManagerShutdown.
// ---------------------------------------------------------------------------------------------
/**
 * Tests that the HA job is not cleaned up when the JobManager is stopped.
 */
@Test
public void testJobPersistencyWhenJobManagerShutdown() throws Exception {
    Configuration config = ZooKeeperTestUtils.createZooKeeperHAConfig(ZooKeeper.getConnectString(), FileStateBackendBasePath.getPath());
    // Configure the cluster
    config.setInteger(ConfigConstants.LOCAL_NUMBER_JOB_MANAGER, 1);
    config.setInteger(ConfigConstants.LOCAL_NUMBER_TASK_MANAGER, 1);
    TestingCluster flink = new TestingCluster(config, false, false);
    try {
        final Deadline deadline = TestTimeOut.fromNow();
        // Start the JobManager and TaskManager
        flink.start(true);
        JobGraph jobGraph = createBlockingJobGraph();
        // Set a restart strategy to guard against shutdown races:
        // if the TM fails before the JM, the job might be marked as
        // failed, leading to state removal.
        ExecutionConfig ec = new ExecutionConfig();
        ec.setRestartStrategy(RestartStrategies.fixedDelayRestart(Integer.MAX_VALUE, 100));
        jobGraph.setExecutionConfig(ec);
        ActorGateway jobManager = flink.getLeaderGateway(deadline.timeLeft());
        // Submit the job
        jobManager.tell(new SubmitJob(jobGraph, ListeningBehaviour.DETACHED));
        // Wait for the job to start
        JobManagerActorTestUtils.waitForJobStatus(jobGraph.getJobID(), JobStatus.RUNNING, jobManager, deadline.timeLeft());
    } finally {
        flink.shutdown();
    }
    // Verify that the persisted job data has not been removed from ZooKeeper
    // now that the JM has been shut down.
    verifyRecoveryState(config);
}
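The persistence assertion at the end is delegated to the test's verifyRecoveryState(config) helper. As a rough illustration of the kind of check such a helper makes, the following hypothetical class uses Apache Curator to assert that the job graph path in ZooKeeper is still populated. The class name, the hard-coded "/flink/jobgraphs" path, and the retry settings are assumptions for illustration; the real helper in JobManagerHAJobGraphRecoveryITCase reads its paths from the configuration.

import org.apache.curator.framework.CuratorFramework;
import org.apache.curator.framework.CuratorFrameworkFactory;
import org.apache.curator.retry.ExponentialBackoffRetry;

import java.util.List;

public class RecoveryStateCheck {
    public static void assertJobGraphsPersisted(String connectString) throws Exception {
        CuratorFramework client = CuratorFrameworkFactory.newClient(
                connectString, new ExponentialBackoffRetry(1000, 3));
        client.start();
        try {
            // getChildren() throws if the path does not exist, which also fails the check
            List<String> jobGraphs = client.getChildren().forPath("/flink/jobgraphs");
            if (jobGraphs.isEmpty()) {
                throw new AssertionError("Expected persisted job graphs to survive JM shutdown");
            }
        } finally {
            client.close();
        }
    }
}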
Use of org.apache.flink.runtime.instance.ActorGateway in project flink by apache.
The class JobManagerHAJobGraphRecoveryITCase, method testSubmitJobToNonLeader.
/**
 * Tests that submissions to non-leaders are handled.
 */
@Test
public void testSubmitJobToNonLeader() throws Exception {
    Configuration config = ZooKeeperTestUtils.createZooKeeperHAConfig(ZooKeeper.getConnectString(), FileStateBackendBasePath.getPath());
    // Configure the cluster
    config.setInteger(ConfigConstants.LOCAL_NUMBER_JOB_MANAGER, 2);
    config.setInteger(ConfigConstants.LOCAL_NUMBER_TASK_MANAGER, 1);
    TestingCluster flink = new TestingCluster(config, false, false);
    try {
        final Deadline deadline = TestTimeOut.fromNow();
        // Start the JobManagers and TaskManager
        flink.start(true);
        JobGraph jobGraph = createBlockingJobGraph();
        List<ActorRef> bothJobManagers = flink.getJobManagersAsJava();
        ActorGateway leadingJobManager = flink.getLeaderGateway(deadline.timeLeft());
        ActorGateway nonLeadingJobManager;
        if (bothJobManagers.get(0).equals(leadingJobManager.actor())) {
            nonLeadingJobManager = new AkkaActorGateway(bothJobManagers.get(1), null);
        } else {
            nonLeadingJobManager = new AkkaActorGateway(bothJobManagers.get(0), null);
        }
        log.info("Leading job manager: " + leadingJobManager);
        log.info("Non-leading job manager: " + nonLeadingJobManager);
        // Submit the job
        nonLeadingJobManager.tell(new SubmitJob(jobGraph, ListeningBehaviour.DETACHED));
        log.info("Submitted job graph to " + nonLeadingJobManager);
        // Wait for the job to start. We ask the *leading* JM here although we've
        // submitted the job to the non-leading JM. This is the behaviour under test.
        JobManagerActorTestUtils.waitForJobStatus(jobGraph.getJobID(), JobStatus.RUNNING, leadingJobManager, deadline.timeLeft());
        log.info("Waiting for the non-leader to remove the submitted job.");
        // Make sure that the *non-leading* JM has actually removed the job graph
        // from its local state.
        boolean success = false;
        while (!success && deadline.hasTimeLeft()) {
            JobStatusResponse jobStatusResponse = JobManagerActorTestUtils.requestJobStatus(jobGraph.getJobID(), nonLeadingJobManager, deadline.timeLeft());
            if (jobStatusResponse instanceof JobManagerMessages.JobNotFound) {
                success = true;
            } else {
                log.info(((JobManagerMessages.CurrentJobStatus) jobStatusResponse).status().toString());
                Thread.sleep(100);
            }
        }
        if (!success) {
            fail("Non-leading JM was still holding a reference to the job graph.");
        }
        Future<Object> jobRemoved = leadingJobManager.ask(new TestingJobManagerMessages.NotifyWhenJobRemoved(jobGraph.getJobID()), deadline.timeLeft());
        leadingJobManager.tell(new JobManagerMessages.CancelJob(jobGraph.getJobID()));
        Await.ready(jobRemoved, deadline.timeLeft());
    } finally {
        flink.shutdown();
    }
    // Verify that everything is clean
    verifyCleanRecoveryState(config);
}
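Both tests above drive the JobManager purely through the ActorGateway interface: tell for fire-and-forget messages and ask for request/response. A minimal sketch of the ask pattern, assuming an ActorGateway jobManagerGateway and a JobID jobId are in scope (RequestJobStatus, CurrentJobStatus, and JobNotFound are real JobManagerMessages; the surrounding scaffolding and the 10-second timeout are illustrative):

// assumed imports: scala.concurrent.Await, scala.concurrent.Future,
// scala.concurrent.duration.FiniteDuration, java.util.concurrent.TimeUnit,
// org.apache.flink.runtime.messages.JobManagerMessages
FiniteDuration askTimeout = new FiniteDuration(10, TimeUnit.SECONDS);
Future<Object> response = jobManagerGateway.ask(new JobManagerMessages.RequestJobStatus(jobId), askTimeout);
Object result = Await.result(response, askTimeout);
if (result instanceof JobManagerMessages.CurrentJobStatus) {
    // the JM knows the job and reports its current status
    JobStatus status = ((JobManagerMessages.CurrentJobStatus) result).status();
} else if (result instanceof JobManagerMessages.JobNotFound) {
    // the JM holds no state for this job id
}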
Use of org.apache.flink.runtime.instance.ActorGateway in project flink by apache.
The class ZooKeeperLeaderElectionITCase, method testTaskManagerRegistrationAtReelectedLeader.
/**
 * Tests that the TaskManagers successfully register at the new leader once the old leader
 * is terminated.
 */
@Test
public void testTaskManagerRegistrationAtReelectedLeader() throws Exception {
    File rootFolder = tempFolder.getRoot();
    Configuration configuration = ZooKeeperTestUtils.createZooKeeperHAConfig(zkServer.getConnectString(), rootFolder.getPath());
    int numJMs = 10;
    int numTMs = 3;
    configuration.setInteger(ConfigConstants.LOCAL_NUMBER_JOB_MANAGER, numJMs);
    configuration.setInteger(ConfigConstants.LOCAL_NUMBER_TASK_MANAGER, numTMs);
    TestingCluster cluster = new TestingCluster(configuration);
    try {
        cluster.start();
        for (int i = 0; i < numJMs; i++) {
            ActorGateway leadingJM = cluster.getLeaderGateway(timeout);
            cluster.waitForTaskManagersToBeRegisteredAtJobManager(leadingJM.actor());
            Future<Object> registeredTMs = leadingJM.ask(JobManagerMessages.getRequestNumberRegisteredTaskManager(), timeout);
            int numRegisteredTMs = (Integer) Await.result(registeredTMs, timeout);
            assertEquals(numTMs, numRegisteredTMs);
            cluster.clearLeader();
            leadingJM.tell(PoisonPill.getInstance());
        }
    } finally {
        cluster.stop();
    }
}
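The loop re-resolves the current leader via cluster.getLeaderGateway(timeout) after each PoisonPill. Under the hood, leadership changes are announced through Flink's LeaderRetrievalListener interface; a minimal sketch of a listener that blocks until a leader address is published might look like this (the class name and the latch-based wait are illustrative assumptions, not taken from the test above):

import org.apache.flink.runtime.leaderretrieval.LeaderRetrievalListener;

import java.util.UUID;
import java.util.concurrent.CountDownLatch;

public class LeaderWatcher implements LeaderRetrievalListener {
    private final CountDownLatch leaderFound = new CountDownLatch(1);
    private volatile String leaderAddress;

    @Override
    public void notifyLeaderAddress(String leaderAddress, UUID leaderSessionID) {
        // a null or empty address signals that leadership is currently unassigned
        if (leaderAddress != null && !leaderAddress.isEmpty()) {
            this.leaderAddress = leaderAddress;
            leaderFound.countDown();
        }
    }

    @Override
    public void handleError(Exception e) {
        // surface retrieval errors to the waiting thread as needed
    }

    public String awaitLeader() throws InterruptedException {
        leaderFound.await();
        return leaderAddress;
    }
}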
Use of org.apache.flink.runtime.instance.ActorGateway in project flink by apache.
The class TaskManagerRegistrationTest, method testShutdownAfterRegistrationDurationExpired.
/**
 * Tests that the TaskManager shuts down when it cannot register at the
 * JobManager within the given maximum duration.
 *
 * Unfortunately, this test does not give good error messages.
 * (I have not figured out how to get any better message out of the
 * Akka TestKit than "ask timeout exception".)
 *
 * Anyway: an "ask timeout exception" here means that the TaskManager
 * did not shut down after its registration timeout expired.
 */
@Test
public void testShutdownAfterRegistrationDurationExpired() {
    new JavaTestKit(actorSystem) {
        {
            ActorGateway taskManager = null;
            try {
                // registration timeout of 500 milliseconds
                Configuration tmConfig = new Configuration();
                tmConfig.setString(ConfigConstants.TASK_MANAGER_MAX_REGISTRATION_DURATION, "500 ms");
                // start the TaskManager actor
                taskManager = createTaskManager(actorSystem, JobManager.getLocalJobManagerAkkaURL(Option.<String>empty()), tmConfig, true, false);
                // make sure it terminates in time, since it cannot register at a JobManager
                watch(taskManager.actor());
                final ActorGateway tm = taskManager;
                new Within(timeout) {
                    @Override
                    protected void run() {
                        expectTerminated(tm.actor());
                    }
                };
            } catch (Throwable e) {
                e.printStackTrace();
                fail(e.getMessage());
            } finally {
                stopActor(taskManager);
            }
        }
    };
}
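The termination check works via Akka deathwatch: the test kit watches the TaskManager actor and then expects a Terminated message within the window. Stripped of the Flink specifics, the pattern looks like the sketch below; SomeActor, its Props, and the 5-second window are hypothetical placeholders.

// assumed imports: akka.actor.ActorRef, akka.actor.PoisonPill, akka.actor.Props,
// akka.testkit.JavaTestKit, scala.concurrent.duration.FiniteDuration,
// java.util.concurrent.TimeUnit
new JavaTestKit(actorSystem) {
    {
        final ActorRef subject = actorSystem.actorOf(Props.create(SomeActor.class));
        // register the test kit for a Terminated message from the subject
        watch(subject);
        subject.tell(PoisonPill.getInstance(), ActorRef.noSender());
        new Within(FiniteDuration.create(5, TimeUnit.SECONDS)) {
            @Override
            protected void run() {
                // fails the test if the actor outlives the window
                expectTerminated(subject);
            }
        };
    }
};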
Use of org.apache.flink.runtime.instance.ActorGateway in project flink by apache.
The class TaskManagerRegistrationTest, method testTaskManagerNoExcessiveRegistrationMessages.
/**
 * Tests that the TaskManager does not send an excessive amount of registration messages to
 * the JobManager if its registration was rejected.
 */
@Test
public void testTaskManagerNoExcessiveRegistrationMessages() throws Exception {
    new JavaTestKit(actorSystem) {
        {
            ActorGateway jm = null;
            ActorGateway taskManager = null;
            try {
                FiniteDuration timeout = new FiniteDuration(5, TimeUnit.SECONDS);
                jm = TestingUtils.createForwardingActor(actorSystem, getTestActor(), Option.<String>empty());
                final ActorGateway jmGateway = jm;
                long refusedRegistrationPause = 500;
                long initialRegistrationPause = 100;
                long maxDelay = 30000;
                Configuration tmConfig = new Configuration(config);
                tmConfig.setString(ConfigConstants.TASK_MANAGER_REFUSED_REGISTRATION_PAUSE, refusedRegistrationPause + " ms");
                tmConfig.setString(ConfigConstants.TASK_MANAGER_INITIAL_REGISTRATION_PAUSE, initialRegistrationPause + " ms");
                // we make the test actor (the test kit) the JobManager to intercept
                // the messages
                taskManager = createTaskManager(actorSystem, jmGateway, tmConfig, true, false);
                final ActorGateway taskManagerGateway = taskManager;
                final Deadline deadline = timeout.fromNow();
                try {
                    while (deadline.hasTimeLeft()) {
                        // the TaskManager should try to register
                        expectMsgClass(deadline.timeLeft(), RegisterTaskManager.class);
                        // we decline the registration
                        taskManagerGateway.tell(new RefuseRegistration(new Exception("test reason")), jmGateway);
                    }
                } catch (AssertionError error) {
                    // ignore, since it simply means that we have used up all our time
                }
                RegisterTaskManager[] registerTaskManagerMessages = new ReceiveWhile<RegisterTaskManager>(RegisterTaskManager.class, timeout) {
                    @Override
                    protected RegisterTaskManager match(Object msg) throws Exception {
                        if (msg instanceof RegisterTaskManager) {
                            return (RegisterTaskManager) msg;
                        } else {
                            throw noMatch();
                        }
                    }
                }.get();
                int maxExponent = (int) Math.floor(Math.log((double) maxDelay / initialRegistrationPause + 1) / Math.log(2));
                int exponent = (int) Math.ceil(Math.log((double) timeout.toMillis() / initialRegistrationPause + 1) / Math.log(2));
                int exp = Math.min(maxExponent, exponent);
                long difference = timeout.toMillis() - (initialRegistrationPause * (1 << exp));
                int numberRegisterTaskManagerMessages = exp;
                if (difference > 0) {
                    numberRegisterTaskManagerMessages += Math.ceil((double) difference / maxDelay);
                }
                int maxExpectedNumberOfRegisterTaskManagerMessages = numberRegisterTaskManagerMessages * 2;
                assertTrue("The number of RegisterTaskManager messages #" + registerTaskManagerMessages.length + " should be less than #" + maxExpectedNumberOfRegisterTaskManagerMessages, registerTaskManagerMessages.length <= maxExpectedNumberOfRegisterTaskManagerMessages);
            } finally {
                stopActor(taskManager);
                stopActor(jm);
            }
        }
    };
}
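The arithmetic at the end of the test bounds how many RegisterTaskManager messages an exponential backoff can produce: the pause doubles from initialRegistrationPause until it is capped at maxDelay, so roughly initialPause * (2^k - 1) milliseconds elapse after k attempts, and any remaining time is spent in maxDelay-sized pauses. The following standalone sketch re-derives that bound with the same constants as the test (initialPause, maxDelay, and timeoutMillis mirror initialRegistrationPause, maxDelay, and timeout); it mirrors the test's arithmetic only and is not Flink code.

public class BackoffBound {
    public static void main(String[] args) {
        long initialPause = 100;   // ms, first registration pause
        long maxDelay = 30000;     // ms, backoff cap
        long timeoutMillis = 5000; // ms, observation window

        // Pauses double each attempt: 100, 200, 400, ... capped at maxDelay.
        // After k attempts roughly initialPause * (2^k - 1) ms have elapsed,
        // so the number of doubling attempts that fit into the window is:
        int exponent = (int) Math.ceil(Math.log((double) timeoutMillis / initialPause + 1) / Math.log(2));
        int maxExponent = (int) Math.floor(Math.log((double) maxDelay / initialPause + 1) / Math.log(2));
        int exp = Math.min(exponent, maxExponent);

        // Any remaining time is spent in fixed maxDelay-sized pauses.
        long remaining = timeoutMillis - initialPause * (1L << exp);
        int messages = exp;
        if (remaining > 0) {
            messages += (int) Math.ceil((double) remaining / maxDelay);
        }
        // The test allows a 2x safety factor for scheduling jitter.
        System.out.println("upper bound on RegisterTaskManager messages: " + messages * 2);
    }
}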