the class JobClient method awaitJobResult.
* Given a JobListeningContext, awaits the result of the job execution that this context is bound to
* @param listeningContext The listening context of the job execution
* @return The result of the execution
* @throws JobExecutionException if anything goes wrong while monitoring the job
public static JobExecutionResult awaitJobResult(JobListeningContext listeningContext) throws JobExecutionException {
final JobID jobID = listeningContext.getJobID();
final ActorRef jobClientActor = listeningContext.getJobClientActor();
final Future<Object> jobSubmissionFuture = listeningContext.getJobResultFuture();
final FiniteDuration askTimeout = listeningContext.getTimeout();
// retrieves class loader if necessary
final ClassLoader classLoader = listeningContext.getClassLoader();
// ping the JobClientActor from time to time to check if it is still running
while (!jobSubmissionFuture.isCompleted()) {
try {
Await.ready(jobSubmissionFuture, askTimeout);
} catch (InterruptedException e) {
throw new JobExecutionException(jobID, "Interrupted while waiting for job completion.");
} catch (TimeoutException e) {
try {
Await.result(Patterns.ask(jobClientActor, // Ping the Actor to see if it is alive
new Identify(true), Timeout.durationToTimeout(askTimeout)), askTimeout);
// we got a reply, continue waiting for the job result
} catch (Exception eInner) {
// thus the health check failed
if (!jobSubmissionFuture.isCompleted()) {
throw new JobExecutionException(jobID, "JobClientActor seems to have died before the JobExecutionResult could be retrieved.", eInner);
final Object answer;
try {
// we have already awaited the result, zero time to wait here
answer = Await.result(jobSubmissionFuture, Duration.Zero());
} catch (Throwable throwable) {
throw new JobExecutionException(jobID, "Couldn't retrieve the JobExecutionResult from the JobManager.", throwable);
} finally {
// failsafe shutdown of the client actor
jobClientActor.tell(PoisonPill.getInstance(), ActorRef.noSender());
// second block handles the actual response
if (answer instanceof JobManagerMessages.JobResultSuccess) {"Job execution complete");
SerializedJobExecutionResult result = ((JobManagerMessages.JobResultSuccess) answer).result();
if (result != null) {
try {
return result.toJobExecutionResult(classLoader);
} catch (Throwable t) {
throw new JobExecutionException(jobID, "Job was successfully executed but JobExecutionResult could not be deserialized.");
} else {
throw new JobExecutionException(jobID, "Job was successfully executed but result contained a null JobExecutionResult.");
} else if (answer instanceof JobManagerMessages.JobResultFailure) {"Job execution failed");
SerializedThrowable serThrowable = ((JobManagerMessages.JobResultFailure) answer).cause();
if (serThrowable != null) {
Throwable cause = serThrowable.deserializeError(classLoader);
if (cause instanceof JobExecutionException) {
throw (JobExecutionException) cause;
} else {
throw new JobExecutionException(jobID, "Job execution failed", cause);
} else {
throw new JobExecutionException(jobID, "Job execution failed with null as failure cause.");
} else if (answer instanceof JobManagerMessages.JobNotFound) {
throw new JobRetrievalException(((JobManagerMessages.JobNotFound) answer).jobID(), "Couldn't retrieve Job " + jobID + " because it was not running.");
} else {
throw new JobExecutionException(jobID, "Unknown answer from JobManager after submitting the job: " + answer);
the class AkkaRpcService method connect.
// this method does not mutate state and is thus thread-safe
public <C extends RpcGateway> Future<C> connect(final String address, final Class<C> clazz) {
checkState(!stopped, "RpcService is stopped");
LOG.debug("Try to connect to remote RPC endpoint with address {}. Returning a {} gateway.", address, clazz.getName());
final ActorSelection actorSel = actorSystem.actorSelection(address);
final scala.concurrent.Future<Object> identify = Patterns.ask(actorSel, new Identify(42), timeout.toMilliseconds());
final scala.concurrent.Future<C> resultFuture = Mapper<Object, C>() {
public C checkedApply(Object obj) throws Exception {
ActorIdentity actorIdentity = (ActorIdentity) obj;
if (actorIdentity.getRef() == null) {
throw new RpcConnectionException("Could not connect to rpc endpoint under address " + address + '.');
} else {
ActorRef actorRef = actorIdentity.getRef();
final String address = AkkaUtils.getAkkaURL(actorSystem, actorRef);
final String hostname;
Option<String> host = actorRef.path().address().host();
if (host.isEmpty()) {
hostname = "localhost";
} else {
hostname = host.get();
InvocationHandler akkaInvocationHandler = new AkkaInvocationHandler(address, hostname, actorRef, timeout, maximumFramesize, null);
// Rather than using the System ClassLoader directly, we derive the ClassLoader
// from this class . That works better in cases where Flink runs embedded and all Flink
// code is loaded dynamically (for example from an OSGI bundle) through a custom ClassLoader
ClassLoader classLoader = AkkaRpcService.this.getClass().getClassLoader();
@SuppressWarnings("unchecked") C proxy = (C) Proxy.newProxyInstance(classLoader, new Class<?>[] { clazz }, akkaInvocationHandler);
return proxy;
}, actorSystem.dispatcher());
return new FlinkFuture<>(resultFuture);
the class JobManagerHARecoveryTest method testFailingJobRecovery.
/** Tests that a failing job recovery won't cause other job recoveries to fail. */
// Sets up a mocked SubmittedJobGraphStore that reports two job ids, makes recovery of the first
// one throw, starts a TestingFailingHAJobManager actor and asserts on the collection of recovered jobs.
// NOTE(review): this snippet looks truncated - several statements announced by the comments
// below are not present, and the finally block has no visible body. Confirm against the full file.
public void testFailingJobRecovery() throws Exception {
// timeout handed to the job manager Props below
final FiniteDuration timeout = new FiniteDuration(10, TimeUnit.SECONDS);
// zero-length recovery timeout handed to the job manager Props below
final FiniteDuration jobRecoveryTimeout = new FiniteDuration(0, TimeUnit.SECONDS);
// overall one-minute budget; its remaining time bounds the Identify ask and the await below
Deadline deadline = new FiniteDuration(1, TimeUnit.MINUTES).fromNow();
final Configuration flinkConfiguration = new Configuration();
// NOTE(review): not used in the visible code - presumably passed when granting leadership
UUID leaderSessionID = UUID.randomUUID();
ActorRef jobManager = null;
JobID jobId1 = new JobID();
JobID jobId2 = new JobID();
// set HA mode to zookeeper so that we try to recover jobs
flinkConfiguration.setString(HighAvailabilityOptions.HA_MODE, "zookeeper");
try {
final SubmittedJobGraphStore submittedJobGraphStore = mock(SubmittedJobGraphStore.class);
// NOTE(review): this mock is not referenced again in the visible code
SubmittedJobGraph submittedJobGraph = mock(SubmittedJobGraph.class);
// the store reports both jobs as recoverable
when(submittedJobGraphStore.getJobIds()).thenReturn(Arrays.asList(jobId1, jobId2));
// fail the first job recovery
when(submittedJobGraphStore.recoverJobGraph(eq(jobId1))).thenThrow(new Exception("Test exception"));
// succeed the second job recovery
// NOTE(review): no stubbing for jobId2 follows this comment in the visible code - verify
final TestingLeaderElectionService myLeaderElectionService = new TestingLeaderElectionService();
// handed to the job manager Props and checked by the assertion at the end
final Collection<JobID> recoveredJobs = new ArrayList<>(2);
// the actor is created with the CallingThreadDispatcher id as its dispatcher
Props jobManagerProps = Props.create(TestingFailingHAJobManager.class, flinkConfiguration, TestingUtils.defaultExecutor(), TestingUtils.defaultExecutor(), mock(InstanceManager.class), mock(Scheduler.class), new BlobLibraryCacheManager(mock(BlobService.class), 1 << 20), ActorRef.noSender(), new FixedDelayRestartStrategy.FixedDelayRestartStrategyFactory(Int.MaxValue(), 100), timeout, myLeaderElectionService, submittedJobGraphStore, mock(CheckpointRecoveryFactory.class), jobRecoveryTimeout, Option.<MetricRegistry>apply(null), recoveredJobs).withDispatcher(CallingThreadDispatcher.Id());
jobManager = system.actorOf(jobManagerProps);
// wait until the Identify request to the new actor has completed, within the remaining deadline
Future<Object> started = Patterns.ask(jobManager, new Identify(42), deadline.timeLeft().toMillis());
Await.ready(started, deadline.timeLeft());
// make the job manager the leader --> this triggers the recovery of all jobs
// NOTE(review): no leadership-granting statement is visible after this comment - verify
// check that we have successfully recovered the second job
assertThat(recoveredJobs, containsInAnyOrder(jobId2));
} finally {
// NOTE(review): the body of this finally block is not visible in this snippet
the class TestActorFactory method verifyActorReady.
private void verifyActorReady(ActorRef actorRef) {
// Sometimes we see messages go to dead letters soon after creation - it seems the actor isn't quite
// in a state yet to receive messages or isn't actually created yet. This seems to happen with
// actorSelection so, to alleviate it, we use an actorSelection and send an Identify message with
// retries to ensure it's ready.
Timeout timeout = new Timeout(100, TimeUnit.MILLISECONDS);
Throwable lastError = null;
Stopwatch sw = Stopwatch.createStarted();
while (sw.elapsed(TimeUnit.SECONDS) <= 10) {
try {
ActorSelection actorSelection = system.actorSelection(actorRef.path().toString());
Future<Object> future = Patterns.ask(actorSelection, new Identify(""), timeout);
ActorIdentity reply = (ActorIdentity) Await.result(future, timeout.duration());
Assert.assertNotNull("Identify returned null", reply.getRef());
} catch (Exception | AssertionError e) {
Uninterruptibles.sleepUninterruptibly(100, TimeUnit.MILLISECONDS);
lastError = e;
throw new RuntimeException(lastError);