use of akka.actor.ActorRef in project flink by apache.
the class TaskManagerMetricsTest method testMetricRegistryLifeCycle.
/**
 * Checks that the TaskManager's metric registry stays alive across a JobManager re-connect.
 *
 * <p>A JobManager and a TaskManager are started inside one local actor system. After the
 * TaskManager has reported its registration, it is sent the JobManager leader address once
 * more to trigger a re-registration. When the second registration has been confirmed, the
 * registry obtained from the TaskManager services must not be shut down.
 */
@Test
public void testMetricRegistryLifeCycle() throws Exception {
    ActorSystem actorSystem = null;
    try {
        actorSystem = AkkaUtils.createLocalActorSystem(new Configuration());

        // ---------------------------- JobManager ----------------------------
        final ActorRef jobManager = JobManager.startJobManagerActors(
            new Configuration(),
            actorSystem,
            TestingUtils.defaultExecutor(),
            TestingUtils.defaultExecutor(),
            JobManager.class,
            MemoryArchivist.class)._1();
        final String jobManagerAddress = jobManager.path().toString();
        LeaderRetrievalService leaderRetrieval = new StandaloneLeaderRetrievalService(jobManagerAddress);

        // ---------------------------- TaskManager ---------------------------
        final Configuration tmConfig = new Configuration();
        final ResourceID resourceId = ResourceID.generate();
        TaskManagerServicesConfiguration servicesConfig =
            TaskManagerServicesConfiguration.fromConfiguration(tmConfig, InetAddress.getLocalHost(), false);
        TaskManagerConfiguration tmConfiguration = TaskManagerConfiguration.fromConfiguration(tmConfig);
        TaskManagerServices services = TaskManagerServices.fromConfiguration(servicesConfig, resourceId);
        final MetricRegistry metricRegistry = services.getMetricRegistry();

        // assemble the actor properties and spawn the TaskManager actor
        final Props taskManagerProps = TaskManager.getTaskManagerProps(
            TaskManager.class,
            tmConfiguration,
            resourceId,
            services.getTaskManagerLocation(),
            services.getMemoryManager(),
            services.getIOManager(),
            services.getNetworkEnvironment(),
            leaderRetrieval,
            metricRegistry);
        final ActorRef taskManager = actorSystem.actorOf(taskManagerProps);

        new JavaTestKit(actorSystem) {
            {
                new Within(new FiniteDuration(5000, TimeUnit.SECONDS)) {
                    @Override
                    protected void run() {
                        // first registration: ask to be notified and wait for the confirmation
                        taskManager.tell(
                            TaskManagerMessages.getNotifyWhenRegisteredAtJobManagerMessage(),
                            getTestActor());
                        expectMsgEquals(TaskManagerMessages.getRegisteredAtJobManagerMessage());

                        // announce the same leader again; this should make the TM
                        // disconnect from the current JM and register anew
                        taskManager.tell(
                            new TaskManagerMessages.JobManagerLeaderAddress(jobManagerAddress, null),
                            jobManager);

                        // second registration: again wait for the confirmation
                        taskManager.tell(
                            TaskManagerMessages.getNotifyWhenRegisteredAtJobManagerMessage(),
                            getTestActor());
                        expectMsgEquals(TaskManagerMessages.getRegisteredAtJobManagerMessage());
                    }
                };
            }
        };

        // the disconnect must not have closed the registry
        Assert.assertFalse(metricRegistry.isShutdown());

        // regular teardown of the actors and the actor system
        actorSystem.shutdown();
        actorSystem.awaitTermination();
    } finally {
        // safety net in case the test failed before the regular teardown
        if (actorSystem != null) {
            actorSystem.shutdown();
        }
    }
}
use of akka.actor.ActorRef in project flink by apache.
the class JobClient method attachToRunningJob.
/**
 * Attaches to a running Job using the JobID.
 * Reconstructs the user class loader by downloading the jars from the JobManager.
 *
 * <p>This method starts a {@code JobAttachmentClientActor} in the given actor system and asks
 * it to attach to the job. The call does not block: the pending answer is wrapped, together
 * with the actor, in the returned {@link JobListeningContext}.
 *
 * @param jobID ID of the job to attach to; must not be null
 * @param jobManagerGateWay gateway to the JobManager; must not be null (only validated here,
 *        this method does not otherwise use it)
 * @param configuration configuration handed to the returned context; must not be null
 * @param actorSystem actor system in which the client actor is started; must not be null
 * @param leaderRetrievalService service passed to the client actor; must not be null
 * @param timeout timeout passed to the client actor and stored in the returned context;
 *        must not be null
 * @param sysoutLogUpdates flag passed through to the client actor
 * @return context holding the job ID, the future answer and the client actor
 */
public static JobListeningContext attachToRunningJob(JobID jobID, ActorGateway jobManagerGateWay, Configuration configuration, ActorSystem actorSystem, LeaderRetrievalService leaderRetrievalService, FiniteDuration timeout, boolean sysoutLogUpdates) {
    checkNotNull(jobID, "The jobID must not be null.");
    checkNotNull(jobManagerGateWay, "The jobManagerGateWay must not be null.");
    checkNotNull(configuration, "The configuration must not be null.");
    checkNotNull(actorSystem, "The actorSystem must not be null.");
    // the message names the argument that is actually being checked
    checkNotNull(leaderRetrievalService, "The leaderRetrievalService must not be null.");
    checkNotNull(timeout, "The timeout must not be null.");
    // we create a proxy JobClientActor that deals with all communication with
    // the JobManager. It forwards the job attachments, checks the success/failure responses, logs
    // update messages, watches for disconnect between client and JobManager, ...
    Props jobClientActorProps = JobAttachmentClientActor.createActorProps(leaderRetrievalService, timeout, sysoutLogUpdates);
    ActorRef jobClientActor = actorSystem.actorOf(jobClientActorProps);
    // the ask itself uses AkkaUtils.INF_TIMEOUT(); the caller-supplied timeout travels in the context
    Future<Object> attachmentFuture = Patterns.ask(jobClientActor, new JobClientMessages.AttachToJobAndWait(jobID), new Timeout(AkkaUtils.INF_TIMEOUT()));
    return new JobListeningContext(jobID, attachmentFuture, jobClientActor, timeout, actorSystem, configuration);
}
use of akka.actor.ActorRef in project flink by apache.
the class JobClient method submitJob.
/**
 * Submits a job to a Flink cluster (non-blocking) and returns a JobListeningContext which can be
 * passed to {@code awaitJobResult} to get the result of the submission.
 *
 * <p>This method starts a {@code JobSubmissionClientActor} in the given actor system and asks
 * it to submit the job graph. The pending answer is wrapped, together with the actor, in the
 * returned context.
 *
 * @param actorSystem actor system in which the client actor is started; must not be null
 * @param config configuration passed to the client actor (not null-checked by this method)
 * @param leaderRetrievalService service passed to the client actor; must not be null
 * @param jobGraph job to submit; must not be null
 * @param timeout timeout passed to the client actor and stored in the returned context;
 *        must not be null
 * @param sysoutLogUpdates flag passed through to the client actor
 * @param classLoader class loader stored in the returned context
 * @return JobListeningContext which may be used to retrieve the JobExecutionResult via
 * {@code awaitJobResult(JobListeningContext context)}.
 */
public static JobListeningContext submitJob(ActorSystem actorSystem, Configuration config, LeaderRetrievalService leaderRetrievalService, JobGraph jobGraph, FiniteDuration timeout, boolean sysoutLogUpdates, ClassLoader classLoader) {
    checkNotNull(actorSystem, "The actorSystem must not be null.");
    // the message names the argument that is actually being checked
    checkNotNull(leaderRetrievalService, "The leaderRetrievalService must not be null.");
    checkNotNull(jobGraph, "The jobGraph must not be null.");
    checkNotNull(timeout, "The timeout must not be null.");
    // for this job, we create a proxy JobClientActor that deals with all communication with
    // the JobManager. It forwards the job submission, checks the success/failure responses, logs
    // update messages, watches for disconnect between client and JobManager, ...
    Props jobClientActorProps = JobSubmissionClientActor.createActorProps(leaderRetrievalService, timeout, sysoutLogUpdates, config);
    ActorRef jobClientActor = actorSystem.actorOf(jobClientActorProps);
    // the ask itself uses AkkaUtils.INF_TIMEOUT(); the caller-supplied timeout travels in the context
    Future<Object> submissionFuture = Patterns.ask(jobClientActor, new JobClientMessages.SubmitJobAndWait(jobGraph), new Timeout(AkkaUtils.INF_TIMEOUT()));
    return new JobListeningContext(jobGraph.getJobID(), submissionFuture, jobClientActor, timeout, classLoader);
}
use of akka.actor.ActorRef in project flink by apache.
the class JobClient method awaitJobResult.
/**
 * Given a JobListeningContext, awaits the result of the job execution that this context is bound to.
 *
 * <p>The method blocks until the context's result future completes. Whenever waiting for one
 * timeout interval elapses without a result, the job client actor is pinged with an
 * {@code Identify} message; if that ping fails while the result is still outstanding, the
 * method gives up. After the result has been fetched, the client actor is sent a
 * {@code PoisonPill}.
 *
 * @param listeningContext The listening context of the job execution
 * @return The result of the execution
 * @throws JobExecutionException if anything goes wrong while monitoring the job: the waiting
 *         thread is interrupted (the interrupt flag is restored before throwing), the client
 *         actor does not answer the ping, the result cannot be retrieved or deserialized, the
 *         job failed, the job was not found, or the answer has an unknown type
 */
public static JobExecutionResult awaitJobResult(JobListeningContext listeningContext) throws JobExecutionException {
    final JobID jobID = listeningContext.getJobID();
    final ActorRef jobClientActor = listeningContext.getJobClientActor();
    final Future<Object> jobSubmissionFuture = listeningContext.getJobResultFuture();
    final FiniteDuration askTimeout = listeningContext.getTimeout();
    // retrieves class loader if necessary
    final ClassLoader classLoader = listeningContext.getClassLoader();

    // ping the JobClientActor from time to time to check if it is still running
    while (!jobSubmissionFuture.isCompleted()) {
        try {
            Await.ready(jobSubmissionFuture, askTimeout);
        } catch (InterruptedException e) {
            // restore the interrupt flag so that code further up the stack can still observe
            // the interruption, and keep the original exception as the cause
            Thread.currentThread().interrupt();
            throw new JobExecutionException(jobID, "Interrupted while waiting for job completion.", e);
        } catch (TimeoutException e) {
            try {
                // Ping the Actor to see if it is alive
                Await.result(
                    Patterns.ask(jobClientActor, new Identify(true), Timeout.durationToTimeout(askTimeout)),
                    askTimeout);
                // we got a reply, continue waiting for the job result
            } catch (Exception eInner) {
                // the health check failed; only an error if the result has not arrived meanwhile
                if (!jobSubmissionFuture.isCompleted()) {
                    throw new JobExecutionException(jobID, "JobClientActor seems to have died before the JobExecutionResult could be retrieved.", eInner);
                }
            }
        }
    }

    final Object answer;
    try {
        // we have already awaited the result, zero time to wait here
        answer = Await.result(jobSubmissionFuture, Duration.Zero());
    } catch (Throwable throwable) {
        throw new JobExecutionException(jobID, "Couldn't retrieve the JobExecutionResult from the JobManager.", throwable);
    } finally {
        // failsafe shutdown of the client actor
        jobClientActor.tell(PoisonPill.getInstance(), ActorRef.noSender());
    }

    // second block handles the actual response
    if (answer instanceof JobManagerMessages.JobResultSuccess) {
        LOG.info("Job execution complete");
        SerializedJobExecutionResult result = ((JobManagerMessages.JobResultSuccess) answer).result();
        if (result != null) {
            try {
                return result.toJobExecutionResult(classLoader);
            } catch (Throwable t) {
                // keep the deserialization failure as the cause instead of discarding it
                throw new JobExecutionException(jobID, "Job was successfully executed but JobExecutionResult could not be deserialized.", t);
            }
        } else {
            throw new JobExecutionException(jobID, "Job was successfully executed but result contained a null JobExecutionResult.");
        }
    } else if (answer instanceof JobManagerMessages.JobResultFailure) {
        LOG.info("Job execution failed");
        SerializedThrowable serThrowable = ((JobManagerMessages.JobResultFailure) answer).cause();
        if (serThrowable != null) {
            Throwable cause = serThrowable.deserializeError(classLoader);
            if (cause instanceof JobExecutionException) {
                // rethrow as-is to avoid wrapping a JobExecutionException in another one
                throw (JobExecutionException) cause;
            } else {
                throw new JobExecutionException(jobID, "Job execution failed", cause);
            }
        } else {
            throw new JobExecutionException(jobID, "Job execution failed with null as failure cause.");
        }
    } else if (answer instanceof JobManagerMessages.JobNotFound) {
        throw new JobRetrievalException(((JobManagerMessages.JobNotFound) answer).jobID(), "Couldn't retrieve Job " + jobID + " because it was not running.");
    } else {
        throw new JobExecutionException(jobID, "Unknown answer from JobManager after submitting the job: " + answer);
    }
}
use of akka.actor.ActorRef in project flink by apache.
the class JobClientActor method handleMessage.
/**
 * Dispatches an incoming message by its type.
 *
 * <ul>
 *   <li>Execution state / job status changes are logged and printed.</li>
 *   <li>{@code JobManagerLeaderAddress}: disconnects from the current JobManager, stores the
 *       new leader session ID and, if an address is given, resolves it asynchronously; the
 *       resolved reference is sent back to this actor as a {@code JobManagerActorRef}.</li>
 *   <li>{@code JobManagerActorRef}: connects to the resolved JobManager.</li>
 *   <li>{@code JobResultMessage}: forwarded to the client (if connected), then this actor
 *       terminates.</li>
 *   <li>{@code Terminated}: if it concerns the current JobManager, disconnects and, if a
 *       client is connected, schedules a connection timeout message; otherwise a warning
 *       is logged.</li>
 *   <li>Connection timeout: if still not connected to a JobManager, a failure is sent to the
 *       client (if connected) and this actor terminates.</li>
 *   <li>Everything else is passed to {@code handleCustomMessage}, unless this actor is
 *       already marked for termination, in which case the sender receives a failure.</li>
 * </ul>
 *
 * @param message the message to process
 */
@Override
protected void handleMessage(Object message) {
    if (message instanceof ExecutionGraphMessages.ExecutionStateChanged) {
        logAndPrintMessage((ExecutionGraphMessages.ExecutionStateChanged) message);
    } else if (message instanceof ExecutionGraphMessages.JobStatusChanged) {
        logAndPrintMessage((ExecutionGraphMessages.JobStatusChanged) message);
    } else if (message instanceof JobManagerLeaderAddress) {
        JobManagerLeaderAddress msg = (JobManagerLeaderAddress) message;
        if (jobManager != null) {
            // only print this message when we had been connected to a JobManager before
            logAndPrintMessage("New JobManager elected. Connecting to " + msg.address());
        }
        disconnectFromJobManager();
        this.leaderSessionID = msg.leaderSessionID();
        if (msg.address() != null) {
            // Resolve the job manager leader address to obtain an ActorRef
            AkkaUtils.getActorRefFuture(msg.address(), getContext().system(), timeout).onSuccess(new OnSuccess<ActorRef>() {
                @Override
                public void onSuccess(ActorRef result) throws Throwable {
                    // hand the resolved reference back to this actor as a regular message
                    getSelf().tell(decorateMessage(new JobManagerActorRef(result)), ActorRef.noSender());
                }
            }, getContext().dispatcher());
        }
    } else if (message instanceof JobManagerActorRef) {
        // Resolved JobManager ActorRef
        JobManagerActorRef msg = (JobManagerActorRef) message;
        connectToJobManager(msg.jobManager());
        logAndPrintMessage("Connected to JobManager at " + msg.jobManager());
        connectedToJobManager();
    } else if (message instanceof JobManagerMessages.JobResultMessage) {
        // client is only interested in the final job result
        if (LOG.isDebugEnabled()) {
            LOG.debug("Received {} message from JobManager", message.getClass().getSimpleName());
        }
        // forward the success to the original client
        if (isClientConnected()) {
            this.client.tell(decorateMessage(message), getSelf());
        }
        terminate();
    } else if (message instanceof Terminated) {
        ActorRef target = ((Terminated) message).getActor();
        // jobManager can be null (see the null check in the leader-address branch), so guard
        // against it; a Terminated message received in that state is treated as unknown
        if (jobManager != null && jobManager.equals(target)) {
            LOG.info("Lost connection to JobManager {}. Triggering connection timeout.", jobManager.path());
            disconnectFromJobManager();
            // ConnectionTimeout extends RequiresLeaderSessionID
            if (isClientConnected()) {
                getContext().system().scheduler().scheduleOnce(timeout, getSelf(), decorateMessage(JobClientMessages.getConnectionTimeout()), getContext().dispatcher(), ActorRef.noSender());
            }
        } else {
            LOG.warn("Received 'Terminated' for unknown actor {}", target);
        }
    } else if (JobClientMessages.getConnectionTimeout().equals(message)) {
        // check if we haven't found a job manager yet
        if (!isJobManagerConnected()) {
            final JobClientActorConnectionTimeoutException errorMessage = new JobClientActorConnectionTimeoutException("Lost connection to the JobManager.");
            final Object replyMessage = decorateMessage(new Status.Failure(errorMessage));
            if (isClientConnected()) {
                client.tell(replyMessage, getSelf());
            }
            // Connection timeout reached, let's terminate
            terminate();
        }
    } else if (!isJobManagerConnected() && getClientMessageClass().equals(message.getClass())) {
        LOG.info("Received {} but there is no connection to a JobManager yet.", message);
        // We want to submit/attach to a job, but we haven't found a job manager yet.
        // Let's give him another chance to find a job manager within the given timeout.
        getContext().system().scheduler().scheduleOnce(timeout, getSelf(), decorateMessage(JobClientMessages.getConnectionTimeout()), getContext().dispatcher(), ActorRef.noSender());
        handleCustomMessage(message);
    } else {
        if (!toBeTerminated) {
            handleCustomMessage(message);
        } else {
            // we're about to receive a PoisonPill because toBeTerminated == true
            String msg = getClass().getName() + " is about to be terminated. Therefore, the " + "job submission cannot be executed.";
            LOG.error(msg);
            getSender().tell(decorateMessage(new Status.Failure(new Exception(msg))), ActorRef.noSender());
        }
    }
}
Aggregations