Search in sources :

Example 11 with ActorRef

use of akka.actor.ActorRef in project flink by apache.

the class TaskManagerMetricsTest method testMetricRegistryLifeCycle.

/**
 * Checks that the TaskManager's metric registry is still alive after the TaskManager
 * has been told to re-connect to the JobManager.
 *
 * <p>The test starts a JobManager and a TaskManager in a local actor system, waits for the
 * first registration, sends a new leader address to trigger a re-registration, waits for the
 * second registration and finally asserts that the registry was not shut down.
 */
@Test
public void testMetricRegistryLifeCycle() throws Exception {
    ActorSystem localSystem = null;
    try {
        localSystem = AkkaUtils.createLocalActorSystem(new Configuration());

        // ---------------------------- JobManager ----------------------------
        final ActorRef jmActor = JobManager.startJobManagerActors(
            new Configuration(),
            localSystem,
            TestingUtils.defaultExecutor(),
            TestingUtils.defaultExecutor(),
            JobManager.class,
            MemoryArchivist.class)._1();
        final String jmPath = jmActor.path().toString();
        LeaderRetrievalService leaderRetriever = new StandaloneLeaderRetrievalService(jmPath);

        // ---------------------------- TaskManager ---------------------------
        final Configuration tmConfig = new Configuration();
        final ResourceID resourceId = ResourceID.generate();
        TaskManagerServicesConfiguration servicesConfig =
            TaskManagerServicesConfiguration.fromConfiguration(tmConfig, InetAddress.getLocalHost(), false);
        TaskManagerConfiguration managerConfig = TaskManagerConfiguration.fromConfiguration(tmConfig);
        TaskManagerServices services = TaskManagerServices.fromConfiguration(servicesConfig, resourceId);

        // the registry whose life cycle is under test
        final MetricRegistry registry = services.getMetricRegistry();

        final Props taskManagerProps = TaskManager.getTaskManagerProps(
            TaskManager.class,
            managerConfig,
            resourceId,
            services.getTaskManagerLocation(),
            services.getMemoryManager(),
            services.getIOManager(),
            services.getNetworkEnvironment(),
            leaderRetriever,
            registry);
        final ActorRef tmActor = localSystem.actorOf(taskManagerProps);

        final FiniteDuration registrationTimeout = new FiniteDuration(5000, TimeUnit.SECONDS);

        new JavaTestKit(localSystem) {

            {
                new Within(registrationTimeout) {

                    @Override
                    protected void run() {
                        // ask to be notified once the TM is registered, then wait for that notification
                        tmActor.tell(TaskManagerMessages.getNotifyWhenRegisteredAtJobManagerMessage(), getTestActor());
                        expectMsgEquals(TaskManagerMessages.getRegisteredAtJobManagerMessage());

                        // announcing the leader address again forces a re-registration,
                        // which should include a disconnect from the current JM
                        tmActor.tell(new TaskManagerMessages.JobManagerLeaderAddress(jmPath, null), jmActor);

                        // block until the re-registration has completed
                        tmActor.tell(TaskManagerMessages.getNotifyWhenRegisteredAtJobManagerMessage(), getTestActor());
                        expectMsgEquals(TaskManagerMessages.getRegisteredAtJobManagerMessage());
                    }
                };
            }
        };

        // the disconnect must not have shut the registry down
        Assert.assertFalse(registry.isShutdown());

        // regular teardown of the actors and the actor system
        localSystem.shutdown();
        localSystem.awaitTermination();
    } finally {
        // make sure the actor system is stopped even if the test failed half-way
        if (localSystem != null) {
            localSystem.shutdown();
        }
    }
}
Also used : ActorSystem(akka.actor.ActorSystem) TaskManagerConfiguration(org.apache.flink.runtime.taskexecutor.TaskManagerConfiguration) MemoryArchivist(org.apache.flink.runtime.jobmanager.MemoryArchivist) Configuration(org.apache.flink.configuration.Configuration) TaskManagerConfiguration(org.apache.flink.runtime.taskexecutor.TaskManagerConfiguration) TaskManagerServicesConfiguration(org.apache.flink.runtime.taskexecutor.TaskManagerServicesConfiguration) ActorRef(akka.actor.ActorRef) TaskManagerServices(org.apache.flink.runtime.taskexecutor.TaskManagerServices) FiniteDuration(scala.concurrent.duration.FiniteDuration) JobManager(org.apache.flink.runtime.jobmanager.JobManager) Props(akka.actor.Props) TaskManagerMessages(org.apache.flink.runtime.messages.TaskManagerMessages) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) LeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService) StandaloneLeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.StandaloneLeaderRetrievalService) StandaloneLeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.StandaloneLeaderRetrievalService) TaskManagerServicesConfiguration(org.apache.flink.runtime.taskexecutor.TaskManagerServicesConfiguration) JavaTestKit(akka.testkit.JavaTestKit) Test(org.junit.Test)

Example 12 with ActorRef

use of akka.actor.ActorRef in project flink by apache.

the class JobClient method attachToRunningJob.

/**
 * Attaches to a running Job using the JobID.
 *
 * <p>Creates a {@code JobAttachmentClientActor} in the given actor system, asks it to attach to
 * the job and wait for its result, and wraps the pending answer in a {@link JobListeningContext}.
 * The actor system and configuration are handed to the context; according to the original
 * documentation they are used to reconstruct the user class loader by downloading the jars
 * from the JobManager.
 *
 * @param jobID the ID of the job to attach to
 * @param jobManagerGateWay gateway to the JobManager; only null-checked in this method
 * @param configuration configuration passed on to the returned context
 * @param actorSystem actor system in which the client actor is started
 * @param leaderRetrievalService service handed to the client actor's props
 * @param timeout timeout handed to the client actor and to the returned context
 * @param sysoutLogUpdates flag forwarded to the client actor's props
 * @return the context holding the job ID, the pending result future and the client actor
 * @throws NullPointerException presumably, if any non-primitive argument is null
 *         (depends on the {@code checkNotNull} in scope)
 */
public static JobListeningContext attachToRunningJob(JobID jobID, ActorGateway jobManagerGateWay, Configuration configuration, ActorSystem actorSystem, LeaderRetrievalService leaderRetrievalService, FiniteDuration timeout, boolean sysoutLogUpdates) {
    checkNotNull(jobID, "The jobID must not be null.");
    checkNotNull(jobManagerGateWay, "The jobManagerGateWay must not be null.");
    checkNotNull(configuration, "The configuration must not be null.");
    checkNotNull(actorSystem, "The actorSystem must not be null.");
    // the message must name the argument that is actually being checked
    checkNotNull(leaderRetrievalService, "The leaderRetrievalService must not be null.");
    checkNotNull(timeout, "The timeout must not be null.");
    // we create a proxy JobClientActor that deals with all communication with
    // the JobManager. It forwards the job attachments, checks the success/failure responses, logs
    // update messages, watches for disconnect between client and JobManager, ...
    Props jobClientActorProps = JobAttachmentClientActor.createActorProps(leaderRetrievalService, timeout, sysoutLogUpdates);
    ActorRef jobClientActor = actorSystem.actorOf(jobClientActorProps);
    // the ask itself uses the infinite timeout; liveness of the actor is checked by whoever awaits the result
    Future<Object> attachmentFuture = Patterns.ask(jobClientActor, new JobClientMessages.AttachToJobAndWait(jobID), new Timeout(AkkaUtils.INF_TIMEOUT()));
    return new JobListeningContext(jobID, attachmentFuture, jobClientActor, timeout, actorSystem, configuration);
}
Also used : ActorRef(akka.actor.ActorRef) JobClientMessages(org.apache.flink.runtime.messages.JobClientMessages) Timeout(akka.util.Timeout) Props(akka.actor.Props)

Example 13 with ActorRef

use of akka.actor.ActorRef in project flink by apache.

the class JobClient method submitJob.

/**
 * Submits a job to a Flink cluster (non-blocking) and returns a JobListeningContext which can be
 * passed to {@code awaitJobResult} to get the result of the submission.
 *
 * <p>Creates a {@code JobSubmissionClientActor} in the given actor system and asks it to submit
 * the job graph and wait for the result; the pending answer is stored in the returned context.
 *
 * @param actorSystem actor system in which the client actor is started
 * @param config configuration forwarded to the client actor's props
 * @param leaderRetrievalService service handed to the client actor's props
 * @param jobGraph the job to submit; its job ID is stored in the returned context
 * @param timeout timeout handed to the client actor and to the returned context
 * @param sysoutLogUpdates flag forwarded to the client actor's props
 * @param classLoader class loader stored in the returned context
 * @return JobListeningContext which may be used to retrieve the JobExecutionResult via
 *         {@code awaitJobResult(JobListeningContext context)}.
 */
public static JobListeningContext submitJob(ActorSystem actorSystem, Configuration config, LeaderRetrievalService leaderRetrievalService, JobGraph jobGraph, FiniteDuration timeout, boolean sysoutLogUpdates, ClassLoader classLoader) {
    checkNotNull(actorSystem, "The actorSystem must not be null.");
    // the message must name the argument that is actually being checked
    checkNotNull(leaderRetrievalService, "The leaderRetrievalService must not be null.");
    checkNotNull(jobGraph, "The jobGraph must not be null.");
    checkNotNull(timeout, "The timeout must not be null.");
    // for this job, we create a proxy JobClientActor that deals with all communication with
    // the JobManager. It forwards the job submission, checks the success/failure responses, logs
    // update messages, watches for disconnect between client and JobManager, ...
    Props jobClientActorProps = JobSubmissionClientActor.createActorProps(leaderRetrievalService, timeout, sysoutLogUpdates, config);
    ActorRef jobClientActor = actorSystem.actorOf(jobClientActorProps);
    // the ask itself uses the infinite timeout; liveness of the actor is checked by whoever awaits the result
    Future<Object> submissionFuture = Patterns.ask(jobClientActor, new JobClientMessages.SubmitJobAndWait(jobGraph), new Timeout(AkkaUtils.INF_TIMEOUT()));
    return new JobListeningContext(jobGraph.getJobID(), submissionFuture, jobClientActor, timeout, classLoader);
}
Also used : ActorRef(akka.actor.ActorRef) JobClientMessages(org.apache.flink.runtime.messages.JobClientMessages) Timeout(akka.util.Timeout) Props(akka.actor.Props)

Example 14 with ActorRef

use of akka.actor.ActorRef in project flink by apache.

the class JobClient method awaitJobResult.

/**
 * Given a JobListeningContext, awaits the result of the job execution that this context is bound to.
 *
 * <p>The method blocks until the context's result future completes. Whenever waiting times out,
 * the client actor is pinged with an {@code Identify} message; if the ping fails while the result
 * is still outstanding, the method gives up. Once the answer is available (or retrieving it
 * failed) a {@code PoisonPill} is sent to the client actor.
 *
 * @param listeningContext The listening context of the job execution
 * @return The result of the execution
 * @throws JobExecutionException if anything goes wrong while monitoring the job: the waiting
 *         thread is interrupted (the interrupt flag is restored before throwing), the client
 *         actor does not answer the ping, the answer cannot be retrieved or deserialized, the
 *         job failed, the job was not found, or the answer has an unknown type
 */
public static JobExecutionResult awaitJobResult(JobListeningContext listeningContext) throws JobExecutionException {
    final JobID jobID = listeningContext.getJobID();
    final ActorRef jobClientActor = listeningContext.getJobClientActor();
    final Future<Object> jobSubmissionFuture = listeningContext.getJobResultFuture();
    final FiniteDuration askTimeout = listeningContext.getTimeout();
    // retrieves class loader if necessary
    final ClassLoader classLoader = listeningContext.getClassLoader();
    // ping the JobClientActor from time to time to check if it is still running
    while (!jobSubmissionFuture.isCompleted()) {
        try {
            Await.ready(jobSubmissionFuture, askTimeout);
        } catch (InterruptedException e) {
            // restore the interrupt flag so that code further up the stack can still observe it,
            // and keep the original exception as the cause
            Thread.currentThread().interrupt();
            throw new JobExecutionException(jobID, "Interrupted while waiting for job completion.", e);
        } catch (TimeoutException e) {
            try {
                // Ping the Actor to see if it is alive
                Await.result(Patterns.ask(jobClientActor, new Identify(true), Timeout.durationToTimeout(askTimeout)), askTimeout);
                // we got a reply, continue waiting for the job result
            } catch (Exception eInner) {
                // the health check failed; this only matters if the result has not arrived
                // in the meantime
                if (!jobSubmissionFuture.isCompleted()) {
                    throw new JobExecutionException(jobID, "JobClientActor seems to have died before the JobExecutionResult could be retrieved.", eInner);
                }
            }
        }
    }
    final Object answer;
    try {
        // we have already awaited the result, zero time to wait here
        answer = Await.result(jobSubmissionFuture, Duration.Zero());
    } catch (Throwable throwable) {
        throw new JobExecutionException(jobID, "Couldn't retrieve the JobExecutionResult from the JobManager.", throwable);
    } finally {
        // failsafe shutdown of the client actor
        jobClientActor.tell(PoisonPill.getInstance(), ActorRef.noSender());
    }
    // second block handles the actual response
    if (answer instanceof JobManagerMessages.JobResultSuccess) {
        LOG.info("Job execution complete");
        SerializedJobExecutionResult result = ((JobManagerMessages.JobResultSuccess) answer).result();
        if (result != null) {
            try {
                return result.toJobExecutionResult(classLoader);
            } catch (Throwable t) {
                // keep the deserialization failure as the cause instead of discarding it
                throw new JobExecutionException(jobID, "Job was successfully executed but JobExecutionResult could not be deserialized.", t);
            }
        } else {
            throw new JobExecutionException(jobID, "Job was successfully executed but result contained a null JobExecutionResult.");
        }
    } else if (answer instanceof JobManagerMessages.JobResultFailure) {
        LOG.info("Job execution failed");
        SerializedThrowable serThrowable = ((JobManagerMessages.JobResultFailure) answer).cause();
        if (serThrowable != null) {
            Throwable cause = serThrowable.deserializeError(classLoader);
            if (cause instanceof JobExecutionException) {
                // already the right type, rethrow as is
                throw (JobExecutionException) cause;
            } else {
                throw new JobExecutionException(jobID, "Job execution failed", cause);
            }
        } else {
            throw new JobExecutionException(jobID, "Job execution failed with null as failure cause.");
        }
    } else if (answer instanceof JobManagerMessages.JobNotFound) {
        throw new JobRetrievalException(((JobManagerMessages.JobNotFound) answer).jobID(), "Couldn't retrieve Job " + jobID + " because it was not running.");
    } else {
        throw new JobExecutionException(jobID, "Unknown answer from JobManager after submitting the job: " + answer);
    }
}
Also used : ActorRef(akka.actor.ActorRef) JobManagerMessages(org.apache.flink.runtime.messages.JobManagerMessages) FiniteDuration(scala.concurrent.duration.FiniteDuration) Identify(akka.actor.Identify) TimeoutException(java.util.concurrent.TimeoutException) IOException(java.io.IOException) FlinkUserCodeClassLoader(org.apache.flink.runtime.execution.librarycache.FlinkUserCodeClassLoader) SerializedThrowable(org.apache.flink.runtime.util.SerializedThrowable) JobID(org.apache.flink.api.common.JobID) TimeoutException(java.util.concurrent.TimeoutException) SerializedThrowable(org.apache.flink.runtime.util.SerializedThrowable)

Example 15 with ActorRef

use of akka.actor.ActorRef in project flink by apache.

the class JobClientActor method handleMessage.

/**
 * Dispatches an incoming message by its type.
 *
 * <ul>
 *   <li>Execution state and job status changes are logged and printed.</li>
 *   <li>A {@code JobManagerLeaderAddress} disconnects from the current JobManager, stores the
 *       new leader session ID and, if an address is given, resolves it asynchronously; the
 *       resolved reference is sent back to this actor as a {@code JobManagerActorRef}.</li>
 *   <li>A {@code JobManagerActorRef} connects to the resolved JobManager.</li>
 *   <li>A {@code JobResultMessage} is forwarded to the client (if connected) and terminates
 *       this actor.</li>
 *   <li>A {@code Terminated} for the current JobManager disconnects and, if a client is
 *       connected, schedules a connection timeout.</li>
 *   <li>A connection timeout while no JobManager is connected reports a failure to the client
 *       (if connected) and terminates this actor.</li>
 *   <li>Everything else goes to {@code handleCustomMessage}, unless this actor is about to be
 *       terminated, in which case the sender receives a failure.</li>
 * </ul>
 *
 * @param message the received message
 */
@Override
protected void handleMessage(Object message) {
    if (message instanceof ExecutionGraphMessages.ExecutionStateChanged) {
        logAndPrintMessage((ExecutionGraphMessages.ExecutionStateChanged) message);
    } else if (message instanceof ExecutionGraphMessages.JobStatusChanged) {
        logAndPrintMessage((ExecutionGraphMessages.JobStatusChanged) message);
    } else if (message instanceof JobManagerLeaderAddress) {
        JobManagerLeaderAddress msg = (JobManagerLeaderAddress) message;
        if (jobManager != null) {
            // only print this message when we had been connected to a JobManager before
            logAndPrintMessage("New JobManager elected. Connecting to " + msg.address());
        }
        disconnectFromJobManager();
        this.leaderSessionID = msg.leaderSessionID();
        if (msg.address() != null) {
            // Resolve the job manager leader address to obtain an ActorRef
            AkkaUtils.getActorRefFuture(msg.address(), getContext().system(), timeout).onSuccess(new OnSuccess<ActorRef>() {

                @Override
                public void onSuccess(ActorRef result) throws Throwable {
                    // hand the resolved reference back to this actor as a regular message
                    getSelf().tell(decorateMessage(new JobManagerActorRef(result)), ActorRef.noSender());
                }
            }, getContext().dispatcher());
        }
    } else if (message instanceof JobManagerActorRef) {
        // Resolved JobManager ActorRef
        JobManagerActorRef msg = (JobManagerActorRef) message;
        connectToJobManager(msg.jobManager());
        logAndPrintMessage("Connected to JobManager at " + msg.jobManager());
        connectedToJobManager();
    } else if (message instanceof JobManagerMessages.JobResultMessage) {
        // client is only interested in the final job result
        if (LOG.isDebugEnabled()) {
            LOG.debug("Received {} message from JobManager", message.getClass().getSimpleName());
        }
        // forward the success to the original client
        if (isClientConnected()) {
            this.client.tell(decorateMessage(message), getSelf());
        }
        terminate();
    } else if (message instanceof Terminated) {
        ActorRef target = ((Terminated) message).getActor();
        // jobManager may be null here (the leader-address branch above checks for that),
        // so guard the comparison instead of risking a NullPointerException
        if (jobManager != null && jobManager.equals(target)) {
            LOG.info("Lost connection to JobManager {}. Triggering connection timeout.", jobManager.path());
            disconnectFromJobManager();
            // ConnectionTimeout extends RequiresLeaderSessionID
            if (isClientConnected()) {
                getContext().system().scheduler().scheduleOnce(timeout, getSelf(), decorateMessage(JobClientMessages.getConnectionTimeout()), getContext().dispatcher(), ActorRef.noSender());
            }
        } else {
            LOG.warn("Received 'Terminated' for unknown actor {}", target);
        }
    } else if (JobClientMessages.getConnectionTimeout().equals(message)) {
        // check if we haven't found a job manager yet
        if (!isJobManagerConnected()) {
            final JobClientActorConnectionTimeoutException errorMessage = new JobClientActorConnectionTimeoutException("Lost connection to the JobManager.");
            final Object replyMessage = decorateMessage(new Status.Failure(errorMessage));
            if (isClientConnected()) {
                client.tell(replyMessage, getSelf());
            }
            // Connection timeout reached, let's terminate
            terminate();
        }
    } else if (!isJobManagerConnected() && getClientMessageClass().equals(message.getClass())) {
        LOG.info("Received {} but there is no connection to a JobManager yet.", message);
        // We want to submit/attach to a job, but we haven't found a job manager yet.
        // Let's give him another chance to find a job manager within the given timeout.
        getContext().system().scheduler().scheduleOnce(timeout, getSelf(), decorateMessage(JobClientMessages.getConnectionTimeout()), getContext().dispatcher(), ActorRef.noSender());
        handleCustomMessage(message);
    } else {
        if (!toBeTerminated) {
            handleCustomMessage(message);
        } else {
            // we're about to receive a PoisonPill because toBeTerminated == true
            String msg = getClass().getName() + " is about to be terminated. Therefore, the " + "job submission cannot be executed.";
            LOG.error(msg);
            getSender().tell(decorateMessage(new Status.Failure(new Exception(msg))), ActorRef.noSender());
        }
    }
}
Also used : JobStatus(org.apache.flink.runtime.jobgraph.JobStatus) Status(akka.actor.Status) JobManagerLeaderAddress(org.apache.flink.runtime.messages.JobClientMessages.JobManagerLeaderAddress) JobManagerActorRef(org.apache.flink.runtime.messages.JobClientMessages.JobManagerActorRef) ActorRef(akka.actor.ActorRef) Terminated(akka.actor.Terminated) JobManagerActorRef(org.apache.flink.runtime.messages.JobClientMessages.JobManagerActorRef) ExecutionGraphMessages(org.apache.flink.runtime.messages.ExecutionGraphMessages)

Aggregations

ActorRef (akka.actor.ActorRef)383 Test (org.junit.Test)253 TestActorRef (akka.testkit.TestActorRef)124 TestKit (akka.testkit.javadsl.TestKit)84 ActorSystem (akka.actor.ActorSystem)53 FiniteDuration (scala.concurrent.duration.FiniteDuration)53 Props (akka.actor.Props)46 Timeout (akka.util.Timeout)43 Configuration (org.apache.flink.configuration.Configuration)42 AbstractShardManagerTest (org.opendaylight.controller.cluster.datastore.AbstractShardManagerTest)38 UpdateSchemaContext (org.opendaylight.controller.cluster.datastore.messages.UpdateSchemaContext)37 AkkaActorGateway (org.apache.flink.runtime.instance.AkkaActorGateway)33 ActorGateway (org.apache.flink.runtime.instance.ActorGateway)30 ActorInitialized (org.opendaylight.controller.cluster.datastore.messages.ActorInitialized)26 NormalizedNodeAggregatorTest (org.opendaylight.controller.cluster.datastore.utils.NormalizedNodeAggregatorTest)26 AddressFromURIString (akka.actor.AddressFromURIString)24 ArrayList (java.util.ArrayList)22 JobID (org.apache.flink.api.common.JobID)22 IOException (java.io.IOException)21 RoleChangeNotification (org.opendaylight.controller.cluster.notifications.RoleChangeNotification)20