Example 6 with Timeout

Use of akka.util.Timeout in project flink by apache.

The class YarnClusterClient, method getNewMessages().

@Override
public List<String> getNewMessages() {
    if (hasBeenShutdown()) {
        throw new RuntimeException("The YarnClusterClient has already been stopped");
    }
    if (!isConnected) {
        throw new IllegalStateException("The cluster has not been connected to the ApplicationMaster.");
    }
    List<String> ret = new ArrayList<String>();
    // get messages from ApplicationClient (locally)
    while (true) {
        Object result;
        try {
            Future<Object> response = Patterns.ask(applicationClient.get(), YarnMessages.getLocalGetYarnMessage(), new Timeout(akkaDuration));
            result = Await.result(response, akkaDuration);
        } catch (Exception ioe) {
            LOG.warn("Error retrieving the YARN messages locally", ioe);
            break;
        }
        if (!(result instanceof Option)) {
            throw new RuntimeException("LocalGetYarnMessage requires a response of type " + "Option. Instead the response is of type " + result.getClass() + ".");
        } else {
            Option messageOption = (Option) result;
            LOG.debug("Received message option {}", messageOption);
            if (messageOption.isEmpty()) {
                break;
            } else {
                Object obj = messageOption.get();
                if (obj instanceof InfoMessage) {
                    InfoMessage msg = (InfoMessage) obj;
                    ret.add("[" + msg.date() + "] " + msg.message());
                } else {
                    LOG.warn("LocalGetYarnMessage returned unexpected type: " + messageOption);
                }
            }
        }
    }
    return ret;
}
Also used : Timeout(akka.util.Timeout) InfoMessage(org.apache.flink.runtime.clusterframework.messages.InfoMessage) ArrayList(java.util.ArrayList) Option(scala.Option) ProgramInvocationException(org.apache.flink.client.program.ProgramInvocationException) YarnException(org.apache.hadoop.yarn.exceptions.YarnException) IOException(java.io.IOException)
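
All of these Flink examples follow the same classic-Akka ask pattern: wrap a FiniteDuration in an akka.util.Timeout, send the message with Patterns.ask, and block on the returned Future with Await. The following is a minimal self-contained sketch of that pattern, not Flink code; EchoActor and the "timeout-sketch" system name are made up for illustration, and it assumes the classic Akka 2.4.x Java API.

import java.util.concurrent.TimeUnit;

import akka.actor.ActorRef;
import akka.actor.ActorSystem;
import akka.actor.Props;
import akka.actor.UntypedActor;
import akka.pattern.Patterns;
import akka.util.Timeout;

import scala.concurrent.Await;
import scala.concurrent.Future;
import scala.concurrent.duration.FiniteDuration;

public class AskWithTimeoutSketch {

    // Hypothetical actor that simply echoes every message back to its sender.
    public static class EchoActor extends UntypedActor {
        @Override
        public void onReceive(Object message) {
            getSender().tell(message, getSelf());
        }
    }

    public static void main(String[] args) throws Exception {
        ActorSystem system = ActorSystem.create("timeout-sketch");
        try {
            ActorRef echo = system.actorOf(Props.create(EchoActor.class));

            // Same shape as getNewMessages(): bound the ask with a Timeout,
            // then block on the future for at most the same duration.
            FiniteDuration askDuration = new FiniteDuration(10, TimeUnit.SECONDS);
            Future<Object> response = Patterns.ask(echo, "ping", new Timeout(askDuration));
            Object result = Await.result(response, askDuration);

            System.out.println("Received: " + result);
        } finally {
            system.terminate();
        }
    }
}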

Example 7 with Timeout

Use of akka.util.Timeout in project flink by apache.

The class YarnClusterClient, method shutdownCluster().

/**
	 * Shuts down the Yarn application
	 */
public void shutdownCluster() {
    if (hasBeenShutDown.getAndSet(true)) {
        return;
    }
    if (!isConnected) {
        throw new IllegalStateException("The cluster has not been connected to the ApplicationMaster.");
    }
    try {
        Runtime.getRuntime().removeShutdownHook(clientShutdownHook);
    } catch (IllegalStateException e) {
    // we are already in the shutdown hook
    }
    LOG.info("Sending shutdown request to the Application Master");
    try {
        Future<Object> response = Patterns.ask(applicationClient.get(), new YarnMessages.LocalStopYarnSession(getApplicationStatus(), "Flink YARN Client requested shutdown"), new Timeout(akkaDuration));
        Await.ready(response, akkaDuration);
    } catch (Exception e) {
        LOG.warn("Error while stopping YARN cluster.", e);
    }
    try {
        File propertiesFile = FlinkYarnSessionCli.getYarnPropertiesLocation(flinkConfig);
        if (propertiesFile.isFile()) {
            if (propertiesFile.delete()) {
                LOG.info("Deleted Yarn properties file at {}", propertiesFile.getAbsoluteFile().toString());
            } else {
                LOG.warn("Couldn't delete Yarn properties file at {}", propertiesFile.getAbsoluteFile().toString());
            }
        }
    } catch (Exception e) {
        LOG.warn("Exception while deleting the JobManager address file", e);
    }
    if (sessionFilesDir != null) {
        LOG.info("Deleting files in " + sessionFilesDir);
        try {
            FileSystem shutFS = FileSystem.get(hadoopConfig);
            // delete conf and jar file.
            shutFS.delete(sessionFilesDir, true);
            shutFS.close();
        } catch (IOException e) {
            LOG.error("Could not delete the Flink jar and configuration files in HDFS.", e);
        }
    } else {
        LOG.warn("Session file directory not set. Not deleting session files");
    }
    try {
        pollingRunner.stopRunner();
        pollingRunner.join(1000);
    } catch (InterruptedException e) {
        LOG.warn("Shutdown of the polling runner was interrupted", e);
        Thread.currentThread().interrupt();
    }
    try {
        ApplicationReport appReport = yarnClient.getApplicationReport(appId);
        LOG.info("Application " + appId + " finished with state " + appReport.getYarnApplicationState() + " and final state " + appReport.getFinalApplicationStatus() + " at " + appReport.getFinishTime());
        if (appReport.getYarnApplicationState() == YarnApplicationState.FAILED || appReport.getYarnApplicationState() == YarnApplicationState.KILLED) {
            LOG.warn("Application failed. Diagnostics " + appReport.getDiagnostics());
            LOG.warn("If log aggregation is activated in the Hadoop cluster, we recommend to retrieve " + "the full application log using this command:" + System.lineSeparator() + "\tyarn logs -applicationId " + appReport.getApplicationId() + System.lineSeparator() + "(It sometimes takes a few seconds until the logs are aggregated)");
        }
    } catch (Exception e) {
        LOG.warn("Couldn't get final report", e);
    }
    LOG.info("YARN Client is shutting down");
    // actorRunner is using the yarnClient.
    yarnClient.stop();
    // set null to clearly see if somebody wants to access it afterwards.
    yarnClient = null;
}
Also used : ApplicationReport(org.apache.hadoop.yarn.api.records.ApplicationReport) Timeout(akka.util.Timeout) FileSystem(org.apache.hadoop.fs.FileSystem) IOException(java.io.IOException) File(java.io.File) ProgramInvocationException(org.apache.flink.client.program.ProgramInvocationException) YarnException(org.apache.hadoop.yarn.exceptions.YarnException)
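
Note that shutdownCluster() uses Await.ready rather than Await.result: it only waits for the stop request to complete and then ignores the reply, and a failed future does not rethrow at an Await.ready call. A small sketch of that difference, using akka.dispatch.Futures to build stand-in reply futures (the values and the exception below are made up; no actors are involved):

import java.util.concurrent.TimeUnit;

import akka.dispatch.Futures;

import scala.concurrent.Await;
import scala.concurrent.Future;
import scala.concurrent.duration.FiniteDuration;

public class AwaitReadyVersusResultSketch {

    public static void main(String[] args) throws Exception {
        FiniteDuration atMost = new FiniteDuration(1, TimeUnit.SECONDS);

        // Stand-ins for the future returned by Patterns.ask: one succeeded, one failed.
        Future<Object> succeeded = Futures.successful((Object) "stopped");
        Future<Object> failed = Futures.failed(new RuntimeException("stop request rejected"));

        // Await.result unwraps the value (and would rethrow a failure) ...
        Object value = Await.result(succeeded, atMost);
        System.out.println("result: " + value);

        // ... while Await.ready only waits for completion, so the failed future
        // does not throw here. That is why shutdownCluster() can fire the stop
        // request and ignore the actual reply.
        Await.ready(failed, atMost);
        System.out.println("failed future completed without throwing");
    }
}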

Example 8 with Timeout

Use of akka.util.Timeout in project flink by apache.

The class JobClientActorTest, method testConnectionTimeoutAfterJobSubmission().

/** Tests that a {@link org.apache.flink.runtime.client.JobClientActorConnectionTimeoutException}
	 * is thrown after a successful job submission if the JobManager dies.
	 *
	 * @throws Exception
	 */
@Test(expected = JobClientActorConnectionTimeoutException.class)
public void testConnectionTimeoutAfterJobSubmission() throws Exception {
    FiniteDuration jobClientActorTimeout = new FiniteDuration(5, TimeUnit.SECONDS);
    FiniteDuration timeout = jobClientActorTimeout.$times(2);
    UUID leaderSessionID = UUID.randomUUID();
    ActorRef jobManager = system.actorOf(Props.create(JobAcceptingActor.class, leaderSessionID));
    TestingLeaderRetrievalService testingLeaderRetrievalService = new TestingLeaderRetrievalService(jobManager.path().toString(), leaderSessionID);
    Props jobClientActorProps = JobSubmissionClientActor.createActorProps(testingLeaderRetrievalService, jobClientActorTimeout, false, clientConfig);
    ActorRef jobClientActor = system.actorOf(jobClientActorProps);
    Future<Object> jobExecutionResult = Patterns.ask(jobClientActor, new JobClientMessages.SubmitJobAndWait(testJobGraph), new Timeout(timeout));
    Future<Object> waitFuture = Patterns.ask(jobManager, new RegisterTest(), new Timeout(timeout));
    Await.result(waitFuture, timeout);
    jobManager.tell(PoisonPill.getInstance(), ActorRef.noSender());
    Await.result(jobExecutionResult, timeout);
}
Also used : TestingLeaderRetrievalService(org.apache.flink.runtime.leaderelection.TestingLeaderRetrievalService) ActorRef(akka.actor.ActorRef) JobClientMessages(org.apache.flink.runtime.messages.JobClientMessages) Timeout(akka.util.Timeout) FiniteDuration(scala.concurrent.duration.FiniteDuration) UUID(java.util.UUID) Props(akka.actor.Props) Test(org.junit.Test)
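
The test doubles the per-actor timeout with jobClientActorTimeout.$times(2); $times is simply the Java-visible name of FiniteDuration's * operator. A short sketch of that arithmetic and of the two usual ways to construct an akka.util.Timeout (the concrete values are arbitrary):

import java.util.concurrent.TimeUnit;

import akka.util.Timeout;

import scala.concurrent.duration.FiniteDuration;

public class TimeoutArithmeticSketch {

    public static void main(String[] args) {
        FiniteDuration jobClientActorTimeout = new FiniteDuration(5, TimeUnit.SECONDS);

        // $times(2) is FiniteDuration.*(2): doubling the per-actor timeout leaves
        // head room for the surrounding ask, as in the test above.
        FiniteDuration askDuration = jobClientActorTimeout.$times(2);

        // A Timeout can be built from a FiniteDuration or from a length/unit pair.
        Timeout fromDuration = new Timeout(askDuration);
        Timeout fromLengthAndUnit = new Timeout(10, TimeUnit.SECONDS);

        System.out.println(fromDuration.duration());      // 10 seconds
        System.out.println(fromLengthAndUnit.duration()); // 10 seconds
    }
}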

Example 9 with Timeout

Use of akka.util.Timeout in project flink by apache.

The class JobClientActorTest, method testSubmissionTimeout().

/** Tests that a {@link JobClientActorSubmissionTimeoutException} is thrown when the job cannot
	 * be submitted by the JobSubmissionClientActor. This is the case here because the started JobManager
	 * never replies to a {@link SubmitJob} message.
	 *
	 * @throws Exception
	 */
@Test(expected = JobClientActorSubmissionTimeoutException.class)
public void testSubmissionTimeout() throws Exception {
    FiniteDuration jobClientActorTimeout = new FiniteDuration(5, TimeUnit.SECONDS);
    FiniteDuration timeout = jobClientActorTimeout.$times(2);
    UUID leaderSessionID = UUID.randomUUID();
    ActorRef jobManager = system.actorOf(Props.create(PlainActor.class, leaderSessionID));
    TestingLeaderRetrievalService testingLeaderRetrievalService = new TestingLeaderRetrievalService(jobManager.path().toString(), leaderSessionID);
    Props jobClientActorProps = JobSubmissionClientActor.createActorProps(testingLeaderRetrievalService, jobClientActorTimeout, false, clientConfig);
    ActorRef jobClientActor = system.actorOf(jobClientActorProps);
    Future<Object> jobExecutionResult = Patterns.ask(jobClientActor, new JobClientMessages.SubmitJobAndWait(testJobGraph), new Timeout(timeout));
    Await.result(jobExecutionResult, timeout);
}
Also used : TestingLeaderRetrievalService(org.apache.flink.runtime.leaderelection.TestingLeaderRetrievalService) ActorRef(akka.actor.ActorRef) JobClientMessages(org.apache.flink.runtime.messages.JobClientMessages) Timeout(akka.util.Timeout) FiniteDuration(scala.concurrent.duration.FiniteDuration) UUID(java.util.UUID) Props(akka.actor.Props) Test(org.junit.Test)
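
In this test the JobSubmissionClientActor itself fails the returned future with a JobClientActorSubmissionTimeoutException once its own timeout expires. The plain-Akka counterpart is worth keeping in mind: an ask whose target never replies fails with akka.pattern.AskTimeoutException after the Timeout elapses. A generic sketch of that failure mode, assuming the classic Akka 2.4.x Java API; SilentActor is a made-up stand-in, not a Flink class:

import java.util.concurrent.TimeUnit;

import akka.actor.ActorRef;
import akka.actor.ActorSystem;
import akka.actor.Props;
import akka.actor.UntypedActor;
import akka.pattern.AskTimeoutException;
import akka.pattern.Patterns;
import akka.util.Timeout;

import scala.concurrent.Await;
import scala.concurrent.Future;
import scala.concurrent.duration.FiniteDuration;

public class AskTimeoutSketch {

    // Hypothetical actor that swallows every message and never replies.
    public static class SilentActor extends UntypedActor {
        @Override
        public void onReceive(Object message) {
            // intentionally no reply
        }
    }

    public static void main(String[] args) throws Exception {
        ActorSystem system = ActorSystem.create("timeout-sketch");
        try {
            ActorRef silent = system.actorOf(Props.create(SilentActor.class));

            FiniteDuration askDuration = new FiniteDuration(1, TimeUnit.SECONDS);
            Future<Object> response = Patterns.ask(silent, "anybody there?", new Timeout(askDuration));

            try {
                // Wait a bit longer than the ask timeout so the failure is observable.
                Await.result(response, askDuration.$times(2));
            } catch (AskTimeoutException expected) {
                // The ask future fails once the Timeout elapses without a reply.
                System.out.println("ask timed out as expected: " + expected.getMessage());
            }
        } finally {
            system.terminate();
        }
    }
}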

Example 10 with Timeout

Use of akka.util.Timeout in project flink by apache.

The class FlinkClient, method getTopologyJobId().

// Flink specific additional methods
/**
	 * Package internal method to get a Flink {@link JobID} from a Storm topology name.
	 *
	 * @param id
	 * 		The Storm topology name.
	 * @return Flink's internally used {@link JobID}.
	 */
JobID getTopologyJobId(final String id) {
    final Configuration configuration = GlobalConfiguration.loadConfiguration();
    if (this.timeout != null) {
        configuration.setString(ConfigConstants.AKKA_ASK_TIMEOUT, this.timeout);
    }
    try {
        final ActorRef jobManager = this.getJobManager();
        final FiniteDuration askTimeout = this.getTimeout();
        final Future<Object> response = Patterns.ask(jobManager, JobManagerMessages.getRequestRunningJobsStatus(), new Timeout(askTimeout));
        final Object result;
        try {
            result = Await.result(response, askTimeout);
        } catch (final Exception e) {
            throw new RuntimeException("Could not retrieve running jobs from the JobManager", e);
        }
        if (result instanceof RunningJobsStatus) {
            final List<JobStatusMessage> jobs = ((RunningJobsStatus) result).getStatusMessages();
            for (final JobStatusMessage status : jobs) {
                if (status.getJobName().equals(id)) {
                    return status.getJobId();
                }
            }
        } else {
            throw new RuntimeException("RequestRunningJobsStatus requires a response of type " + "RunningJobsStatus. Instead the response is of type " + result.getClass() + ".");
        }
    } catch (final IOException e) {
        throw new RuntimeException("Could not connect to Flink JobManager with address " + this.jobManagerHost + ":" + this.jobManagerPort, e);
    }
    return null;
}
Also used : RunningJobsStatus(org.apache.flink.runtime.messages.JobManagerMessages.RunningJobsStatus) Configuration(org.apache.flink.configuration.Configuration) GlobalConfiguration(org.apache.flink.configuration.GlobalConfiguration) ActorRef(akka.actor.ActorRef) Timeout(akka.util.Timeout) JobStatusMessage(org.apache.flink.runtime.client.JobStatusMessage) FiniteDuration(scala.concurrent.duration.FiniteDuration) IOException(java.io.IOException) ProgramInvocationException(org.apache.flink.client.program.ProgramInvocationException) AlreadyAliveException(org.apache.storm.generated.AlreadyAliveException) NotAliveException(org.apache.storm.generated.NotAliveException) InvalidTopologyException(org.apache.storm.generated.InvalidTopologyException)
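
Patterns.ask returns an untyped Future<Object>, which is why each example checks and casts the reply before using it (scala.Option in getNewMessages, RunningJobsStatus here). A small sketch of that defensive check, factored into a hypothetical expectResponse helper that is not part of Flink or Akka:

public final class ResponseCasts {

    private ResponseCasts() {
    }

    // Checks that the untyped ask reply has the expected type and casts it,
    // mirroring the instanceof/cast blocks in the examples above.
    public static <T> T expectResponse(Object result, Class<T> expectedType, String request) {
        if (!expectedType.isInstance(result)) {
            throw new RuntimeException(request + " requires a response of type "
                    + expectedType.getSimpleName() + ". Instead the response is of type "
                    + result.getClass() + ".");
        }
        return expectedType.cast(result);
    }

    public static void main(String[] args) {
        // Stand-in for the value returned by Await.result(response, askTimeout).
        Object reply = "job-42";

        String jobId = expectResponse(reply, String.class, "RequestJobId");
        System.out.println("job id: " + jobId);
    }
}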

Aggregations

Timeout (akka.util.Timeout): 18
ActorRef (akka.actor.ActorRef): 11
FiniteDuration (scala.concurrent.duration.FiniteDuration): 11
Props (akka.actor.Props): 8
TestingLeaderRetrievalService (org.apache.flink.runtime.leaderelection.TestingLeaderRetrievalService): 7
JobClientMessages (org.apache.flink.runtime.messages.JobClientMessages): 7
Test (org.junit.Test): 7
UUID (java.util.UUID): 5
IOException (java.io.IOException): 4
ProgramInvocationException (org.apache.flink.client.program.ProgramInvocationException): 4
TimeoutException (java.util.concurrent.TimeoutException): 3
AttachToJobAndWait (org.apache.flink.runtime.messages.JobClientMessages.AttachToJobAndWait): 3
ArrayList (java.util.ArrayList): 2
JobStatusMessage (org.apache.flink.runtime.client.JobStatusMessage): 2
YarnException (org.apache.hadoop.yarn.exceptions.YarnException): 2
ActorSelection (akka.actor.ActorSelection): 1
JavaTestKit (akka.testkit.JavaTestKit): 1
File (java.io.File): 1
JobID (org.apache.flink.api.common.JobID): 1
Configuration (org.apache.flink.configuration.Configuration): 1