Search in sources :

Example 1 with ResourceManagerException

Use of org.apache.flink.runtime.resourcemanager.exceptions.ResourceManagerException in the Apache Flink project.

Method registerJobManager of the class ResourceManager.

// ------------------------------------------------------------------------
//  RPC methods
// ------------------------------------------------------------------------
/**
 * Handles the registration request of a job manager for the given job.
 *
 * <p>The request is declined right away when {@code resourceManagerLeaderId} is not valid
 * according to {@code isValid}. Otherwise the job is added to the job leader id service (if it
 * is not known yet), a connection to the job manager is opened and, once both the gateway and
 * the job's current leader id are available, the registration is stored in
 * {@code jobManagerRegistrations}.
 *
 * @param resourceManagerLeaderId leader id the sender expects this resource manager to have;
 *                                checked with {@code checkNotNull}
 * @param jobManagerLeaderId      leader id of the registering job manager; checked with
 *                                {@code checkNotNull}
 * @param jobManagerAddress       address used to connect to the job manager; checked with
 *                                {@code checkNotNull}
 * @param jobId                   id of the job the job manager is responsible for; checked with
 *                                {@code checkNotNull}
 * @return a future which is completed with
 *         <ul>
 *           <li>a {@link JobMasterRegistrationSuccess} carrying the heartbeat interval and the
 *               leader session id when the registration was accepted,</li>
 *           <li>a {@link RegistrationResponse.Decline} when a leader id did not match or one of
 *               the combined futures failed,</li>
 *           <li>a {@link ResourceManagerException} (exceptionally) when the job could not be
 *               added to the job leader id service or the job leader id future could not be
 *               obtained; in both cases the error is also passed to
 *               {@code onFatalErrorAsync}.</li>
 *         </ul>
 */
@RpcMethod
public Future<RegistrationResponse> registerJobManager(final UUID resourceManagerLeaderId, final UUID jobManagerLeaderId, final String jobManagerAddress, final JobID jobId) {
    checkNotNull(resourceManagerLeaderId);
    checkNotNull(jobManagerLeaderId);
    checkNotNull(jobManagerAddress);
    checkNotNull(jobId);

    // Guard clause: requests addressed to a different resource manager leader are declined.
    if (!isValid(resourceManagerLeaderId)) {
        log.debug("Discard register job manager message from {}, because the leader id {} did not match the expected leader id {}.",
            jobManagerAddress, resourceManagerLeaderId, leaderSessionId);
        return FlinkCompletableFuture.<RegistrationResponse>completed(
            new RegistrationResponse.Decline("Resource manager leader id did not match."));
    }

    if (!jobLeaderIdService.containsJob(jobId)) {
        try {
            jobLeaderIdService.addJob(jobId);
        } catch (Exception e) {
            ResourceManagerException exception = new ResourceManagerException(
                "Could not add the job " + jobId + " to the job id leader service.", e);
            onFatalErrorAsync(exception);
            log.error("Could not add job {} to job leader id service.", jobId, e);
            return FlinkCompletableFuture.completedExceptionally(exception);
        }
    }

    log.info("Registering job manager {}@{} for job {}.", jobManagerLeaderId, jobManagerAddress, jobId);

    final Future<UUID> jobLeaderIdFuture;
    try {
        jobLeaderIdFuture = jobLeaderIdService.getLeaderId(jobId);
    } catch (Exception e) {
        // we cannot check the job leader id so let's fail
        // TODO: Maybe it's also ok to skip this check in case that we cannot check the leader id
        ResourceManagerException exception = new ResourceManagerException(
            "Cannot obtain the job leader id future to verify the correct job leader.", e);
        onFatalErrorAsync(exception);
        // This path raises a fatal error, so log it like the addJob failure above:
        // at error level and with the cause attached.
        log.error("Could not obtain the job leader id future to verify the correct job leader.", e);
        return FlinkCompletableFuture.completedExceptionally(exception);
    }

    Future<JobMasterGateway> jobMasterGatewayFuture = getRpcService().connect(jobManagerAddress, JobMasterGateway.class);

    // The combine callback is submitted to getMainThreadExecutor() because it reads and
    // modifies jobManagerRegistrations.
    Future<RegistrationResponse> registrationResponseFuture = jobMasterGatewayFuture.thenCombineAsync(jobLeaderIdFuture, new BiFunction<JobMasterGateway, UUID, RegistrationResponse>() {

        @Override
        public RegistrationResponse apply(JobMasterGateway jobMasterGateway, UUID jobLeaderId) {
            // The leader id is re-checked here because this callback runs after the futures
            // complete, i.e. later than the check at the top of the method.
            if (!isValid(resourceManagerLeaderId)) {
                log.debug("The resource manager leader id changed {}. Discarding job manager registration from {}.",
                    getLeaderSessionId(), jobManagerAddress);
                return new RegistrationResponse.Decline("Resource manager leader id changed.");
            }

            if (!jobLeaderId.equals(jobManagerLeaderId)) {
                log.debug("The job manager leader id {} did not match the job leader id {}.", jobManagerLeaderId, jobLeaderId);
                return new RegistrationResponse.Decline("Job manager leader id did not match.");
            }

            // Single lookup instead of containsKey + get.
            JobManagerRegistration oldJobManagerRegistration = jobManagerRegistrations.get(jobId);

            if (oldJobManagerRegistration != null && oldJobManagerRegistration.getLeaderID().equals(jobLeaderId)) {
                // same registration
                log.debug("Job manager {}@{} was already registered.", jobManagerLeaderId, jobManagerAddress);
            } else {
                if (oldJobManagerRegistration != null) {
                    // tell old job manager that he is no longer the job leader
                    disconnectJobManager(oldJobManagerRegistration.getJobID(), new Exception("New job leader for job " + jobId + " found."));
                }
                // new registration for the job, or replacement of the outdated leader
                jobManagerRegistrations.put(jobId, new JobManagerRegistration(jobId, jobLeaderId, jobMasterGateway));
            }

            log.info("Registered job manager {}@{} for job {}.", jobManagerLeaderId, jobManagerAddress, jobId);
            return new JobMasterRegistrationSuccess(resourceManagerConfiguration.getHeartbeatInterval().toMilliseconds(), getLeaderSessionId());
        }
    }, getMainThreadExecutor());

    // handle exceptions which might have occurred in one of the futures inputs of combine
    return registrationResponseFuture.handleAsync(new BiFunction<RegistrationResponse, Throwable, RegistrationResponse>() {

        @Override
        public RegistrationResponse apply(RegistrationResponse registrationResponse, Throwable throwable) {
            if (throwable == null) {
                return registrationResponse;
            }

            // Only include the stack trace when debug logging is enabled.
            if (log.isDebugEnabled()) {
                log.debug("Registration of job manager {}@{} failed.", jobManagerLeaderId, jobManagerAddress, throwable);
            } else {
                log.info("Registration of job manager {}@{} failed.", jobManagerLeaderId, jobManagerAddress);
            }
            return new RegistrationResponse.Decline(throwable.getMessage());
        }
    }, getRpcService().getExecutor());
}
Also used : JobMasterRegistrationSuccess(org.apache.flink.runtime.jobmaster.JobMasterRegistrationSuccess) ResourceManagerException(org.apache.flink.runtime.resourcemanager.exceptions.ResourceManagerException) JobMasterGateway(org.apache.flink.runtime.jobmaster.JobMasterGateway) ResourceManagerException(org.apache.flink.runtime.resourcemanager.exceptions.ResourceManagerException) LeaderIdMismatchException(org.apache.flink.runtime.highavailability.LeaderIdMismatchException) JobManagerRegistration(org.apache.flink.runtime.resourcemanager.registration.JobManagerRegistration) UUID(java.util.UUID) RegistrationResponse(org.apache.flink.runtime.registration.RegistrationResponse) RpcMethod(org.apache.flink.runtime.rpc.RpcMethod)

Example 2 with ResourceManagerException

Use of org.apache.flink.runtime.resourcemanager.exceptions.ResourceManagerException in the Apache Flink project.

Method start of the class ResourceManager.

// ------------------------------------------------------------------------
//  RPC lifecycle methods
// ------------------------------------------------------------------------
/**
 * Starts this endpoint and the services it depends on.
 *
 * <p>After delegating to {@code super.start()}, the following steps are executed in this
 * order: the slot manager is created, the leader election service is obtained from the high
 * availability services and started with this instance, the job leader id service is started,
 * and finally {@code initialize()} is called.
 *
 * @throws ResourceManagerException if the slot manager cannot be created or if the leader
 *                                  election service or the job leader id service cannot be
 *                                  started; the original failure is attached as the cause
 * @throws Exception                whatever {@code super.start()} or {@code initialize()}
 *                                  propagate
 */
@Override
public void start() throws Exception {
    // bring up the parent endpoint first
    super.start();

    // the slot manager has to exist before any of the other services is started
    try {
        slotManager = slotManagerFactory.create(createResourceManagerServices());
    } catch (Exception slotManagerFailure) {
        throw new ResourceManagerException("Could not create the slot manager.", slotManagerFailure);
    }

    // this instance is handed to the leader election service when it is started
    leaderElectionService = highAvailabilityServices.getResourceManagerLeaderElectionService();
    try {
        leaderElectionService.start(this);
    } catch (Exception leaderElectionFailure) {
        throw new ResourceManagerException("Could not start the leader election service.", leaderElectionFailure);
    }

    // start the job leader id service with a fresh actions instance
    try {
        jobLeaderIdService.start(new JobLeaderIdActionsImpl());
    } catch (Exception jobLeaderIdFailure) {
        throw new ResourceManagerException("Could not start the job leader id service.", jobLeaderIdFailure);
    }

    initialize();
}
Also used : ResourceManagerException(org.apache.flink.runtime.resourcemanager.exceptions.ResourceManagerException) ResourceManagerException(org.apache.flink.runtime.resourcemanager.exceptions.ResourceManagerException) LeaderIdMismatchException(org.apache.flink.runtime.highavailability.LeaderIdMismatchException)

Aggregations

LeaderIdMismatchException (org.apache.flink.runtime.highavailability.LeaderIdMismatchException)2 ResourceManagerException (org.apache.flink.runtime.resourcemanager.exceptions.ResourceManagerException)2 UUID (java.util.UUID)1 JobMasterGateway (org.apache.flink.runtime.jobmaster.JobMasterGateway)1 JobMasterRegistrationSuccess (org.apache.flink.runtime.jobmaster.JobMasterRegistrationSuccess)1 RegistrationResponse (org.apache.flink.runtime.registration.RegistrationResponse)1 JobManagerRegistration (org.apache.flink.runtime.resourcemanager.registration.JobManagerRegistration)1 RpcMethod (org.apache.flink.runtime.rpc.RpcMethod)1