Search in sources :

Example 1 with JobManagerRegistration

use of org.apache.flink.runtime.resourcemanager.registration.JobManagerRegistration in project flink by apache.

the class ResourceManager method requestSlot.

/**
 * Requests a slot from the resource manager.
 *
 * <p>The request is handed to the slot manager only if a job manager is registered for the
 * request's job, the given job master leader id equals the registered leader id, and the given
 * resource manager leader id equals this resource manager's leader session id. In every other
 * case the request is rejected and the reason is logged.
 *
 * @param jobMasterLeaderID leader id sent by the requesting job master
 * @param resourceManagerLeaderID resource manager leader id sent with the request
 * @param slotRequest Slot request
 * @return Slot assignment returned by the slot manager, or an {@code RMSlotRequestRejected}
 *     carrying the request's allocation id if the request is not accepted
 */
@RpcMethod
public RMSlotRequestReply requestSlot(UUID jobMasterLeaderID, UUID resourceManagerLeaderID, SlotRequest slotRequest) {
    log.info("Request slot with profile {} for job {} with allocation id {}.", slotRequest.getResourceProfile(), slotRequest.getJobId(), slotRequest.getAllocationId());
    JobID jobId = slotRequest.getJobId();
    JobManagerRegistration jobManagerRegistration = jobManagerRegistrations.get(jobId);
    // The checks run in the same order as the original short-circuit condition; each rejection
    // path now reports its actual cause instead of always blaming an unknown JobMaster.
    if (jobManagerRegistration == null) {
        log.info("Ignoring slot request for unknown JobMaster with JobID {}", jobId);
        return new RMSlotRequestRejected(slotRequest.getAllocationId());
    }
    if (!jobMasterLeaderID.equals(jobManagerRegistration.getLeaderID())) {
        log.info("Ignoring slot request for job {} because the job master leader id {} did not match the registered leader id {}.", jobId, jobMasterLeaderID, jobManagerRegistration.getLeaderID());
        return new RMSlotRequestRejected(slotRequest.getAllocationId());
    }
    if (!resourceManagerLeaderID.equals(leaderSessionId)) {
        log.info("Ignoring slot request for job {} because the resource manager leader id {} did not match the expected leader id {}.", jobId, resourceManagerLeaderID, leaderSessionId);
        return new RMSlotRequestRejected(slotRequest.getAllocationId());
    }
    return slotManager.requestSlot(slotRequest);
}
Also used : RMSlotRequestRejected(org.apache.flink.runtime.resourcemanager.messages.jobmanager.RMSlotRequestRejected) JobID(org.apache.flink.api.common.JobID) JobManagerRegistration(org.apache.flink.runtime.resourcemanager.registration.JobManagerRegistration) RpcMethod(org.apache.flink.runtime.rpc.RpcMethod)

Example 2 with JobManagerRegistration

use of org.apache.flink.runtime.resourcemanager.registration.JobManagerRegistration in project flink by apache.

the class ResourceManager method registerJobManager.

// ------------------------------------------------------------------------
//  RPC methods
// ------------------------------------------------------------------------
/**
 * Registers a job manager for the given job at this resource manager.
 *
 * <p>If the given resource manager leader id is not valid, the registration is declined
 * immediately. Otherwise the job is added to the job leader id service (if not yet known), a
 * connection to the job manager is opened and, once both the gateway and the job's leader id are
 * available, the registration is stored. A registration for the same job under a different leader
 * id is disconnected and replaced.
 *
 * @param resourceManagerLeaderId resource manager leader id sent by the job manager
 * @param jobManagerLeaderId leader id of the registering job manager
 * @param jobManagerAddress address used to connect to the job manager
 * @param jobId id of the job the job manager is responsible for
 * @return future with the registration response; it is completed exceptionally if the job could
 *     not be added to the job leader id service or the job leader id future could not be obtained
 */
@RpcMethod
public Future<RegistrationResponse> registerJobManager(final UUID resourceManagerLeaderId, final UUID jobManagerLeaderId, final String jobManagerAddress, final JobID jobId) {
    checkNotNull(resourceManagerLeaderId);
    checkNotNull(jobManagerLeaderId);
    checkNotNull(jobManagerAddress);
    checkNotNull(jobId);

    if (!isValid(resourceManagerLeaderId)) {
        log.debug("Discard register job manager message from {}, because the leader id {} did not match the expected leader id {}.", jobManagerAddress, resourceManagerLeaderId, leaderSessionId);
        return FlinkCompletableFuture.<RegistrationResponse>completed(new RegistrationResponse.Decline("Resource manager leader id did not match."));
    }

    if (!jobLeaderIdService.containsJob(jobId)) {
        try {
            jobLeaderIdService.addJob(jobId);
        } catch (Exception e) {
            ResourceManagerException exception = new ResourceManagerException("Could not add the job " + jobId + " to the job id leader service.", e);
            onFatalErrorAsync(exception);
            log.error("Could not add job {} to job leader id service.", jobId, e);
            return FlinkCompletableFuture.completedExceptionally(exception);
        }
    }

    log.info("Registering job manager {}@{} for job {}.", jobManagerLeaderId, jobManagerAddress, jobId);

    Future<UUID> jobLeaderIdFuture;
    try {
        jobLeaderIdFuture = jobLeaderIdService.getLeaderId(jobId);
    } catch (Exception e) {
        // we cannot check the job leader id so let's fail
        // TODO: Maybe it's also ok to skip this check in case that we cannot check the leader id
        ResourceManagerException exception = new ResourceManagerException("Cannot obtain the job leader id future to verify the correct job leader.", e);
        onFatalErrorAsync(exception);
        // This is reported as a fatal error, so log it at error level together with its cause,
        // consistent with the failure handling for addJob above.
        log.error("Could not obtain the job leader id future to verify the correct job leader.", e);
        return FlinkCompletableFuture.completedExceptionally(exception);
    }

    Future<JobMasterGateway> jobMasterGatewayFuture = getRpcService().connect(jobManagerAddress, JobMasterGateway.class);

    Future<RegistrationResponse> registrationResponseFuture = jobMasterGatewayFuture.thenCombineAsync(jobLeaderIdFuture, new BiFunction<JobMasterGateway, UUID, RegistrationResponse>() {

        @Override
        public RegistrationResponse apply(JobMasterGateway jobMasterGateway, UUID jobLeaderId) {
            // the leader id is checked again once both futures have completed
            if (!isValid(resourceManagerLeaderId)) {
                log.debug("The resource manager leader id changed {}. Discarding job manager registration from {}.", getLeaderSessionId(), jobManagerAddress);
                return new RegistrationResponse.Decline("Resource manager leader id changed.");
            }

            if (!jobLeaderId.equals(jobManagerLeaderId)) {
                log.debug("The job manager leader id {} did not match the job leader id {}.", jobManagerLeaderId, jobLeaderId);
                return new RegistrationResponse.Decline("Job manager leader id did not match.");
            }

            // single lookup instead of containsKey followed by get
            JobManagerRegistration oldJobManagerRegistration = jobManagerRegistrations.get(jobId);

            if (oldJobManagerRegistration != null && oldJobManagerRegistration.getLeaderID().equals(jobLeaderId)) {
                // same registration
                log.debug("Job manager {}@{} was already registered.", jobManagerLeaderId, jobManagerAddress);
            } else {
                if (oldJobManagerRegistration != null) {
                    // tell old job manager that he is no longer the job leader
                    disconnectJobManager(oldJobManagerRegistration.getJobID(), new Exception("New job leader for job " + jobId + " found."));
                }
                // new (or replacing) registration for the job
                JobManagerRegistration jobManagerRegistration = new JobManagerRegistration(jobId, jobLeaderId, jobMasterGateway);
                jobManagerRegistrations.put(jobId, jobManagerRegistration);
            }

            log.info("Registered job manager {}@{} for job {}.", jobManagerLeaderId, jobManagerAddress, jobId);
            return new JobMasterRegistrationSuccess(resourceManagerConfiguration.getHeartbeatInterval().toMilliseconds(), getLeaderSessionId());
        }
    }, getMainThreadExecutor());

    // handle exceptions which might have occurred in one of the futures inputs of combine
    return registrationResponseFuture.handleAsync(new BiFunction<RegistrationResponse, Throwable, RegistrationResponse>() {

        @Override
        public RegistrationResponse apply(RegistrationResponse registrationResponse, Throwable throwable) {
            if (throwable == null) {
                return registrationResponse;
            }

            // include the stack trace only when debug logging is enabled
            if (log.isDebugEnabled()) {
                log.debug("Registration of job manager {}@{} failed.", jobManagerLeaderId, jobManagerAddress, throwable);
            } else {
                log.info("Registration of job manager {}@{} failed.", jobManagerLeaderId, jobManagerAddress);
            }
            return new RegistrationResponse.Decline(throwable.getMessage());
        }
    }, getRpcService().getExecutor());
}
Also used : JobMasterRegistrationSuccess(org.apache.flink.runtime.jobmaster.JobMasterRegistrationSuccess) ResourceManagerException(org.apache.flink.runtime.resourcemanager.exceptions.ResourceManagerException) JobMasterGateway(org.apache.flink.runtime.jobmaster.JobMasterGateway) LeaderIdMismatchException(org.apache.flink.runtime.highavailability.LeaderIdMismatchException) JobManagerRegistration(org.apache.flink.runtime.resourcemanager.registration.JobManagerRegistration) UUID(java.util.UUID) RegistrationResponse(org.apache.flink.runtime.registration.RegistrationResponse) RpcMethod(org.apache.flink.runtime.rpc.RpcMethod)

Example 3 with JobManagerRegistration

use of org.apache.flink.runtime.resourcemanager.registration.JobManagerRegistration in project flink by apache.

the class ResourceManager method closeJobManagerConnection.

/**
 * Closes the connection to the job manager registered for the given job. The framework should
 * call this once it detects that a currently registered job manager has failed.
 *
 * <p>The registration is removed; if none existed, only a debug message is logged. Otherwise
 * heartbeat monitoring of the job manager is stopped, its resource id registration is removed,
 * the job's resource requirements are cleared when requested, and the job manager is notified
 * about the disconnect.
 *
 * @param jobId identifying the job whose leader shall be disconnected.
 * @param resourceRequirementHandling indicating how existing resource requirements for the
 *     corresponding job should be handled
 * @param cause The exception that caused the job manager to fail.
 */
protected void closeJobManagerConnection(JobID jobId, ResourceRequirementHandling resourceRequirementHandling, Exception cause) {
    final JobManagerRegistration registration = jobManagerRegistrations.remove(jobId);
    if (registration == null) {
        log.debug("There was no registered job manager for job {}.", jobId);
        return;
    }

    final ResourceID resourceId = registration.getJobManagerResourceID();
    final JobMasterGateway gateway = registration.getJobManagerGateway();
    log.info("Disconnect job manager {}@{} for job {} from the resource manager.", registration.getJobMasterId(), gateway.getAddress(), jobId);

    // drop all bookkeeping kept for this job manager
    jobManagerHeartbeatManager.unmonitorTarget(resourceId);
    jmResourceIdRegistrations.remove(resourceId);

    final boolean clearRequirements = resourceRequirementHandling == ResourceRequirementHandling.CLEAR;
    if (clearRequirements) {
        slotManager.clearResourceRequirements(jobId);
    }

    // tell the job manager about the disconnect
    gateway.disconnectResourceManager(getFencingToken(), cause);
}
Also used : ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) JobMasterId(org.apache.flink.runtime.jobmaster.JobMasterId) JobMasterGateway(org.apache.flink.runtime.jobmaster.JobMasterGateway) JobManagerRegistration(org.apache.flink.runtime.resourcemanager.registration.JobManagerRegistration)

Example 4 with JobManagerRegistration

use of org.apache.flink.runtime.resourcemanager.registration.JobManagerRegistration in project flink by apache.

the class ResourceManager method declareRequiredResources.

/**
 * Passes the given resource requirements to the slot manager, provided that a job manager is
 * registered for the requirements' job and its job master id equals the given one.
 *
 * @param jobMasterId id of the job master declaring the requirements
 * @param resourceRequirements resource requirements to process
 * @param timeout timeout of the call (not used by this implementation)
 * @return future completed with an acknowledgement on success, or completed exceptionally with a
 *     {@code ResourceManagerException} if no job manager is registered for the job or the job
 *     master ids differ
 */
@Override
public CompletableFuture<Acknowledge> declareRequiredResources(JobMasterId jobMasterId, ResourceRequirements resourceRequirements, Time timeout) {
    final JobID jobId = resourceRequirements.getJobId();
    final JobManagerRegistration registration = jobManagerRegistrations.get(jobId);

    if (registration == null) {
        return FutureUtils.completedExceptionally(new ResourceManagerException("Could not find registered job manager for job " + jobId + '.'));
    }

    final boolean sameJobMaster = Objects.equals(jobMasterId, registration.getJobMasterId());
    if (!sameJobMaster) {
        return FutureUtils.completedExceptionally(new ResourceManagerException("The job leader's id " + registration.getJobMasterId() + " does not match the received id " + jobMasterId + '.'));
    }

    slotManager.processResourceRequirements(resourceRequirements);
    return CompletableFuture.completedFuture(Acknowledge.get());
}
Also used : ResourceManagerException(org.apache.flink.runtime.resourcemanager.exceptions.ResourceManagerException) JobID(org.apache.flink.api.common.JobID) JobManagerRegistration(org.apache.flink.runtime.resourcemanager.registration.JobManagerRegistration)

Example 5 with JobManagerRegistration

use of org.apache.flink.runtime.resourcemanager.registration.JobManagerRegistration in project flink by apache.

the class ResourceManager method disconnectJobManager.

/**
 * Disconnects the job manager which is connected for the given job from the resource manager.
 *
 * <p>The registration for the job is removed and the job manager is told about the disconnect.
 * If no job manager is registered for the job, only a debug message is logged.
 *
 * @param jobId identifying the job whose leader shall be disconnected
 * @param cause exception passed on to the job manager as the reason for the disconnect
 */
protected void disconnectJobManager(JobID jobId, Exception cause) {
    final JobManagerRegistration registration = jobManagerRegistrations.remove(jobId);
    if (registration == null) {
        log.debug("There was no registered job manager for job {}.", jobId);
        return;
    }

    final JobMasterGateway gateway = registration.getJobManagerGateway();
    log.info("Disconnect job manager {}@{} for job {} from the resource manager.", registration.getLeaderID(), gateway.getAddress(), jobId);

    // tell the job manager about the disconnect
    gateway.disconnectResourceManager(registration.getLeaderID(), getLeaderSessionId(), cause);
}
Also used : JobMasterGateway(org.apache.flink.runtime.jobmaster.JobMasterGateway) JobManagerRegistration(org.apache.flink.runtime.resourcemanager.registration.JobManagerRegistration)

Aggregations

JobManagerRegistration (org.apache.flink.runtime.resourcemanager.registration.JobManagerRegistration)6 JobMasterGateway (org.apache.flink.runtime.jobmaster.JobMasterGateway)3 ResourceManagerException (org.apache.flink.runtime.resourcemanager.exceptions.ResourceManagerException)3 JobID (org.apache.flink.api.common.JobID)2 JobMasterRegistrationSuccess (org.apache.flink.runtime.jobmaster.JobMasterRegistrationSuccess)2 RpcMethod (org.apache.flink.runtime.rpc.RpcMethod)2 UUID (java.util.UUID)1 CompletionException (java.util.concurrent.CompletionException)1 TimeoutException (java.util.concurrent.TimeoutException)1 ResourceID (org.apache.flink.runtime.clusterframework.types.ResourceID)1 LeaderIdMismatchException (org.apache.flink.runtime.highavailability.LeaderIdMismatchException)1 JobMasterId (org.apache.flink.runtime.jobmaster.JobMasterId)1 RegistrationResponse (org.apache.flink.runtime.registration.RegistrationResponse)1 UnknownTaskExecutorException (org.apache.flink.runtime.resourcemanager.exceptions.UnknownTaskExecutorException)1 RMSlotRequestRejected (org.apache.flink.runtime.resourcemanager.messages.jobmanager.RMSlotRequestRejected)1 FlinkException (org.apache.flink.util.FlinkException)1