use of org.apache.flink.runtime.resourcemanager.registration.JobManagerRegistration in project flink by apache.
the class ResourceManager method requestSlot.
/**
 * Requests a slot from the resource manager on behalf of a job master.
 *
 * <p>The request is handed to the slot manager only when all of the following hold: a job
 * manager is registered for the job named in the request, the given job master leader id
 * equals the leader id of that registration, and the given resource manager leader id equals
 * this resource manager's {@code leaderSessionId}. In every other case, including a
 * {@code null} leader id, the request is rejected.
 *
 * @param jobMasterLeaderID leader id presented by the requesting job master
 * @param resourceManagerLeaderID resource manager leader id presented by the requester
 * @param slotRequest Slot request
 * @return the slot manager's reply, or an {@code RMSlotRequestRejected} carrying the
 *     allocation id of the request
 */
@RpcMethod
public RMSlotRequestReply requestSlot(UUID jobMasterLeaderID, UUID resourceManagerLeaderID, SlotRequest slotRequest) {
    log.info("Request slot with profile {} for job {} with allocation id {}.", slotRequest.getResourceProfile(), slotRequest.getJobId(), slotRequest.getAllocationId());
    JobID jobId = slotRequest.getJobId();
    JobManagerRegistration jobManagerRegistration = jobManagerRegistrations.get(jobId);

    if (jobManagerRegistration == null) {
        log.info("Ignoring slot request for unknown JobMaster with JobID {}", jobId);
    } else if (jobMasterLeaderID == null || !jobMasterLeaderID.equals(jobManagerRegistration.getLeaderID())) {
        // A null id can never match; reject instead of failing with a NullPointerException.
        log.info("Ignoring slot request for job {} because the job master leader id {} did not match the registered leader id {}.", jobId, jobMasterLeaderID, jobManagerRegistration.getLeaderID());
    } else if (resourceManagerLeaderID == null || !resourceManagerLeaderID.equals(leaderSessionId)) {
        log.info("Ignoring slot request for job {} because the resource manager leader id {} did not match the expected leader id {}.", jobId, resourceManagerLeaderID, leaderSessionId);
    } else {
        return slotManager.requestSlot(slotRequest);
    }

    return new RMSlotRequestRejected(slotRequest.getAllocationId());
}
use of org.apache.flink.runtime.resourcemanager.registration.JobManagerRegistration in project flink by apache.
the class ResourceManager method registerJobManager.
// ------------------------------------------------------------------------
// RPC methods
// ------------------------------------------------------------------------
/**
 * Registers the job manager that claims leadership of the given job at this resource manager.
 *
 * <p>If {@code resourceManagerLeaderId} is not valid according to {@code isValid}, the call is
 * declined immediately. Otherwise the job is added to the job leader id service (unless it is
 * already contained), the job's leader id is requested from that service and a gateway to the
 * job manager is connected. Once both results are available, the registration is accepted only
 * if the resource manager leader id is still valid and the retrieved job leader id equals
 * {@code jobManagerLeaderId}. An existing registration with a different leader id is
 * disconnected and replaced; an existing registration with the same leader id is kept.
 *
 * @param resourceManagerLeaderId resource manager leader id presented by the caller
 * @param jobManagerLeaderId leader id of the registering job manager
 * @param jobManagerAddress address used to connect to the job manager
 * @param jobId id of the job the job manager is responsible for
 * @return future holding the registration response; it is completed exceptionally with a
 *     {@code ResourceManagerException} when the job leader id service cannot be used, and
 *     holds a {@code Decline} when one of the asynchronous inputs fails
 */
@RpcMethod
public Future<RegistrationResponse> registerJobManager(final UUID resourceManagerLeaderId, final UUID jobManagerLeaderId, final String jobManagerAddress, final JobID jobId) {
    checkNotNull(resourceManagerLeaderId);
    checkNotNull(jobManagerLeaderId);
    checkNotNull(jobManagerAddress);
    checkNotNull(jobId);

    if (!isValid(resourceManagerLeaderId)) {
        log.debug("Discard register job manager message from {}, because the leader id " + "{} did not match the expected leader id {}.", jobManagerAddress, resourceManagerLeaderId, leaderSessionId);
        return FlinkCompletableFuture.<RegistrationResponse>completed(new RegistrationResponse.Decline("Resource manager leader id did not match."));
    }

    if (!jobLeaderIdService.containsJob(jobId)) {
        try {
            jobLeaderIdService.addJob(jobId);
        } catch (Exception e) {
            ResourceManagerException exception = new ResourceManagerException("Could not add the job " + jobId + " to the job id leader service.", e);
            onFatalErrorAsync(exception);
            log.error("Could not add job {} to job leader id service.", jobId, e);
            return FlinkCompletableFuture.completedExceptionally(exception);
        }
    }

    log.info("Registering job manager {}@{} for job {}.", jobManagerLeaderId, jobManagerAddress, jobId);

    Future<UUID> jobLeaderIdFuture;
    try {
        jobLeaderIdFuture = jobLeaderIdService.getLeaderId(jobId);
    } catch (Exception e) {
        // we cannot check the job leader id so let's fail
        // TODO: Maybe it's also ok to skip this check in case that we cannot check the leader id
        ResourceManagerException exception = new ResourceManagerException("Cannot obtain the " + "job leader id future to verify the correct job leader.", e);
        onFatalErrorAsync(exception);
        // Pass the cause along so this log line is as useful as the one for addJob above.
        log.debug("Could not obtain the job leader id future to verify the correct job leader.", e);
        return FlinkCompletableFuture.completedExceptionally(exception);
    }

    Future<JobMasterGateway> jobMasterGatewayFuture = getRpcService().connect(jobManagerAddress, JobMasterGateway.class);

    // The combine step touches jobManagerRegistrations, so it is scheduled on the main thread executor.
    Future<RegistrationResponse> registrationResponseFuture = jobMasterGatewayFuture.thenCombineAsync(jobLeaderIdFuture, new BiFunction<JobMasterGateway, UUID, RegistrationResponse>() {
        @Override
        public RegistrationResponse apply(JobMasterGateway jobMasterGateway, UUID jobLeaderId) {
            // Re-check: the leader id may have changed while the futures were pending.
            if (!isValid(resourceManagerLeaderId)) {
                log.debug("The resource manager leader id changed {}. Discarding job " + "manager registration from {}.", getLeaderSessionId(), jobManagerAddress);
                return new RegistrationResponse.Decline("Resource manager leader id changed.");
            }

            if (!jobLeaderId.equals(jobManagerLeaderId)) {
                log.debug("The job manager leader id {} did not match the job " + "leader id {}.", jobManagerLeaderId, jobLeaderId);
                return new RegistrationResponse.Decline("Job manager leader id did not match.");
            }

            // Single lookup instead of containsKey followed by get.
            JobManagerRegistration oldJobManagerRegistration = jobManagerRegistrations.get(jobId);

            if (oldJobManagerRegistration == null) {
                // new registration for the job
                jobManagerRegistrations.put(jobId, new JobManagerRegistration(jobId, jobLeaderId, jobMasterGateway));
            } else if (oldJobManagerRegistration.getLeaderID().equals(jobLeaderId)) {
                // same registration
                log.debug("Job manager {}@{} was already registered.", jobManagerLeaderId, jobManagerAddress);
            } else {
                // tell old job manager that he is no longer the job leader
                disconnectJobManager(oldJobManagerRegistration.getJobID(), new Exception("New job leader for job " + jobId + " found."));
                jobManagerRegistrations.put(jobId, new JobManagerRegistration(jobId, jobLeaderId, jobMasterGateway));
            }

            log.info("Registered job manager {}@{} for job {}.", jobManagerLeaderId, jobManagerAddress, jobId);
            return new JobMasterRegistrationSuccess(resourceManagerConfiguration.getHeartbeatInterval().toMilliseconds(), getLeaderSessionId());
        }
    }, getMainThreadExecutor());

    // handle exceptions which might have occurred in one of the futures inputs of combine
    return registrationResponseFuture.handleAsync(new BiFunction<RegistrationResponse, Throwable, RegistrationResponse>() {
        @Override
        public RegistrationResponse apply(RegistrationResponse registrationResponse, Throwable throwable) {
            if (throwable == null) {
                return registrationResponse;
            }

            // Only include the stack trace when debug logging is enabled.
            if (log.isDebugEnabled()) {
                log.debug("Registration of job manager {}@{} failed.", jobManagerLeaderId, jobManagerAddress, throwable);
            } else {
                log.info("Registration of job manager {}@{} failed.", jobManagerLeaderId, jobManagerAddress);
            }
            return new RegistrationResponse.Decline(throwable.getMessage());
        }
    }, getRpcService().getExecutor());
}
use of org.apache.flink.runtime.resourcemanager.registration.JobManagerRegistration in project flink by apache.
the class ResourceManager method closeJobManagerConnection.
/**
 * Removes the registration of the job manager for the given job and notifies that job manager
 * about the disconnect. Intended to be called by the framework once it detects that a
 * currently registered job manager has failed.
 *
 * <p>If no job manager is registered for the job, only a debug message is logged.
 *
 * @param jobId identifying the job whose leader shall be disconnected
 * @param resourceRequirementHandling indicating how existing resource requirements for the
 *     corresponding job should be handled; with {@code CLEAR} the slot manager is asked to
 *     clear the requirements of the job
 * @param cause the exception describing why the job manager is being disconnected; it is
 *     forwarded to the job manager
 */
protected void closeJobManagerConnection(JobID jobId, ResourceRequirementHandling resourceRequirementHandling, Exception cause) {
    final JobManagerRegistration registration = jobManagerRegistrations.remove(jobId);

    if (registration == null) {
        log.debug("There was no registered job manager for job {}.", jobId);
        return;
    }

    final ResourceID resourceId = registration.getJobManagerResourceID();
    final JobMasterGateway gateway = registration.getJobManagerGateway();

    log.info("Disconnect job manager {}@{} for job {} from the resource manager.", registration.getJobMasterId(), gateway.getAddress(), jobId);

    // stop heartbeating and drop the resource-id based bookkeeping
    jobManagerHeartbeatManager.unmonitorTarget(resourceId);
    jmResourceIdRegistrations.remove(resourceId);

    final boolean clearRequirements = resourceRequirementHandling == ResourceRequirementHandling.CLEAR;
    if (clearRequirements) {
        slotManager.clearResourceRequirements(jobId);
    }

    // finally inform the job manager itself
    gateway.disconnectResourceManager(getFencingToken(), cause);
}
use of org.apache.flink.runtime.resourcemanager.registration.JobManagerRegistration in project flink by apache.
the class ResourceManager method declareRequiredResources.
/**
 * Passes the declared resource requirements of a job on to the slot manager, provided that the
 * sender is the job master currently registered for that job.
 *
 * @param jobMasterId id of the job master declaring the requirements
 * @param resourceRequirements requirements to process; also identifies the job
 * @param timeout not used by this method body
 * @return a future completed with an acknowledgement, or completed exceptionally with a
 *     {@code ResourceManagerException} if no job manager is registered for the job or the
 *     registered job master id differs from {@code jobMasterId}
 */
@Override
public CompletableFuture<Acknowledge> declareRequiredResources(JobMasterId jobMasterId, ResourceRequirements resourceRequirements, Time timeout) {
    final JobID jobId = resourceRequirements.getJobId();
    final JobManagerRegistration registration = jobManagerRegistrations.get(jobId);

    if (registration == null) {
        return FutureUtils.completedExceptionally(new ResourceManagerException("Could not find registered job manager for job " + jobId + '.'));
    }

    final JobMasterId registeredJobMasterId = registration.getJobMasterId();
    if (!Objects.equals(jobMasterId, registeredJobMasterId)) {
        return FutureUtils.completedExceptionally(new ResourceManagerException("The job leader's id " + registeredJobMasterId + " does not match the received id " + jobMasterId + '.'));
    }

    slotManager.processResourceRequirements(resourceRequirements);
    return CompletableFuture.completedFuture(Acknowledge.get());
}
use of org.apache.flink.runtime.resourcemanager.registration.JobManagerRegistration in project flink by apache.
the class ResourceManager method disconnectJobManager.
/**
 * Removes the job manager registered for the given job and informs it that it has been
 * disconnected from the resource manager. If no job manager is registered for the job, only a
 * debug message is logged.
 *
 * @param jobId identifying the job whose leader shall be disconnected
 * @param cause reason for the disconnect, forwarded to the job manager
 */
protected void disconnectJobManager(JobID jobId, Exception cause) {
    final JobManagerRegistration registration = jobManagerRegistrations.remove(jobId);

    if (registration == null) {
        log.debug("There was no registered job manager for job {}.", jobId);
        return;
    }

    final JobMasterGateway gateway = registration.getJobManagerGateway();
    log.info("Disconnect job manager {}@{} for job {} from the resource manager.", registration.getLeaderID(), gateway.getAddress(), jobId);

    // tell the job manager about the disconnect
    gateway.disconnectResourceManager(registration.getLeaderID(), getLeaderSessionId(), cause);
}
Aggregations