use of org.apache.flink.runtime.rpc.RpcMethod in project flink by apache.
the class ResourceManager method requestSlot.
/**
 * Requests a slot from the resource manager on behalf of a job master.
 *
 * <p>The request is handed to the slot manager only if all of the following hold:
 * a job manager is registered for the job of the request, the given job master leader id
 * equals the leader id of that registration, and the given resource manager leader id equals
 * this resource manager's current leader session id. In every other case the request is
 * rejected and the reason is logged.
 *
 * @param jobMasterLeaderID Leader id of the job master which sent the request
 * @param resourceManagerLeaderID Resource manager leader id the request was addressed to
 * @param slotRequest Slot request
 * @return Reply of the slot manager, or an {@code RMSlotRequestRejected} carrying the
 *         allocation id of the request if the request was not accepted
 */
@RpcMethod
public RMSlotRequestReply requestSlot(UUID jobMasterLeaderID, UUID resourceManagerLeaderID, SlotRequest slotRequest) {
    final JobID jobId = slotRequest.getJobId();

    log.info("Request slot with profile {} for job {} with allocation id {}.", slotRequest.getResourceProfile(), jobId, slotRequest.getAllocationId());

    final JobManagerRegistration jobManagerRegistration = jobManagerRegistrations.get(jobId);

    // The checks are evaluated in the same order as before; only the log output now
    // tells the three rejection reasons apart instead of always blaming an unknown JobMaster.
    if (jobManagerRegistration == null) {
        log.info("Ignoring slot request for unknown JobMaster with JobID {}", jobId);
    } else if (!jobMasterLeaderID.equals(jobManagerRegistration.getLeaderID())) {
        log.info("Ignoring slot request for job {} because the job master leader id {} did not match the registered leader id {}.", jobId, jobMasterLeaderID, jobManagerRegistration.getLeaderID());
    } else if (!resourceManagerLeaderID.equals(leaderSessionId)) {
        log.info("Ignoring slot request for job {} because the resource manager leader id {} did not match the expected leader id {}.", jobId, resourceManagerLeaderID, leaderSessionId);
    } else {
        return slotManager.requestSlot(slotRequest);
    }

    return new RMSlotRequestRejected(slotRequest.getAllocationId());
}
use of org.apache.flink.runtime.rpc.RpcMethod in project flink by apache.
the class ResourceManager method registerJobManager.
// ------------------------------------------------------------------------
// RPC methods
// ------------------------------------------------------------------------
/**
 * Registers a job manager for the given job at this resource manager.
 *
 * <p>The call is declined right away if {@code isValid(resourceManagerLeaderId)} does not hold.
 * Otherwise the job is added to the job leader id service (if it is not contained yet), a
 * connection to the job master at {@code jobManagerAddress} is opened and, once both the
 * gateway and the job's current leader id are available, the registration is stored. An
 * existing registration with a different leader id is disconnected first; an existing
 * registration with the same leader id is left untouched.
 *
 * <p>If the job cannot be added to the job leader id service, or the leader id future cannot
 * be obtained, the failure is reported via {@code onFatalErrorAsync} and the returned future
 * is completed exceptionally. Failures of the connect or leader id futures are turned into a
 * {@code RegistrationResponse.Decline} carrying the throwable's message.
 *
 * @param resourceManagerLeaderId Resource manager leader id the caller expects
 * @param jobManagerLeaderId Leader id of the registering job manager
 * @param jobManagerAddress Address under which the job master gateway can be connected to
 * @param jobId Id of the job the job manager is responsible for
 * @return Future with the registration response (success or decline)
 */
@RpcMethod
public Future<RegistrationResponse> registerJobManager(final UUID resourceManagerLeaderId, final UUID jobManagerLeaderId, final String jobManagerAddress, final JobID jobId) {
    checkNotNull(resourceManagerLeaderId);
    checkNotNull(jobManagerLeaderId);
    checkNotNull(jobManagerAddress);
    checkNotNull(jobId);

    if (isValid(resourceManagerLeaderId)) {
        if (!jobLeaderIdService.containsJob(jobId)) {
            try {
                jobLeaderIdService.addJob(jobId);
            } catch (Exception e) {
                ResourceManagerException exception = new ResourceManagerException("Could not add the job " + jobId + " to the job id leader service.", e);
                onFatalErrorAsync(exception);
                log.error("Could not add job {} to job leader id service.", jobId, e);
                return FlinkCompletableFuture.completedExceptionally(exception);
            }
        }

        log.info("Registering job manager {}@{} for job {}.", jobManagerLeaderId, jobManagerAddress, jobId);

        Future<UUID> jobLeaderIdFuture;
        try {
            jobLeaderIdFuture = jobLeaderIdService.getLeaderId(jobId);
        } catch (Exception e) {
            // we cannot check the job leader id so let's fail
            // TODO: Maybe it's also ok to skip this check in case that we cannot check the leader id
            ResourceManagerException exception = new ResourceManagerException("Cannot obtain the " + "job leader id future to verify the correct job leader.", e);
            onFatalErrorAsync(exception);
            // this is a fatal error path: log it at error level together with its cause,
            // just like the failure to add the job above
            log.error("Could not obtain the job leader id future to verify the correct job leader.", e);
            return FlinkCompletableFuture.completedExceptionally(exception);
        }

        Future<JobMasterGateway> jobMasterGatewayFuture = getRpcService().connect(jobManagerAddress, JobMasterGateway.class);

        // the combine function is executed by the main thread executor
        Future<RegistrationResponse> registrationResponseFuture = jobMasterGatewayFuture.thenCombineAsync(jobLeaderIdFuture, new BiFunction<JobMasterGateway, UUID, RegistrationResponse>() {
            @Override
            public RegistrationResponse apply(JobMasterGateway jobMasterGateway, UUID jobLeaderId) {
                // re-check the leader id, it is validated again once the futures have completed
                if (isValid(resourceManagerLeaderId)) {
                    if (jobLeaderId.equals(jobManagerLeaderId)) {
                        JobManagerRegistration oldJobManagerRegistration = jobManagerRegistrations.get(jobId);

                        if (oldJobManagerRegistration != null && oldJobManagerRegistration.getLeaderID().equals(jobLeaderId)) {
                            // same registration
                            log.debug("Job manager {}@{} was already registered.", jobManagerLeaderId, jobManagerAddress);
                        } else {
                            if (oldJobManagerRegistration != null) {
                                // tell old job manager that he is no longer the job leader
                                disconnectJobManager(oldJobManagerRegistration.getJobID(), new Exception("New job leader for job " + jobId + " found."));
                            }

                            // new registration for the job (first one, or replacing the old leader)
                            JobManagerRegistration jobManagerRegistration = new JobManagerRegistration(jobId, jobLeaderId, jobMasterGateway);
                            jobManagerRegistrations.put(jobId, jobManagerRegistration);
                        }

                        log.info("Registered job manager {}@{} for job {}.", jobManagerLeaderId, jobManagerAddress, jobId);
                        return new JobMasterRegistrationSuccess(resourceManagerConfiguration.getHeartbeatInterval().toMilliseconds(), getLeaderSessionId());
                    } else {
                        log.debug("The job manager leader id {} did not match the job " + "leader id {}.", jobManagerLeaderId, jobLeaderId);
                        return new RegistrationResponse.Decline("Job manager leader id did not match.");
                    }
                } else {
                    log.debug("The resource manager leader id changed {}. Discarding job " + "manager registration from {}.", getLeaderSessionId(), jobManagerAddress);
                    return new RegistrationResponse.Decline("Resource manager leader id changed.");
                }
            }
        }, getMainThreadExecutor());

        // handle exceptions which might have occurred in one of the futures inputs of combine
        return registrationResponseFuture.handleAsync(new BiFunction<RegistrationResponse, Throwable, RegistrationResponse>() {
            @Override
            public RegistrationResponse apply(RegistrationResponse registrationResponse, Throwable throwable) {
                if (throwable != null) {
                    // only include the stack trace when debug logging is enabled
                    if (log.isDebugEnabled()) {
                        log.debug("Registration of job manager {}@{} failed.", jobManagerLeaderId, jobManagerAddress, throwable);
                    } else {
                        log.info("Registration of job manager {}@{} failed.", jobManagerLeaderId, jobManagerAddress);
                    }
                    return new RegistrationResponse.Decline(throwable.getMessage());
                } else {
                    return registrationResponse;
                }
            }
        }, getRpcService().getExecutor());
    } else {
        log.debug("Discard register job manager message from {}, because the leader id " + "{} did not match the expected leader id {}.", jobManagerAddress, resourceManagerLeaderId, leaderSessionId);
        return FlinkCompletableFuture.<RegistrationResponse>completed(new RegistrationResponse.Decline("Resource manager leader id did not match."));
    }
}
use of org.apache.flink.runtime.rpc.RpcMethod in project flink by apache.
the class SlotPool method releaseTaskManager.
/**
 * Removes the given TaskManager from this pool. If it was registered, all of its available
 * slots are dropped and every slot allocated from it is released. Intended to be called once
 * a TaskManager is considered "dead" or "abnormal" and its slots should no longer be used.
 *
 * @param resourceID The id of the TaskManager
 */
@RpcMethod
public void releaseTaskManager(final ResourceID resourceID) {
    // nothing to clean up if the TaskManager was not registered at this pool
    if (!registeredTaskManagers.remove(resourceID)) {
        return;
    }

    availableSlots.removeAllForTaskManager(resourceID);

    final Set<Slot> slotsToRelease = allocatedSlots.removeSlotsForTaskManager(resourceID);
    for (Slot allocated : slotsToRelease) {
        allocated.releaseSlot();
    }
}
use of org.apache.flink.runtime.rpc.RpcMethod in project flink by apache.
the class JobMaster method offerSlots.
/**
 * Accepts slot offers from a registered TaskManager and forwards them to the slot pool.
 *
 * <p>Each offer is wrapped into an {@code AllocatedSlot} for this job master's job, paired
 * with its originating offer, and the whole batch is handed to the slot pool gateway.
 *
 * @param taskManagerId Id of the offering TaskManager
 * @param slots The offered slots
 * @param leaderId Leader session id sent along with the offer; validated before anything else
 * @return Future returned by the slot pool gateway for the forwarded offers
 * @throws Exception if the leader session id validation fails or the TaskManager is unknown
 */
@RpcMethod
public Future<Iterable<SlotOffer>> offerSlots(final ResourceID taskManagerId, final Iterable<SlotOffer> slots, final UUID leaderId) throws Exception {
    validateLeaderSessionId(leaderId);

    final Tuple2<TaskManagerLocation, TaskExecutorGateway> registration = registeredTaskManagers.get(taskManagerId);
    if (registration == null) {
        throw new Exception("Unknown TaskManager " + taskManagerId);
    }

    final JobID jobId = jobGraph.getJobID();
    final TaskManagerLocation location = registration.f0;
    // one gateway instance is shared by all slots created from this offer
    final RpcTaskManagerGateway gateway = new RpcTaskManagerGateway(registration.f1, leaderId);

    final ArrayList<Tuple2<AllocatedSlot, SlotOffer>> offeredSlots = new ArrayList<>();
    for (SlotOffer offer : slots) {
        final AllocatedSlot allocatedSlot = new AllocatedSlot(offer.getAllocationId(), jobId, location, offer.getSlotIndex(), offer.getResourceProfile(), gateway);
        offeredSlots.add(new Tuple2<>(allocatedSlot, offer));
    }

    return slotPoolGateway.offerSlots(offeredSlots);
}
use of org.apache.flink.runtime.rpc.RpcMethod in project flink by apache.
the class SlotPool method offerSlot.
/**
 * Handles a slot offered by a TaskManager, identified by the slot's AllocationID.
 *
 * <p>An offer from a TaskManager that is not registered at this pool is rejected. An offer for
 * an allocation id that is already known (allocated or available) is acknowledged without any
 * further action. Otherwise the slot either completes the pending request waiting for that
 * allocation id or, if there is none, is added to the available slots.
 *
 * @param slot The offered slot
 * @return True if we accept the offering
 */
@RpcMethod
public boolean offerSlot(final AllocatedSlot slot) {
    validateRunsInMainThread();

    final ResourceID resourceID = slot.getTaskManagerId();
    final AllocationID allocationID = slot.getSlotAllocationId();

    // offers from TaskManagers which are not registered are rejected
    if (!registeredTaskManagers.contains(resourceID)) {
        LOG.debug("Received outdated slot offering [{}] from unregistered TaskManager: {}", slot.getSlotAllocationId(), slot);
        return false;
    }

    // a slot we already hold is a repeated offer, which still counts as a success
    final boolean alreadyKnown = allocatedSlots.contains(allocationID) || availableSlots.contains(allocationID);
    if (alreadyKnown) {
        LOG.debug("Received repeated offer for slot [{}]. Ignoring.", allocationID);
        return true;
    }

    final PendingRequest waitingRequest = pendingRequests.remove(allocationID);
    if (waitingRequest == null) {
        // nobody is waiting for this slot:
        // - the request may already have been fulfilled by another slot
        // - or we receive slots from TaskManagers after becoming leader
        availableSlots.add(slot, clock.relativeTimeMillis());
        return true;
    }

    // a pending request was waiting for exactly this allocation
    final SimpleSlot fulfilledSlot = createSimpleSlot(slot, Locality.UNKNOWN);
    waitingRequest.future().complete(fulfilledSlot);
    allocatedSlots.add(fulfilledSlot);

    // the offer is accepted in either case
    return true;
}
Aggregations