Search in sources :

Example 6 with RpcMethod

use of org.apache.flink.runtime.rpc.RpcMethod in project flink by apache.

the class ResourceManager method requestSlot.

/**
 * Handles a slot request sent to the resource manager on behalf of a job.
 *
 * <p>The request is forwarded to the slot manager only if a job manager is registered
 * for the request's job, the given job master leader id equals the leader id stored in
 * that registration, and the given resource manager leader id equals this resource
 * manager's current leader session id. In every other case the request is rejected.
 *
 * @param jobMasterLeaderID leader id of the job master issuing the request
 * @param resourceManagerLeaderID leader id of the resource manager the request is addressed to
 * @param slotRequest the slot request (resource profile, job id and allocation id)
 * @return the reply produced by the slot manager, or an {@code RMSlotRequestRejected}
 *         carrying the request's allocation id if the request was not accepted
 */
@RpcMethod
public RMSlotRequestReply requestSlot(UUID jobMasterLeaderID, UUID resourceManagerLeaderID, SlotRequest slotRequest) {
    final JobID jobId = slotRequest.getJobId();
    log.info("Request slot with profile {} for job {} with allocation id {}.", slotRequest.getResourceProfile(), jobId, slotRequest.getAllocationId());

    final JobManagerRegistration registration = jobManagerRegistrations.get(jobId);

    // Checks are evaluated left to right and short-circuit, exactly as a single condition would.
    final boolean accepted = registration != null
        && jobMasterLeaderID.equals(registration.getLeaderID())
        && resourceManagerLeaderID.equals(leaderSessionId);

    if (!accepted) {
        log.info("Ignoring slot request for unknown JobMaster with JobID {}", jobId);
        return new RMSlotRequestRejected(slotRequest.getAllocationId());
    }

    return slotManager.requestSlot(slotRequest);
}
Also used : RMSlotRequestRejected(org.apache.flink.runtime.resourcemanager.messages.jobmanager.RMSlotRequestRejected) JobID(org.apache.flink.api.common.JobID) JobManagerRegistration(org.apache.flink.runtime.resourcemanager.registration.JobManagerRegistration) RpcMethod(org.apache.flink.runtime.rpc.RpcMethod)

Example 7 with RpcMethod

use of org.apache.flink.runtime.rpc.RpcMethod in project flink by apache.

the class ResourceManager method registerJobManager.

// ------------------------------------------------------------------------
//  RPC methods
// ------------------------------------------------------------------------
/**
 * Registers a job manager (job master) for the given job at this resource manager.
 *
 * <p>If the given resource manager leader id is not valid, the registration is declined
 * immediately. Otherwise the job is added to the job leader id service (if it is not
 * already known there), a connection to the job manager is opened and the job's current
 * leader id is retrieved. Once both are available, the registration is accepted only if
 * the resource manager leader id is still valid and the retrieved job leader id equals
 * {@code jobManagerLeaderId}. A previously registered job manager with a different leader
 * id is disconnected before the new registration is stored.
 *
 * @param resourceManagerLeaderId leader id of the resource manager the caller wants to register at
 * @param jobManagerLeaderId leader id of the registering job manager
 * @param jobManagerAddress address under which the job manager can be reached
 * @param jobId id of the job the job manager is responsible for
 * @return future with the registration response: a {@code JobMasterRegistrationSuccess} if the
 *         job manager was registered, a {@code RegistrationResponse.Decline} if a leader id did
 *         not match or one of the asynchronous steps failed; the future is completed
 *         exceptionally with a {@code ResourceManagerException} if the job could not be added to
 *         the job leader id service or the job leader id future could not be obtained
 */
@RpcMethod
public Future<RegistrationResponse> registerJobManager(final UUID resourceManagerLeaderId, final UUID jobManagerLeaderId, final String jobManagerAddress, final JobID jobId) {
    checkNotNull(resourceManagerLeaderId);
    checkNotNull(jobManagerLeaderId);
    checkNotNull(jobManagerAddress);
    checkNotNull(jobId);
    if (isValid(resourceManagerLeaderId)) {
        if (!jobLeaderIdService.containsJob(jobId)) {
            try {
                jobLeaderIdService.addJob(jobId);
            } catch (Exception e) {
                ResourceManagerException exception = new ResourceManagerException("Could not add the job " + jobId + " to the job id leader service.", e);
                onFatalErrorAsync(exception);
                log.error("Could not add job {} to job leader id service.", jobId, e);
                return FlinkCompletableFuture.completedExceptionally(exception);
            }
        }
        log.info("Registering job manager {}@{} for job {}.", jobManagerLeaderId, jobManagerAddress, jobId);
        Future<UUID> jobLeaderIdFuture;
        try {
            jobLeaderIdFuture = jobLeaderIdService.getLeaderId(jobId);
        } catch (Exception e) {
            // we cannot check the job leader id so let's fail
            // TODO: Maybe it's also ok to skip this check in case that we cannot check the leader id
            ResourceManagerException exception = new ResourceManagerException("Cannot obtain the " + "job leader id future to verify the correct job leader.", e);
            onFatalErrorAsync(exception);
            // pass the cause along so the log entry shows why the lookup failed
            log.debug("Could not obtain the job leader id future to verify the correct job leader.", e);
            return FlinkCompletableFuture.completedExceptionally(exception);
        }
        Future<JobMasterGateway> jobMasterGatewayFuture = getRpcService().connect(jobManagerAddress, JobMasterGateway.class);
        // the combine function is executed by the main thread executor, where it reads and
        // updates jobManagerRegistrations
        Future<RegistrationResponse> registrationResponseFuture = jobMasterGatewayFuture.thenCombineAsync(jobLeaderIdFuture, new BiFunction<JobMasterGateway, UUID, RegistrationResponse>() {

            @Override
            public RegistrationResponse apply(JobMasterGateway jobMasterGateway, UUID jobLeaderId) {
                // re-check: the resource manager leadership may have changed while waiting for the futures
                if (isValid(resourceManagerLeaderId)) {
                    if (jobLeaderId.equals(jobManagerLeaderId)) {
                        // single lookup instead of containsKey followed by get
                        JobManagerRegistration oldJobManagerRegistration = jobManagerRegistrations.get(jobId);
                        if (oldJobManagerRegistration != null) {
                            if (oldJobManagerRegistration.getLeaderID().equals(jobLeaderId)) {
                                // same registration
                                log.debug("Job manager {}@{} was already registered.", jobManagerLeaderId, jobManagerAddress);
                            } else {
                                // tell old job manager that he is no longer the job leader
                                disconnectJobManager(oldJobManagerRegistration.getJobID(), new Exception("New job leader for job " + jobId + " found."));
                                JobManagerRegistration jobManagerRegistration = new JobManagerRegistration(jobId, jobLeaderId, jobMasterGateway);
                                jobManagerRegistrations.put(jobId, jobManagerRegistration);
                            }
                        } else {
                            // new registration for the job
                            JobManagerRegistration jobManagerRegistration = new JobManagerRegistration(jobId, jobLeaderId, jobMasterGateway);
                            jobManagerRegistrations.put(jobId, jobManagerRegistration);
                        }
                        log.info("Registered job manager {}@{} for job {}.", jobManagerLeaderId, jobManagerAddress, jobId);
                        return new JobMasterRegistrationSuccess(resourceManagerConfiguration.getHeartbeatInterval().toMilliseconds(), getLeaderSessionId());
                    } else {
                        log.debug("The job manager leader id {} did not match the job " + "leader id {}.", jobManagerLeaderId, jobLeaderId);
                        return new RegistrationResponse.Decline("Job manager leader id did not match.");
                    }
                } else {
                    log.debug("The resource manager leader id changed {}. Discarding job " + "manager registration from {}.", getLeaderSessionId(), jobManagerAddress);
                    return new RegistrationResponse.Decline("Resource manager leader id changed.");
                }
            }
        }, getMainThreadExecutor());
        // handle exceptions which might have occurred in one of the futures inputs of combine
        return registrationResponseFuture.handleAsync(new BiFunction<RegistrationResponse, Throwable, RegistrationResponse>() {

            @Override
            public RegistrationResponse apply(RegistrationResponse registrationResponse, Throwable throwable) {
                if (throwable != null) {
                    // include the stack trace only when debug logging is enabled
                    if (log.isDebugEnabled()) {
                        log.debug("Registration of job manager {}@{} failed.", jobManagerLeaderId, jobManagerAddress, throwable);
                    } else {
                        log.info("Registration of job manager {}@{} failed.", jobManagerLeaderId, jobManagerAddress);
                    }
                    return new RegistrationResponse.Decline(throwable.getMessage());
                } else {
                    return registrationResponse;
                }
            }
        }, getRpcService().getExecutor());
    } else {
        log.debug("Discard register job manager message from {}, because the leader id " + "{} did not match the expected leader id {}.", jobManagerAddress, resourceManagerLeaderId, leaderSessionId);
        return FlinkCompletableFuture.<RegistrationResponse>completed(new RegistrationResponse.Decline("Resource manager leader id did not match."));
    }
}
Also used : JobMasterRegistrationSuccess(org.apache.flink.runtime.jobmaster.JobMasterRegistrationSuccess) ResourceManagerException(org.apache.flink.runtime.resourcemanager.exceptions.ResourceManagerException) JobMasterGateway(org.apache.flink.runtime.jobmaster.JobMasterGateway) ResourceManagerException(org.apache.flink.runtime.resourcemanager.exceptions.ResourceManagerException) LeaderIdMismatchException(org.apache.flink.runtime.highavailability.LeaderIdMismatchException) JobManagerRegistration(org.apache.flink.runtime.resourcemanager.registration.JobManagerRegistration) UUID(java.util.UUID) RegistrationResponse(org.apache.flink.runtime.registration.RegistrationResponse) RpcMethod(org.apache.flink.runtime.rpc.RpcMethod)

Example 8 with RpcMethod

use of org.apache.flink.runtime.rpc.RpcMethod in project flink by apache.

the class SlotPool method releaseTaskManager.

/**
 * Removes a TaskManager from this pool and releases every slot that belongs to it.
 * Intended for the case where a TaskManager is considered "dead" or "abnormal" and its
 * slots must no longer be used.
 *
 * <p>Nothing happens if the TaskManager is not currently registered with this pool.
 *
 * @param resourceID The id of the TaskManager
 */
@RpcMethod
public void releaseTaskManager(final ResourceID resourceID) {
    if (!registeredTaskManagers.remove(resourceID)) {
        // not registered: nothing to clean up
        return;
    }

    // drop the idle slots first, then release the ones that were handed out
    availableSlots.removeAllForTaskManager(resourceID);
    for (Slot releasedSlot : allocatedSlots.removeSlotsForTaskManager(resourceID)) {
        releasedSlot.releaseSlot();
    }
}
Also used : AllocatedSlot(org.apache.flink.runtime.jobmanager.slots.AllocatedSlot) RpcMethod(org.apache.flink.runtime.rpc.RpcMethod)

Example 9 with RpcMethod

use of org.apache.flink.runtime.rpc.RpcMethod in project flink by apache.

the class JobMaster method offerSlots.

/**
 * Accepts slot offers from a registered TaskManager and passes them on to the slot pool.
 *
 * <p>Each offer is turned into an {@code AllocatedSlot} for this job master's job, bound to
 * the TaskManager's location and to a gateway created from the TaskManager's executor
 * gateway and the given leader id.
 *
 * @param taskManagerId id of the TaskManager offering the slots
 * @param slots the offered slots
 * @param leaderId leader session id the caller addressed; validated before anything else
 * @return the future returned by the slot pool gateway for these offers
 * @throws Exception if the TaskManager is not registered, or if leader id validation fails
 */
@RpcMethod
public Future<Iterable<SlotOffer>> offerSlots(final ResourceID taskManagerId, final Iterable<SlotOffer> slots, final UUID leaderId) throws Exception {
    validateLeaderSessionId(leaderId);

    final Tuple2<TaskManagerLocation, TaskExecutorGateway> registration = registeredTaskManagers.get(taskManagerId);
    if (registration == null) {
        throw new Exception("Unknown TaskManager " + taskManagerId);
    }

    final TaskManagerLocation location = registration.f0;
    final TaskExecutorGateway executorGateway = registration.f1;
    final JobID jobId = jobGraph.getJobID();

    // one gateway instance is shared by all slots created from this batch of offers
    final RpcTaskManagerGateway sharedGateway = new RpcTaskManagerGateway(executorGateway, leaderId);

    final ArrayList<Tuple2<AllocatedSlot, SlotOffer>> offeredSlots = new ArrayList<>();
    for (SlotOffer offer : slots) {
        final AllocatedSlot allocated = new AllocatedSlot(
            offer.getAllocationId(),
            jobId,
            location,
            offer.getSlotIndex(),
            offer.getResourceProfile(),
            sharedGateway);
        offeredSlots.add(new Tuple2<>(allocated, offer));
    }

    return slotPoolGateway.offerSlots(offeredSlots);
}
Also used : AllocatedSlot(org.apache.flink.runtime.jobmanager.slots.AllocatedSlot) SlotOffer(org.apache.flink.runtime.taskexecutor.slot.SlotOffer) TaskManagerLocation(org.apache.flink.runtime.taskmanager.TaskManagerLocation) Tuple2(org.apache.flink.api.java.tuple.Tuple2) ArrayList(java.util.ArrayList) TaskExecutorGateway(org.apache.flink.runtime.taskexecutor.TaskExecutorGateway) TimeoutException(java.util.concurrent.TimeoutException) CheckpointException(org.apache.flink.runtime.checkpoint.CheckpointException) LeaderIdMismatchException(org.apache.flink.runtime.highavailability.LeaderIdMismatchException) PartitionProducerDisposedException(org.apache.flink.runtime.jobmanager.PartitionProducerDisposedException) JobExecutionException(org.apache.flink.runtime.client.JobExecutionException) IOException(java.io.IOException) JobID(org.apache.flink.api.common.JobID) RpcMethod(org.apache.flink.runtime.rpc.RpcMethod)

Example 10 with RpcMethod

use of org.apache.flink.runtime.rpc.RpcMethod in project flink by apache.

the class SlotPool method offerSlot.

/**
 * Handles a slot offered by a TaskManager, identified by its AllocationID. The
 * AllocationID is originally generated by this pool and travels through the
 * ResourceManager to the TaskManager; it is used to tell the issued allocations apart.
 *
 * <p>The offer is rejected only if it comes from a TaskManager that is not registered
 * with this pool. A repeated offer for an already known slot is acknowledged without any
 * change. Otherwise the slot either completes the pending request waiting for this
 * allocation or, if there is none, is stored as an available slot.
 *
 * @param slot The offered slot
 * @return True if we accept the offering
 */
@RpcMethod
public boolean offerSlot(final AllocatedSlot slot) {
    validateRunsInMainThread();

    final ResourceID offeringTaskManager = slot.getTaskManagerId();
    final AllocationID offeredAllocation = slot.getSlotAllocationId();

    // offers from TaskManagers we do not know (any more) are refused
    if (!registeredTaskManagers.contains(offeringTaskManager)) {
        LOG.debug("Received outdated slot offering [{}] from unregistered TaskManager: {}", offeredAllocation, slot);
        return false;
    }

    // a slot we already hold, in use or idle, counts as a successful offer
    final boolean alreadyKnown = allocatedSlots.contains(offeredAllocation) || availableSlots.contains(offeredAllocation);
    if (alreadyKnown) {
        LOG.debug("Received repeated offer for slot [{}]. Ignoring.", offeredAllocation);
        return true;
    }

    final PendingRequest waitingRequest = pendingRequests.remove(offeredAllocation);
    if (waitingRequest == null) {
        // nobody is waiting for this slot:
        //   - the request may already have been fulfilled by another slot
        //   - or TaskManagers are offering their slots after this pool's owner became leader
        // The slot is kept as available together with the current relative time
        // (presumably so that it can be released once it has idled for too long).
        availableSlots.add(slot, clock.relativeTimeMillis());
        return true;
    }

    // a request was waiting for exactly this allocation: fulfil it, then track the slot
    final SimpleSlot fulfilledSlot = createSimpleSlot(slot, Locality.UNKNOWN);
    waitingRequest.future().complete(fulfilledSlot);
    allocatedSlots.add(fulfilledSlot);
    return true;
}
Also used : ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) AllocationID(org.apache.flink.runtime.clusterframework.types.AllocationID) RpcMethod(org.apache.flink.runtime.rpc.RpcMethod)

Aggregations

RpcMethod (org.apache.flink.runtime.rpc.RpcMethod)18 Task (org.apache.flink.runtime.taskmanager.Task)6 IOException (java.io.IOException)5 TimeoutException (java.util.concurrent.TimeoutException)4 LeaderIdMismatchException (org.apache.flink.runtime.highavailability.LeaderIdMismatchException)4 PartitionProducerDisposedException (org.apache.flink.runtime.jobmanager.PartitionProducerDisposedException)4 JobID (org.apache.flink.api.common.JobID)3 CheckpointException (org.apache.flink.runtime.checkpoint.CheckpointException)3 JobExecutionException (org.apache.flink.runtime.client.JobExecutionException)3 AllocatedSlot (org.apache.flink.runtime.jobmanager.slots.AllocatedSlot)3 CheckpointException (org.apache.flink.runtime.taskexecutor.exceptions.CheckpointException)3 TaskException (org.apache.flink.runtime.taskexecutor.exceptions.TaskException)3 CheckpointCoordinator (org.apache.flink.runtime.checkpoint.CheckpointCoordinator)2 ResourceID (org.apache.flink.runtime.clusterframework.types.ResourceID)2 Execution (org.apache.flink.runtime.executiongraph.Execution)2 AcknowledgeCheckpoint (org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint)2 DeclineCheckpoint (org.apache.flink.runtime.messages.checkpoint.DeclineCheckpoint)2 JobManagerRegistration (org.apache.flink.runtime.resourcemanager.registration.JobManagerRegistration)2 PartitionException (org.apache.flink.runtime.taskexecutor.exceptions.PartitionException)2 TaskSubmissionException (org.apache.flink.runtime.taskexecutor.exceptions.TaskSubmissionException)2