Use of org.apache.flink.runtime.registration.RegistrationResponse in the Apache Flink project.
Class ResourceManager, method registerJobManager.
// ------------------------------------------------------------------------
// RPC methods
// ------------------------------------------------------------------------
/**
 * Registers the job manager of a job at this resource manager.
 *
 * <p>The request is declined immediately when {@code isValid(resourceManagerLeaderId)} is false.
 * Otherwise the job is added to the job leader id service (if it is not contained yet), a
 * connection to the job manager is opened, and once both the gateway and the job's leader id are
 * available the registration is recorded in {@code jobManagerRegistrations} on the main thread
 * executor.
 *
 * @param resourceManagerLeaderId leader id the caller expects this resource manager to have
 * @param jobManagerLeaderId leader id of the job manager that wants to register
 * @param jobManagerAddress address used to connect to the job manager
 * @param jobId id of the job the job manager is responsible for
 * @return future with the registration response; it is completed exceptionally with a
 *     {@code ResourceManagerException} if the job cannot be added to the job leader id service
 *     or the leader id future cannot be obtained, and it yields a
 *     {@code RegistrationResponse.Decline} if one of the combined futures fails
 */
@RpcMethod
public Future<RegistrationResponse> registerJobManager(final UUID resourceManagerLeaderId, final UUID jobManagerLeaderId, final String jobManagerAddress, final JobID jobId) {
    checkNotNull(resourceManagerLeaderId);
    checkNotNull(jobManagerLeaderId);
    checkNotNull(jobManagerAddress);
    checkNotNull(jobId);

    // Guard: messages addressed to a different resource manager leader are declined right away.
    if (!isValid(resourceManagerLeaderId)) {
        log.debug("Discard register job manager message from {}, because the leader id " + "{} did not match the expected leader id {}.", jobManagerAddress, resourceManagerLeaderId, leaderSessionId);
        return FlinkCompletableFuture.<RegistrationResponse>completed(new RegistrationResponse.Decline("Resource manager leader id did not match."));
    }

    // Make sure the job is tracked by the leader id service before asking for its leader id.
    if (!jobLeaderIdService.containsJob(jobId)) {
        try {
            jobLeaderIdService.addJob(jobId);
        } catch (Exception e) {
            ResourceManagerException addJobFailure = new ResourceManagerException("Could not add the job " + jobId + " to the job id leader service.", e);
            onFatalErrorAsync(addJobFailure);
            log.error("Could not add job {} to job leader id service.", jobId, e);
            return FlinkCompletableFuture.completedExceptionally(addJobFailure);
        }
    }

    log.info("Registering job manager {}@{} for job {}.", jobManagerLeaderId, jobManagerAddress, jobId);

    Future<UUID> leaderIdFuture;
    try {
        leaderIdFuture = jobLeaderIdService.getLeaderId(jobId);
    } catch (Exception e) {
        // we cannot check the job leader id so let's fail
        // TODO: Maybe it's also ok to skip this check in case that we cannot check the leader id
        ResourceManagerException leaderIdFailure = new ResourceManagerException("Cannot obtain the " + "job leader id future to verify the correct job leader.", e);
        onFatalErrorAsync(leaderIdFailure);
        log.debug("Could not obtain the job leader id future to verify the correct job leader.");
        return FlinkCompletableFuture.completedExceptionally(leaderIdFailure);
    }

    Future<JobMasterGateway> gatewayFuture = getRpcService().connect(jobManagerAddress, JobMasterGateway.class);

    // The registration bookkeeping is executed on the main thread executor.
    Future<RegistrationResponse> registrationFuture = gatewayFuture.thenCombineAsync(leaderIdFuture, new BiFunction<JobMasterGateway, UUID, RegistrationResponse>() {
        @Override
        public RegistrationResponse apply(JobMasterGateway gateway, UUID currentJobLeaderId) {
            // The leader id is re-validated here, when the combined futures have completed.
            if (!isValid(resourceManagerLeaderId)) {
                log.debug("The resource manager leader id changed {}. Discarding job " + "manager registration from {}.", getLeaderSessionId(), jobManagerAddress);
                return new RegistrationResponse.Decline("Resource manager leader id changed.");
            }

            if (!currentJobLeaderId.equals(jobManagerLeaderId)) {
                log.debug("The job manager leader id {} did not match the job " + "leader id {}.", jobManagerLeaderId, currentJobLeaderId);
                return new RegistrationResponse.Decline("Job manager leader id did not match.");
            }

            boolean storeRegistration = true;
            if (jobManagerRegistrations.containsKey(jobId)) {
                JobManagerRegistration previous = jobManagerRegistrations.get(jobId);
                if (previous.getLeaderID().equals(currentJobLeaderId)) {
                    // same registration
                    log.debug("Job manager {}@{} was already registered.", jobManagerLeaderId, jobManagerAddress);
                    storeRegistration = false;
                } else {
                    // tell old job manager that he is no longer the job leader
                    disconnectJobManager(previous.getJobID(), new Exception("New job leader for job " + jobId + " found."));
                }
            }

            if (storeRegistration) {
                // either a brand-new registration or the replacement of an outdated leader
                jobManagerRegistrations.put(jobId, new JobManagerRegistration(jobId, currentJobLeaderId, gateway));
            }

            log.info("Registered job manager {}@{} for job {}.", jobManagerLeaderId, jobManagerAddress, jobId);
            return new JobMasterRegistrationSuccess(resourceManagerConfiguration.getHeartbeatInterval().toMilliseconds(), getLeaderSessionId());
        }
    }, getMainThreadExecutor());

    // handle exceptions which might have occurred in one of the futures inputs of combine
    return registrationFuture.handleAsync(new BiFunction<RegistrationResponse, Throwable, RegistrationResponse>() {
        @Override
        public RegistrationResponse apply(RegistrationResponse registrationResponse, Throwable throwable) {
            if (throwable == null) {
                return registrationResponse;
            }

            // The stack trace is only included when debug logging is enabled.
            if (log.isDebugEnabled()) {
                log.debug("Registration of job manager {}@{} failed.", jobManagerLeaderId, jobManagerAddress, throwable);
            } else {
                log.info("Registration of job manager {}@{} failed.", jobManagerLeaderId, jobManagerAddress);
            }
            return new RegistrationResponse.Decline(throwable.getMessage());
        }
    }, getRpcService().getExecutor());
}
Use of org.apache.flink.runtime.registration.RegistrationResponse in the Apache Flink project.
Class SlotProtocolTest, method testSlotAvailableRequest.
/**
 * Tests whether
 * 1) a SlotRequest is routed to the SlotManager
 * 2) a SlotRequest is confirmed
 * 3) a SlotRequest leads to an allocation of a registered slot
 * 4) a SlotRequest is routed to the TaskExecutor
 *
 * <p>The test starts a spied {@code StandaloneResourceManager}, grants it leadership, registers
 * a mocked job manager and a mocked task executor with one slot, and then issues a single slot
 * request through the resource manager.
 */
@Test
public void testSlotAvailableRequest() throws Exception {
    final String rmAddress = "/rm1";
    final String jmAddress = "/jm1";
    final String tmAddress = "/tm1";
    final JobID jobID = new JobID();

    testRpcService.registerGateway(jmAddress, mock(JobMasterGateway.class));

    final TestingHighAvailabilityServices testingHaServices = new TestingHighAvailabilityServices();
    final UUID rmLeaderID = UUID.randomUUID();
    final UUID jmLeaderID = UUID.randomUUID();
    TestingLeaderElectionService rmLeaderElectionService = configureHA(testingHaServices, jobID, rmAddress, rmLeaderID, jmAddress, jmLeaderID);

    // The task executor never answers the slot request: the returned future stays incomplete.
    TaskExecutorGateway taskExecutorGateway = mock(TaskExecutorGateway.class);
    Mockito.when(taskExecutorGateway.requestSlot(any(SlotID.class), any(JobID.class), any(AllocationID.class), any(String.class), any(UUID.class), any(Time.class))).thenReturn(new FlinkCompletableFuture<TMSlotRequestReply>());
    testRpcService.registerGateway(tmAddress, taskExecutorGateway);

    ResourceManagerConfiguration resourceManagerConfiguration = new ResourceManagerConfiguration(Time.seconds(5L), Time.seconds(5L), Time.minutes(5L));
    JobLeaderIdService jobLeaderIdService = new JobLeaderIdService(testingHaServices, testRpcService.getScheduledExecutor(), resourceManagerConfiguration.getJobTimeout());
    TestingSlotManagerFactory slotManagerFactory = new TestingSlotManagerFactory();
    ResourceManager<ResourceID> resourceManager = Mockito.spy(new StandaloneResourceManager(testRpcService, resourceManagerConfiguration, testingHaServices, slotManagerFactory, mock(MetricRegistry.class), jobLeaderIdService, mock(FatalErrorHandler.class)));
    resourceManager.start();
    rmLeaderElectionService.isLeader(rmLeaderID);

    // NOTE(review): fixed sleep, presumably to let the leadership grant take effect before
    // registering; replacing it with an explicit wait would make the test less timing-sensitive.
    Thread.sleep(1000);

    Future<RegistrationResponse> registrationFuture = resourceManager.registerJobManager(rmLeaderID, jmLeaderID, jmAddress, jobID);
    try {
        registrationFuture.get(5L, TimeUnit.SECONDS);
    } catch (Exception e) {
        // Keep the original failure as cause so that the reason shows up in the test report.
        throw new AssertionError("JobManager registration Future didn't become ready.", e);
    }

    final SlotManager slotManager = slotManagerFactory.slotManager;
    final ResourceID resourceID = ResourceID.generate();
    final AllocationID allocationID = new AllocationID();
    final ResourceProfile resourceProfile = new ResourceProfile(1.0, 100);
    final SlotID slotID = new SlotID(resourceID, 0);
    final SlotStatus slotStatus = new SlotStatus(slotID, resourceProfile);
    final SlotReport slotReport = new SlotReport(Collections.singletonList(slotStatus));

    // register slot at SlotManager
    slotManager.registerTaskExecutor(resourceID, new TaskExecutorRegistration(taskExecutorGateway), slotReport);

    SlotRequest slotRequest = new SlotRequest(jobID, allocationID, resourceProfile);
    RMSlotRequestReply slotRequestReply = resourceManager.requestSlot(jmLeaderID, rmLeaderID, slotRequest);

    // 1) a SlotRequest is routed to the SlotManager
    verify(slotManager).requestSlot(slotRequest);

    // 2) a SlotRequest is confirmed (expected value first, actual value second)
    Assert.assertEquals(allocationID, slotRequestReply.getAllocationID());

    // 3) a SlotRequest leads to an allocation of a registered slot
    Assert.assertTrue(slotManager.isAllocated(slotID));
    Assert.assertTrue(slotManager.isAllocated(allocationID));

    // 4) a SlotRequest is routed to the TaskExecutor
    verify(taskExecutorGateway, timeout(5000)).requestSlot(eq(slotID), eq(jobID), eq(allocationID), any(String.class), any(UUID.class), any(Time.class));
}
Use of org.apache.flink.runtime.registration.RegistrationResponse in the Apache Flink project.
Class ResourceManagerJobMasterTest, method testRegisterJobMasterWithUnmatchedLeaderSessionId1.
/**
 * Verifies that a job master registration which carries a resource manager leader session id
 * other than the one obtained from {@code grantResourceManagerLeadership} (a freshly generated
 * random UUID is used) is answered with a {@code RegistrationResponse.Decline}.
 */
@Test
public void testRegisterJobMasterWithUnmatchedLeaderSessionId1() throws Exception {
    final String jobMasterAddress = "/jobMasterAddress1";
    final JobID jobID = mockJobMaster(jobMasterAddress);

    final TestingLeaderElectionService rmLeaderElectionService = new TestingLeaderElectionService();
    final UUID jobMasterLeaderId = UUID.randomUUID();
    final TestingLeaderRetrievalService jmLeaderRetrievalService = new TestingLeaderRetrievalService(jobMasterAddress, jobMasterLeaderId);
    final TestingFatalErrorHandler fatalErrorHandler = new TestingFatalErrorHandler();

    final ResourceManager resourceManager = createAndStartResourceManager(rmLeaderElectionService, jobID, jmLeaderRetrievalService, fatalErrorHandler);

    // Leadership has to be granted; the session id handed back is not needed by this test.
    grantResourceManagerLeadership(rmLeaderElectionService);

    // register with a leader session id that was generated independently of the granted one
    final UUID unrelatedLeaderSessionId = UUID.randomUUID();
    Future<RegistrationResponse> declinedFuture = resourceManager.registerJobManager(unrelatedLeaderSessionId, jobMasterLeaderId, jobMasterAddress, jobID);

    // the registration must be declined rather than fail with an exception
    final RegistrationResponse response = declinedFuture.get(5, TimeUnit.SECONDS);
    assertTrue(response instanceof RegistrationResponse.Decline);

    // surface any fatal error that was reported while the test was running
    if (fatalErrorHandler.hasExceptionOccurred()) {
        fatalErrorHandler.rethrowError();
    }
}
Use of org.apache.flink.runtime.registration.RegistrationResponse in the Apache Flink project.
Class ResourceManager, method registerJobMaster.
// ------------------------------------------------------------------------
// RPC methods
// ------------------------------------------------------------------------
/**
 * Registers a JobMaster for the given job at this resource manager.
 *
 * <p>The job is added to the job leader id service if it is not contained yet. Afterwards a
 * connection to the JobMaster is opened and, once both the gateway and the leading JobMaster id
 * of the job are available, the received {@code jobMasterId} is compared against the leading
 * one on the main thread executor. Only a matching id is passed on to
 * {@code registerJobMasterInternal}.
 *
 * @param jobMasterId id of the JobMaster that wants to register
 * @param jobManagerResourceId resource id of the registering JobMaster
 * @param jobManagerAddress address used to connect to the JobMaster
 * @param jobId id of the job the JobMaster is responsible for
 * @param timeout timeout of the call; not read by this method body
 * @return future with the registration response; it is completed exceptionally with a
 *     {@code ResourceManagerException} if the job cannot be added to the job leader id service
 *     or the leader id future cannot be obtained, and it yields a
 *     {@code RegistrationResponse.Failure} if the ids do not match or one of the combined
 *     futures fails
 */
@Override
public CompletableFuture<RegistrationResponse> registerJobMaster(final JobMasterId jobMasterId, final ResourceID jobManagerResourceId, final String jobManagerAddress, final JobID jobId, final Time timeout) {
    checkNotNull(jobMasterId);
    checkNotNull(jobManagerResourceId);
    checkNotNull(jobManagerAddress);
    checkNotNull(jobId);

    // Make sure the job is tracked by the leader id service before asking for its leader id.
    if (!jobLeaderIdService.containsJob(jobId)) {
        try {
            jobLeaderIdService.addJob(jobId);
        } catch (Exception e) {
            ResourceManagerException addJobFailure = new ResourceManagerException("Could not add the job " + jobId + " to the job id leader service.", e);
            onFatalError(addJobFailure);
            log.error("Could not add job {} to job leader id service.", jobId, e);
            return FutureUtils.completedExceptionally(addJobFailure);
        }
    }

    log.info("Registering job manager {}@{} for job {}.", jobMasterId, jobManagerAddress, jobId);

    CompletableFuture<JobMasterId> leadingIdFuture;
    try {
        leadingIdFuture = jobLeaderIdService.getLeaderId(jobId);
    } catch (Exception e) {
        // we cannot check the job leader id so let's fail
        // TODO: Maybe it's also ok to skip this check in case that we cannot check the leader
        // id
        ResourceManagerException leaderIdFailure = new ResourceManagerException("Cannot obtain the " + "job leader id future to verify the correct job leader.", e);
        onFatalError(leaderIdFailure);
        log.debug("Could not obtain the job leader id future to verify the correct job leader.");
        return FutureUtils.completedExceptionally(leaderIdFailure);
    }

    CompletableFuture<JobMasterGateway> gatewayFuture = getRpcService().connect(jobManagerAddress, jobMasterId, JobMasterGateway.class);

    // The id comparison and the actual registration are executed on the main thread executor.
    CompletableFuture<RegistrationResponse> registrationFuture = gatewayFuture.thenCombineAsync(leadingIdFuture, (JobMasterGateway gateway, JobMasterId leadingJobMasterId) -> {
        if (!Objects.equals(leadingJobMasterId, jobMasterId)) {
            final String declineMessage = String.format("The leading JobMaster id %s did not match the received JobMaster id %s. " + "This indicates that a JobMaster leader change has happened.", leadingJobMasterId, jobMasterId);
            log.debug(declineMessage);
            return new RegistrationResponse.Failure(new FlinkException(declineMessage));
        }
        return registerJobMasterInternal(gateway, jobId, jobManagerAddress, jobManagerResourceId);
    }, getMainThreadExecutor());

    // handle exceptions which might have occurred in one of the futures inputs of combine
    return registrationFuture.handleAsync((RegistrationResponse registrationResponse, Throwable throwable) -> {
        if (throwable == null) {
            return registrationResponse;
        }

        // The stack trace is only included when debug logging is enabled.
        if (log.isDebugEnabled()) {
            log.debug("Registration of job manager {}@{} failed.", jobMasterId, jobManagerAddress, throwable);
        } else {
            log.info("Registration of job manager {}@{} failed.", jobMasterId, jobManagerAddress);
        }
        return new RegistrationResponse.Failure(throwable);
    }, ioExecutor);
}
Use of org.apache.flink.runtime.registration.RegistrationResponse in the Apache Flink project.
Class JobMaster, method registerTaskManager.
/**
 * Registers a TaskManager at this JobMaster.
 *
 * <p>The attempt is rejected when {@code jobId} differs from the id of this JobMaster's job
 * graph and fails when the TaskManager location cannot be resolved. A TaskManager that is
 * already registered with the same session id is acknowledged without reconnecting; one that is
 * registered with a different session id is disconnected first. In all remaining cases a
 * connection to the TaskManager is opened and, on the main thread executor, the TaskManager is
 * registered at the slot pool service, stored in {@code registeredTaskManagers} and monitored
 * as heartbeat target.
 *
 * @param jobId id of the job the TaskManager wants to register for
 * @param taskManagerRegistrationInformation location, RPC address and session id of the
 *     TaskManager
 * @param timeout timeout of the call; not read by this method body
 * @return future with the registration response
 */
@Override
public CompletableFuture<RegistrationResponse> registerTaskManager(final JobID jobId, final TaskManagerRegistrationInformation taskManagerRegistrationInformation, final Time timeout) {
    if (!jobGraph.getJobID().equals(jobId)) {
        log.debug("Rejecting TaskManager registration attempt because of wrong job id {}.", jobId);
        return CompletableFuture.completedFuture(new JMTMRegistrationRejection(String.format("The JobManager is not responsible for job %s. Maybe the TaskManager used outdated connection information.", jobId)));
    }

    final TaskManagerLocation taskManagerLocation;
    try {
        taskManagerLocation = resolveTaskManagerLocation(taskManagerRegistrationInformation.getUnresolvedTaskManagerLocation());
    } catch (FlinkException exception) {
        log.error("Could not accept TaskManager registration.", exception);
        return CompletableFuture.completedFuture(new RegistrationResponse.Failure(exception));
    }

    final ResourceID taskManagerId = taskManagerLocation.getResourceID();
    final UUID sessionId = taskManagerRegistrationInformation.getTaskManagerSession();

    final TaskManagerRegistration knownRegistration = registeredTaskManagers.get(taskManagerId);
    final boolean alreadyRegistered = knownRegistration != null;

    if (alreadyRegistered && knownRegistration.getSessionId().equals(sessionId)) {
        // Same TaskManager, same session: acknowledge without opening a new connection.
        log.debug("Ignoring registration attempt of TaskManager {} with the same session id {}.", taskManagerId, sessionId);
        return CompletableFuture.completedFuture(new JMTMRegistrationSuccess(resourceId));
    }

    if (alreadyRegistered) {
        // Known TaskManager with a different session id: drop the old connection first.
        disconnectTaskManager(taskManagerId, new FlinkException("A registered TaskManager re-registered with a new session id. This indicates a restart of the TaskManager. Closing the old connection."));
    }

    final CompletableFuture<TaskExecutorGateway> gatewayFuture = getRpcService().connect(taskManagerRegistrationInformation.getTaskManagerRpcAddress(), TaskExecutorGateway.class);

    // The bookkeeping below is executed on the main thread executor.
    return gatewayFuture.handleAsync((TaskExecutorGateway taskExecutorGateway, Throwable throwable) -> {
        if (throwable != null) {
            return new RegistrationResponse.Failure(throwable);
        }

        slotPoolService.registerTaskManager(taskManagerId);

        final TaskManagerRegistration newRegistration = TaskManagerRegistration.create(taskManagerLocation, taskExecutorGateway, sessionId);
        registeredTaskManagers.put(taskManagerId, newRegistration);

        // monitor the task manager as heartbeat target
        taskManagerHeartbeatManager.monitorTarget(taskManagerId, new TaskExecutorHeartbeatSender(taskExecutorGateway));

        return new JMTMRegistrationSuccess(resourceId);
    }, getMainThreadExecutor());
}
Aggregations