Search in sources:

Example 1 with RegistrationResponse

use of org.apache.flink.runtime.registration.RegistrationResponse in project flink by apache.

the class ResourceManager method registerJobManager.

// ------------------------------------------------------------------------
//  RPC methods
// ------------------------------------------------------------------------
/**
 * Registers the job manager reachable at {@code jobManagerAddress} for the job {@code jobId}.
 *
 * <p>The request is declined immediately when {@code resourceManagerLeaderId} does not pass
 * {@code isValid}. Otherwise the job is added to the {@code jobLeaderIdService} (unless it is
 * already contained), a connection to the job manager is opened and combined with the job's
 * leader id future, and the actual registration is executed on the main thread executor.
 *
 * @param resourceManagerLeaderId leader id of the resource manager as known by the caller
 * @param jobManagerLeaderId leader id of the job manager that wants to register
 * @param jobManagerAddress address used to connect to the job manager
 * @param jobId id of the job the job manager is responsible for
 * @return future completing with a {@link JobMasterRegistrationSuccess} on success or a
 *         {@link RegistrationResponse.Decline} when a leader id does not match or one of the
 *         combined futures failed; the future is completed exceptionally when the job cannot be
 *         added to the job leader id service or its leader id future cannot be obtained
 */
@RpcMethod
public Future<RegistrationResponse> registerJobManager(final UUID resourceManagerLeaderId, final UUID jobManagerLeaderId, final String jobManagerAddress, final JobID jobId) {
    checkNotNull(resourceManagerLeaderId);
    checkNotNull(jobManagerLeaderId);
    checkNotNull(jobManagerAddress);
    checkNotNull(jobId);
    // guard clause: requests addressed to a different resource manager leader are declined
    if (!isValid(resourceManagerLeaderId)) {
        log.debug("Discard register job manager message from {}, because the leader id " + "{} did not match the expected leader id {}.", jobManagerAddress, resourceManagerLeaderId, leaderSessionId);
        return FlinkCompletableFuture.<RegistrationResponse>completed(new RegistrationResponse.Decline("Resource manager leader id did not match."));
    }
    if (!jobLeaderIdService.containsJob(jobId)) {
        try {
            jobLeaderIdService.addJob(jobId);
        } catch (Exception e) {
            ResourceManagerException exception = new ResourceManagerException("Could not add the job " + jobId + " to the job id leader service.", e);
            onFatalErrorAsync(exception);
            log.error("Could not add job {} to job leader id service.", jobId, e);
            return FlinkCompletableFuture.completedExceptionally(exception);
        }
    }
    log.info("Registering job manager {}@{} for job {}.", jobManagerLeaderId, jobManagerAddress, jobId);
    Future<UUID> jobLeaderIdFuture;
    try {
        jobLeaderIdFuture = jobLeaderIdService.getLeaderId(jobId);
    } catch (Exception e) {
        // we cannot check the job leader id so let's fail
        // TODO: Maybe it's also ok to skip this check in case that we cannot check the leader id
        ResourceManagerException exception = new ResourceManagerException("Cannot obtain the " + "job leader id future to verify the correct job leader.", e);
        onFatalErrorAsync(exception);
        // pass the cause along so the log line shows why the lookup failed
        log.debug("Could not obtain the job leader id future to verify the correct job leader.", e);
        return FlinkCompletableFuture.completedExceptionally(exception);
    }
    Future<JobMasterGateway> jobMasterGatewayFuture = getRpcService().connect(jobManagerAddress, JobMasterGateway.class);
    Future<RegistrationResponse> registrationResponseFuture = jobMasterGatewayFuture.thenCombineAsync(jobLeaderIdFuture, new BiFunction<JobMasterGateway, UUID, RegistrationResponse>() {

        @Override
        public RegistrationResponse apply(JobMasterGateway jobMasterGateway, UUID jobLeaderId) {
            // the leader id is checked again because this callback runs after both futures completed
            if (!isValid(resourceManagerLeaderId)) {
                log.debug("The resource manager leader id changed {}. Discarding job " + "manager registration from {}.", getLeaderSessionId(), jobManagerAddress);
                return new RegistrationResponse.Decline("Resource manager leader id changed.");
            }
            if (!jobLeaderId.equals(jobManagerLeaderId)) {
                log.debug("The job manager leader id {} did not match the job " + "leader id {}.", jobManagerLeaderId, jobLeaderId);
                return new RegistrationResponse.Decline("Job manager leader id did not match.");
            }
            boolean alreadyRegistered = false;
            if (jobManagerRegistrations.containsKey(jobId)) {
                JobManagerRegistration oldJobManagerRegistration = jobManagerRegistrations.get(jobId);
                if (oldJobManagerRegistration.getLeaderID().equals(jobLeaderId)) {
                    // same registration
                    log.debug("Job manager {}@{} was already registered.", jobManagerLeaderId, jobManagerAddress);
                    alreadyRegistered = true;
                } else {
                    // tell old job manager that he is no longer the job leader
                    disconnectJobManager(oldJobManagerRegistration.getJobID(), new Exception("New job leader for job " + jobId + " found."));
                }
            }
            if (!alreadyRegistered) {
                // first registration for the job, or replacement of a superseded leader
                JobManagerRegistration jobManagerRegistration = new JobManagerRegistration(jobId, jobLeaderId, jobMasterGateway);
                jobManagerRegistrations.put(jobId, jobManagerRegistration);
            }
            log.info("Registered job manager {}@{} for job {}.", jobManagerLeaderId, jobManagerAddress, jobId);
            return new JobMasterRegistrationSuccess(resourceManagerConfiguration.getHeartbeatInterval().toMilliseconds(), getLeaderSessionId());
        }
    }, getMainThreadExecutor());
    // handle exceptions which might have occurred in one of the futures inputs of combine
    return registrationResponseFuture.handleAsync(new BiFunction<RegistrationResponse, Throwable, RegistrationResponse>() {

        @Override
        public RegistrationResponse apply(RegistrationResponse registrationResponse, Throwable throwable) {
            if (throwable == null) {
                return registrationResponse;
            }
            // the stack trace is only logged on debug level
            if (log.isDebugEnabled()) {
                log.debug("Registration of job manager {}@{} failed.", jobManagerLeaderId, jobManagerAddress, throwable);
            } else {
                log.info("Registration of job manager {}@{} failed.", jobManagerLeaderId, jobManagerAddress);
            }
            return new RegistrationResponse.Decline(throwable.getMessage());
        }
    }, getRpcService().getExecutor());
}
Also used : JobMasterRegistrationSuccess(org.apache.flink.runtime.jobmaster.JobMasterRegistrationSuccess) ResourceManagerException(org.apache.flink.runtime.resourcemanager.exceptions.ResourceManagerException) JobMasterGateway(org.apache.flink.runtime.jobmaster.JobMasterGateway) ResourceManagerException(org.apache.flink.runtime.resourcemanager.exceptions.ResourceManagerException) LeaderIdMismatchException(org.apache.flink.runtime.highavailability.LeaderIdMismatchException) JobManagerRegistration(org.apache.flink.runtime.resourcemanager.registration.JobManagerRegistration) UUID(java.util.UUID) RegistrationResponse(org.apache.flink.runtime.registration.RegistrationResponse) RpcMethod(org.apache.flink.runtime.rpc.RpcMethod)

Example 2 with RegistrationResponse

use of org.apache.flink.runtime.registration.RegistrationResponse in project flink by apache.

the class SlotProtocolTest method testSlotAvailableRequest.

/**
 * Tests whether
 * 1) a SlotRequest is routed to the SlotManager
 * 2) a SlotRequest is confirmed
 * 3) a SlotRequest leads to an allocation of a registered slot
 * 4) a SlotRequest is routed to the TaskExecutor
 *
 * @throws Exception if setting up or driving the resource manager fails
 */
@Test
public void testSlotAvailableRequest() throws Exception {
    final String rmAddress = "/rm1";
    final String jmAddress = "/jm1";
    final String tmAddress = "/tm1";
    final JobID jobID = new JobID();
    testRpcService.registerGateway(jmAddress, mock(JobMasterGateway.class));
    final TestingHighAvailabilityServices testingHaServices = new TestingHighAvailabilityServices();
    final UUID rmLeaderID = UUID.randomUUID();
    final UUID jmLeaderID = UUID.randomUUID();
    TestingLeaderElectionService rmLeaderElectionService = configureHA(testingHaServices, jobID, rmAddress, rmLeaderID, jmAddress, jmLeaderID);
    // the task executor answers slot requests with a future that is never completed by this test
    TaskExecutorGateway taskExecutorGateway = mock(TaskExecutorGateway.class);
    Mockito.when(taskExecutorGateway.requestSlot(any(SlotID.class), any(JobID.class), any(AllocationID.class), any(String.class), any(UUID.class), any(Time.class))).thenReturn(new FlinkCompletableFuture<TMSlotRequestReply>());
    testRpcService.registerGateway(tmAddress, taskExecutorGateway);
    ResourceManagerConfiguration resourceManagerConfiguration = new ResourceManagerConfiguration(Time.seconds(5L), Time.seconds(5L), Time.minutes(5L));
    JobLeaderIdService jobLeaderIdService = new JobLeaderIdService(testingHaServices, testRpcService.getScheduledExecutor(), resourceManagerConfiguration.getJobTimeout());
    TestingSlotManagerFactory slotManagerFactory = new TestingSlotManagerFactory();
    ResourceManager<ResourceID> resourceManager = Mockito.spy(new StandaloneResourceManager(testRpcService, resourceManagerConfiguration, testingHaServices, slotManagerFactory, mock(MetricRegistry.class), jobLeaderIdService, mock(FatalErrorHandler.class)));
    resourceManager.start();
    rmLeaderElectionService.isLeader(rmLeaderID);
    // NOTE(review): presumably waits for the leadership grant to be processed; a fixed sleep
    // is timing dependent - confirm whether a deterministic signal is available instead
    Thread.sleep(1000);
    Future<RegistrationResponse> registrationFuture = resourceManager.registerJobManager(rmLeaderID, jmLeaderID, jmAddress, jobID);
    try {
        registrationFuture.get(5L, TimeUnit.SECONDS);
    } catch (Exception e) {
        // keep the original cause attached so a failing run shows why the future did not complete
        throw new AssertionError("JobManager registration Future didn't become ready.", e);
    }
    final SlotManager slotManager = slotManagerFactory.slotManager;
    final ResourceID resourceID = ResourceID.generate();
    final AllocationID allocationID = new AllocationID();
    final ResourceProfile resourceProfile = new ResourceProfile(1.0, 100);
    final SlotID slotID = new SlotID(resourceID, 0);
    final SlotStatus slotStatus = new SlotStatus(slotID, resourceProfile);
    final SlotReport slotReport = new SlotReport(Collections.singletonList(slotStatus));
    // register slot at SlotManager
    slotManager.registerTaskExecutor(resourceID, new TaskExecutorRegistration(taskExecutorGateway), slotReport);
    SlotRequest slotRequest = new SlotRequest(jobID, allocationID, resourceProfile);
    RMSlotRequestReply slotRequestReply = resourceManager.requestSlot(jmLeaderID, rmLeaderID, slotRequest);
    // 1) a SlotRequest is routed to the SlotManager
    verify(slotManager).requestSlot(slotRequest);
    // 2) a SlotRequest is confirmed (expected value first, so failure messages are labelled correctly)
    Assert.assertEquals(allocationID, slotRequestReply.getAllocationID());
    // 3) a SlotRequest leads to an allocation of a registered slot
    Assert.assertTrue(slotManager.isAllocated(slotID));
    Assert.assertTrue(slotManager.isAllocated(allocationID));
    // 4) a SlotRequest is routed to the TaskExecutor
    verify(taskExecutorGateway, timeout(5000)).requestSlot(eq(slotID), eq(jobID), eq(allocationID), any(String.class), any(UUID.class), any(Time.class));
}
Also used : TMSlotRequestReply(org.apache.flink.runtime.resourcemanager.messages.taskexecutor.TMSlotRequestReply) TaskExecutorRegistration(org.apache.flink.runtime.resourcemanager.registration.TaskExecutorRegistration) JobLeaderIdService(org.apache.flink.runtime.resourcemanager.JobLeaderIdService) Time(org.apache.flink.api.common.time.Time) StandaloneResourceManager(org.apache.flink.runtime.resourcemanager.StandaloneResourceManager) JobMasterGateway(org.apache.flink.runtime.jobmaster.JobMasterGateway) SlotRequest(org.apache.flink.runtime.resourcemanager.SlotRequest) TestingHighAvailabilityServices(org.apache.flink.runtime.highavailability.TestingHighAvailabilityServices) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) UUID(java.util.UUID) RegistrationResponse(org.apache.flink.runtime.registration.RegistrationResponse) ResourceProfile(org.apache.flink.runtime.clusterframework.types.ResourceProfile) TestingLeaderElectionService(org.apache.flink.runtime.leaderelection.TestingLeaderElectionService) SlotStatus(org.apache.flink.runtime.taskexecutor.SlotStatus) AllocationID(org.apache.flink.runtime.clusterframework.types.AllocationID) SlotReport(org.apache.flink.runtime.taskexecutor.SlotReport) RMSlotRequestReply(org.apache.flink.runtime.resourcemanager.messages.jobmanager.RMSlotRequestReply) TaskExecutorGateway(org.apache.flink.runtime.taskexecutor.TaskExecutorGateway) ResourceManagerConfiguration(org.apache.flink.runtime.resourcemanager.ResourceManagerConfiguration) SlotID(org.apache.flink.runtime.clusterframework.types.SlotID) TestingSlotManager(org.apache.flink.runtime.resourcemanager.TestingSlotManager) JobID(org.apache.flink.api.common.JobID) Test(org.junit.Test)

Example 3 with RegistrationResponse

use of org.apache.flink.runtime.registration.RegistrationResponse in project flink by apache.

the class ResourceManagerJobMasterTest method testRegisterJobMasterWithUnmatchedLeaderSessionId1.

/**
 * Checks that a job master registration carrying a resource manager leader session id
 * different from the granted one is answered with a {@link RegistrationResponse.Decline}.
 */
@Test
public void testRegisterJobMasterWithUnmatchedLeaderSessionId1() throws Exception {
    final String address = "/jobMasterAddress1";
    final JobID jobID = mockJobMaster(address);
    final TestingLeaderElectionService rmLeaderElection = new TestingLeaderElectionService();
    final UUID jobMasterLeaderId = UUID.randomUUID();
    final TestingLeaderRetrievalService jobMasterLeaderRetrieval = new TestingLeaderRetrievalService(address, jobMasterLeaderId);
    final TestingFatalErrorHandler fatalErrorHandler = new TestingFatalErrorHandler();
    final ResourceManager resourceManager = createAndStartResourceManager(rmLeaderElection, jobID, jobMasterLeaderRetrieval, fatalErrorHandler);
    // leadership is granted for its effect only; the granted session id is deliberately not used
    grantResourceManagerLeadership(rmLeaderElection);

    // register with a freshly generated resource manager leader session id
    final UUID unrelatedSessionId = UUID.randomUUID();
    final Future<RegistrationResponse> declinedFuture = resourceManager.registerJobManager(unrelatedSessionId, jobMasterLeaderId, address, jobID);
    final RegistrationResponse response = declinedFuture.get(5, TimeUnit.SECONDS);
    assertTrue(response instanceof RegistrationResponse.Decline);

    // surface any error that was reported to the fatal error handler
    if (fatalErrorHandler.hasExceptionOccurred()) {
        fatalErrorHandler.rethrowError();
    }
}
Also used : TestingFatalErrorHandler(org.apache.flink.runtime.util.TestingFatalErrorHandler) TestingLeaderElectionService(org.apache.flink.runtime.leaderelection.TestingLeaderElectionService) TestingLeaderRetrievalService(org.apache.flink.runtime.leaderelection.TestingLeaderRetrievalService) UUID(java.util.UUID) RegistrationResponse(org.apache.flink.runtime.registration.RegistrationResponse) JobID(org.apache.flink.api.common.JobID) Test(org.junit.Test)

Example 4 with RegistrationResponse

use of org.apache.flink.runtime.registration.RegistrationResponse in project flink by apache.

the class ResourceManager method registerJobMaster.

// ------------------------------------------------------------------------
// RPC methods
// ------------------------------------------------------------------------
/**
 * Registers the JobMaster reachable at {@code jobManagerAddress} for the job {@code jobId}.
 *
 * <p>The job is added to the {@code jobLeaderIdService} unless it is already contained. A
 * connection to the JobMaster is then combined with the job's leader id future; when the
 * leading id equals {@code jobMasterId} the registration is delegated to
 * {@code registerJobMasterInternal} on the main thread executor.
 *
 * @param jobMasterId id of the JobMaster that wants to register
 * @param jobManagerResourceId resource id of the registering JobMaster
 * @param jobManagerAddress address used to connect to the JobMaster
 * @param jobId id of the job the JobMaster is responsible for
 * @param timeout RPC timeout; not referenced inside this method body
 * @return future completing with the result of {@code registerJobMasterInternal}, or with a
 *         {@link RegistrationResponse.Failure} when the leading JobMaster id differs or one of
 *         the combined futures failed; the future is completed exceptionally when the job cannot
 *         be added to the job leader id service or its leader id future cannot be obtained
 */
@Override
public CompletableFuture<RegistrationResponse> registerJobMaster(final JobMasterId jobMasterId, final ResourceID jobManagerResourceId, final String jobManagerAddress, final JobID jobId, final Time timeout) {
    checkNotNull(jobMasterId);
    checkNotNull(jobManagerResourceId);
    checkNotNull(jobManagerAddress);
    checkNotNull(jobId);
    if (!jobLeaderIdService.containsJob(jobId)) {
        try {
            jobLeaderIdService.addJob(jobId);
        } catch (Exception e) {
            ResourceManagerException exception = new ResourceManagerException("Could not add the job " + jobId + " to the job id leader service.", e);
            onFatalError(exception);
            log.error("Could not add job {} to job leader id service.", jobId, e);
            return FutureUtils.completedExceptionally(exception);
        }
    }
    log.info("Registering job manager {}@{} for job {}.", jobMasterId, jobManagerAddress, jobId);
    CompletableFuture<JobMasterId> jobMasterIdFuture;
    try {
        jobMasterIdFuture = jobLeaderIdService.getLeaderId(jobId);
    } catch (Exception e) {
        // we cannot check the job leader id so let's fail
        // TODO: Maybe it's also ok to skip this check in case that we cannot check the leader
        // id
        ResourceManagerException exception = new ResourceManagerException("Cannot obtain the " + "job leader id future to verify the correct job leader.", e);
        onFatalError(exception);
        // pass the cause along so the log line shows why the lookup failed
        log.debug("Could not obtain the job leader id future to verify the correct job leader.", e);
        return FutureUtils.completedExceptionally(exception);
    }
    CompletableFuture<JobMasterGateway> jobMasterGatewayFuture = getRpcService().connect(jobManagerAddress, jobMasterId, JobMasterGateway.class);
    CompletableFuture<RegistrationResponse> registrationResponseFuture = jobMasterGatewayFuture.thenCombineAsync(jobMasterIdFuture, (JobMasterGateway jobMasterGateway, JobMasterId leadingJobMasterId) -> {
        if (Objects.equals(leadingJobMasterId, jobMasterId)) {
            return registerJobMasterInternal(jobMasterGateway, jobId, jobManagerAddress, jobManagerResourceId);
        } else {
            final String declineMessage = String.format("The leading JobMaster id %s did not match the received JobMaster id %s. " + "This indicates that a JobMaster leader change has happened.", leadingJobMasterId, jobMasterId);
            log.debug(declineMessage);
            return new RegistrationResponse.Failure(new FlinkException(declineMessage));
        }
    }, getMainThreadExecutor());
    // handle exceptions which might have occurred in one of the futures inputs of combine
    return registrationResponseFuture.handleAsync((RegistrationResponse registrationResponse, Throwable throwable) -> {
        if (throwable != null) {
            // the stack trace is only logged on debug level
            if (log.isDebugEnabled()) {
                log.debug("Registration of job manager {}@{} failed.", jobMasterId, jobManagerAddress, throwable);
            } else {
                log.info("Registration of job manager {}@{} failed.", jobMasterId, jobManagerAddress);
            }
            return new RegistrationResponse.Failure(throwable);
        } else {
            return registrationResponse;
        }
    }, ioExecutor);
}
Also used : JobMasterId(org.apache.flink.runtime.jobmaster.JobMasterId) ResourceManagerException(org.apache.flink.runtime.resourcemanager.exceptions.ResourceManagerException) JobMasterGateway(org.apache.flink.runtime.jobmaster.JobMasterGateway) RegistrationResponse(org.apache.flink.runtime.registration.RegistrationResponse) TimeoutException(java.util.concurrent.TimeoutException) CompletionException(java.util.concurrent.CompletionException) FlinkException(org.apache.flink.util.FlinkException) ResourceManagerException(org.apache.flink.runtime.resourcemanager.exceptions.ResourceManagerException) UnknownTaskExecutorException(org.apache.flink.runtime.resourcemanager.exceptions.UnknownTaskExecutorException) FlinkException(org.apache.flink.util.FlinkException)

Example 5 with RegistrationResponse

use of org.apache.flink.runtime.registration.RegistrationResponse in project flink by apache.

the class JobMaster method registerTaskManager.

/**
 * Handles the registration attempt of a TaskManager for the job {@code jobId}.
 *
 * <p>The attempt is rejected when {@code jobId} differs from the id of this JobMaster's job
 * graph, and answered with a failure when the TaskManager location cannot be resolved. A
 * TaskManager that is already registered with the same session id is acknowledged without
 * further action; one registered with a different session id is disconnected first. Afterwards
 * a connection to the TaskManager is opened and, once established, the TaskManager is registered
 * with the slot pool service, stored in {@code registeredTaskManagers} and monitored as a
 * heartbeat target.
 *
 * @param jobId id of the job the TaskManager wants to register for
 * @param taskManagerRegistrationInformation location, session id and RPC address of the TaskManager
 * @param timeout RPC timeout; not referenced inside this method body
 * @return future holding the registration response
 */
@Override
public CompletableFuture<RegistrationResponse> registerTaskManager(final JobID jobId, final TaskManagerRegistrationInformation taskManagerRegistrationInformation, final Time timeout) {
    final boolean responsibleForJob = jobGraph.getJobID().equals(jobId);
    if (!responsibleForJob) {
        log.debug("Rejecting TaskManager registration attempt because of wrong job id {}.", jobId);
        final String rejectionReason = String.format("The JobManager is not responsible for job %s. Maybe the TaskManager used outdated connection information.", jobId);
        return CompletableFuture.completedFuture(new JMTMRegistrationRejection(rejectionReason));
    }

    final TaskManagerLocation location;
    try {
        location = resolveTaskManagerLocation(taskManagerRegistrationInformation.getUnresolvedTaskManagerLocation());
    } catch (FlinkException resolutionFailure) {
        log.error("Could not accept TaskManager registration.", resolutionFailure);
        return CompletableFuture.completedFuture(new RegistrationResponse.Failure(resolutionFailure));
    }

    final ResourceID taskManagerId = location.getResourceID();
    final UUID sessionId = taskManagerRegistrationInformation.getTaskManagerSession();

    final TaskManagerRegistration knownRegistration = registeredTaskManagers.get(taskManagerId);
    if (knownRegistration != null && knownRegistration.getSessionId().equals(sessionId)) {
        // same TaskManager, same session: nothing to do besides acknowledging again
        log.debug("Ignoring registration attempt of TaskManager {} with the same session id {}.", taskManagerId, sessionId);
        final RegistrationResponse acknowledgement = new JMTMRegistrationSuccess(resourceId);
        return CompletableFuture.completedFuture(acknowledgement);
    }
    if (knownRegistration != null) {
        // same TaskManager id but a different session: drop the stale connection first
        disconnectTaskManager(taskManagerId, new FlinkException("A registered TaskManager re-registered with a new session id. This indicates a restart of the TaskManager. Closing the old connection."));
    }

    final CompletableFuture<TaskExecutorGateway> gatewayFuture = getRpcService().connect(taskManagerRegistrationInformation.getTaskManagerRpcAddress(), TaskExecutorGateway.class);
    return gatewayFuture.handleAsync((TaskExecutorGateway gateway, Throwable connectFailure) -> {
        if (connectFailure == null) {
            slotPoolService.registerTaskManager(taskManagerId);
            registeredTaskManagers.put(taskManagerId, TaskManagerRegistration.create(location, gateway, sessionId));
            // monitor the task manager as heartbeat target
            taskManagerHeartbeatManager.monitorTarget(taskManagerId, new TaskExecutorHeartbeatSender(gateway));
            return new JMTMRegistrationSuccess(resourceId);
        } else {
            return new RegistrationResponse.Failure(connectFailure);
        }
    }, getMainThreadExecutor());
}
Also used : ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) TaskManagerLocation(org.apache.flink.runtime.taskmanager.TaskManagerLocation) UnresolvedTaskManagerLocation(org.apache.flink.runtime.taskmanager.UnresolvedTaskManagerLocation) TaskExecutorGateway(org.apache.flink.runtime.taskexecutor.TaskExecutorGateway) UUID(java.util.UUID) RegistrationResponse(org.apache.flink.runtime.registration.RegistrationResponse) FlinkException(org.apache.flink.util.FlinkException)

Aggregations

RegistrationResponse (org.apache.flink.runtime.registration.RegistrationResponse)40 Test (org.junit.Test)35 ResourceID (org.apache.flink.runtime.clusterframework.types.ResourceID)23 CompletableFuture (java.util.concurrent.CompletableFuture)18 UUID (java.util.UUID)14 JobID (org.apache.flink.api.common.JobID)14 ArrayList (java.util.ArrayList)12 FlinkException (org.apache.flink.util.FlinkException)11 Time (org.apache.flink.api.common.time.Time)10 AllocationID (org.apache.flink.runtime.clusterframework.types.AllocationID)10 ResourceProfile (org.apache.flink.runtime.clusterframework.types.ResourceProfile)10 TestingResourceManagerGateway (org.apache.flink.runtime.resourcemanager.utils.TestingResourceManagerGateway)9 LocalUnresolvedTaskManagerLocation (org.apache.flink.runtime.taskmanager.LocalUnresolvedTaskManagerLocation)9 ExecutionException (java.util.concurrent.ExecutionException)8 TestingHighAvailabilityServices (org.apache.flink.runtime.highavailability.TestingHighAvailabilityServices)8 UnresolvedTaskManagerLocation (org.apache.flink.runtime.taskmanager.UnresolvedTaskManagerLocation)8 TestingFatalErrorHandler (org.apache.flink.runtime.util.TestingFatalErrorHandler)8 ArrayBlockingQueue (java.util.concurrent.ArrayBlockingQueue)7 TimeoutException (java.util.concurrent.TimeoutException)7 Configuration (org.apache.flink.configuration.Configuration)7