use of org.apache.flink.runtime.jobmaster.JobMasterGateway in project flink by apache.
the class TaskExecutor method offerSlotsToJobManager.
// ------------------------------------------------------------------------
// Internal job manager connection methods
// ------------------------------------------------------------------------
/**
 * Offers every slot that is currently allocated for the given job to the job's leader.
 *
 * <p>Each allocated slot is first marked active in the task slot table. If that is not possible,
 * the job master is told via {@code failSlot} that the slot failed. The collected offers are then
 * sent to the job master. Once the response arrives (and the job manager connection is still the
 * one the offer was sent to), every offered slot that was not accepted is freed. If the offer
 * times out it is retried; on any other failure all offered slots are freed.
 *
 * @param jobId id of the job whose allocated slots are offered to its job manager
 */
private void offerSlotsToJobManager(final JobID jobId) {
    final JobManagerConnection jobManagerConnection = jobManagerTable.get(jobId);

    if (jobManagerConnection == null) {
        log.debug("There is no job manager connection to the leader of job {}.", jobId);
        return;
    }

    if (!taskSlotTable.hasAllocatedSlots(jobId)) {
        log.debug("There are no unassigned slots for the job {}.", jobId);
        return;
    }

    log.info("Offer reserved slots to the leader of job {}.", jobId);

    final JobMasterGateway jobMasterGateway = jobManagerConnection.getJobManagerGateway();
    final Iterator<TaskSlot> reservedSlotsIterator = taskSlotTable.getAllocatedSlots(jobId);
    final UUID leaderId = jobManagerConnection.getLeaderId();
    final Collection<SlotOffer> reservedSlots = new HashSet<>(2);

    while (reservedSlotsIterator.hasNext()) {
        final SlotOffer offer = reservedSlotsIterator.next().generateSlotOffer();

        try {
            if (!taskSlotTable.markSlotActive(offer.getAllocationId())) {
                // the slot is either free or releasing at the moment
                final String message = "Could not mark slot " + offer.getAllocationId() + " of job " + jobId + " active.";
                log.debug(message);
                jobMasterGateway.failSlot(getResourceID(), offer.getAllocationId(), leaderId, new Exception(message));
                // NOTE(review): the slot is still offered below although it could not be
                // activated; this mirrors the existing behavior - confirm that it is intended.
            }
        } catch (SlotNotFoundException e) {
            // the slot table does not know the slot: report the failure (keeping the cause)
            // and do not offer it
            final String message = "Could not mark slot " + offer.getAllocationId() + " of job " + jobId + " active.";
            log.debug(message, e);
            jobMasterGateway.failSlot(getResourceID(), offer.getAllocationId(), leaderId, new Exception(message, e));
            continue;
        }

        reservedSlots.add(offer);
    }

    Future<Iterable<SlotOffer>> acceptedSlotsFuture = jobMasterGateway.offerSlots(getResourceID(), reservedSlots, leaderId, taskManagerConfiguration.getTimeout());

    acceptedSlotsFuture.thenAcceptAsync(new AcceptFunction<Iterable<SlotOffer>>() {
        @Override
        public void accept(Iterable<SlotOffer> acceptedSlots) {
            // check if the response is still valid
            if (isJobManagerConnectionValid(jobId, leaderId)) {
                // accepted offers need no further handling; drop them so that only the
                // rejected offers remain in the set
                for (SlotOffer acceptedSlot : acceptedSlots) {
                    reservedSlots.remove(acceptedSlot);
                }

                final Exception e = new Exception("The slot was rejected by the JobManager.");

                for (SlotOffer rejectedSlot : reservedSlots) {
                    freeSlot(rejectedSlot.getAllocationId(), e);
                }
            } else {
                // discard the response since there is a new leader for the job
                log.debug("Discard offer slot response since there is a new leader " + "for the job {}.", jobId);
            }
        }
    }, getMainThreadExecutor());

    acceptedSlotsFuture.exceptionallyAsync(new ApplyFunction<Throwable, Void>() {
        @Override
        public Void apply(Throwable throwable) {
            if (throwable instanceof TimeoutException) {
                // We ran into a timeout. Try again.
                offerSlotsToJobManager(jobId);
            } else {
                // We encountered an exception. Free the slots and return them to the RM.
                for (SlotOffer reservedSlot : reservedSlots) {
                    freeSlot(reservedSlot.getAllocationId(), throwable);
                }
            }

            return null;
        }
    }, getMainThreadExecutor());
}
use of org.apache.flink.runtime.jobmaster.JobMasterGateway in project flink by apache.
the class ResourceManager method registerJobManager.
// ------------------------------------------------------------------------
// RPC methods
// ------------------------------------------------------------------------
/**
 * Registers the job manager at the given address as the leader of the given job.
 *
 * <p>The request is declined if the supplied resource manager leader id is not valid (checked
 * on entry and again when the registration is completed), or if the supplied job manager leader
 * id differs from the leader id reported by the job leader id service. If a different job
 * manager was registered for the job before, it is disconnected first. Failures of the gateway
 * connection or the leader id lookup are turned into a decline response.
 *
 * @param resourceManagerLeaderId leader id the caller expects this resource manager to have
 * @param jobManagerLeaderId leader id of the registering job manager
 * @param jobManagerAddress address used to connect to the registering job manager
 * @param jobId id of the job the job manager is responsible for
 * @return future with the registration response; completed exceptionally if the job could not
 *         be added to the job leader id service or its leader id future could not be obtained
 */
@RpcMethod
public Future<RegistrationResponse> registerJobManager(final UUID resourceManagerLeaderId, final UUID jobManagerLeaderId, final String jobManagerAddress, final JobID jobId) {
    checkNotNull(resourceManagerLeaderId);
    checkNotNull(jobManagerLeaderId);
    checkNotNull(jobManagerAddress);
    checkNotNull(jobId);

    if (!isValid(resourceManagerLeaderId)) {
        log.debug("Discard register job manager message from {}, because the leader id " + "{} did not match the expected leader id {}.", jobManagerAddress, resourceManagerLeaderId, leaderSessionId);
        return FlinkCompletableFuture.<RegistrationResponse>completed(new RegistrationResponse.Decline("Resource manager leader id did not match."));
    }

    if (!jobLeaderIdService.containsJob(jobId)) {
        try {
            jobLeaderIdService.addJob(jobId);
        } catch (Exception e) {
            ResourceManagerException exception = new ResourceManagerException("Could not add the job " + jobId + " to the job id leader service.", e);
            onFatalErrorAsync(exception);
            log.error("Could not add job {} to job leader id service.", jobId, e);
            return FlinkCompletableFuture.completedExceptionally(exception);
        }
    }

    log.info("Registering job manager {}@{} for job {}.", jobManagerLeaderId, jobManagerAddress, jobId);

    Future<UUID> jobLeaderIdFuture;

    try {
        jobLeaderIdFuture = jobLeaderIdService.getLeaderId(jobId);
    } catch (Exception e) {
        // we cannot check the job leader id so let's fail
        // TODO: Maybe it's also ok to skip this check in case that we cannot check the leader id
        ResourceManagerException exception = new ResourceManagerException("Cannot obtain the " + "job leader id future to verify the correct job leader.", e);
        onFatalErrorAsync(exception);
        // this is reported as a fatal error, so log it like the addJob failure above:
        // at error level and together with the cause
        log.error("Could not obtain the job leader id future to verify the correct job leader.", e);
        return FlinkCompletableFuture.completedExceptionally(exception);
    }

    Future<JobMasterGateway> jobMasterGatewayFuture = getRpcService().connect(jobManagerAddress, JobMasterGateway.class);

    Future<RegistrationResponse> registrationResponseFuture = jobMasterGatewayFuture.thenCombineAsync(jobLeaderIdFuture, new BiFunction<JobMasterGateway, UUID, RegistrationResponse>() {
        @Override
        public RegistrationResponse apply(JobMasterGateway jobMasterGateway, UUID jobLeaderId) {
            // re-check the leader id, it is checked again once both futures have completed
            if (!isValid(resourceManagerLeaderId)) {
                log.debug("The resource manager leader id changed {}. Discarding job " + "manager registration from {}.", getLeaderSessionId(), jobManagerAddress);
                return new RegistrationResponse.Decline("Resource manager leader id changed.");
            }

            if (!jobLeaderId.equals(jobManagerLeaderId)) {
                log.debug("The job manager leader id {} did not match the job " + "leader id {}.", jobManagerLeaderId, jobLeaderId);
                return new RegistrationResponse.Decline("Job manager leader id did not match.");
            }

            final JobManagerRegistration oldJobManagerRegistration = jobManagerRegistrations.get(jobId);

            if (oldJobManagerRegistration != null && oldJobManagerRegistration.getLeaderID().equals(jobLeaderId)) {
                // same registration
                log.debug("Job manager {}@{} was already registered.", jobManagerLeaderId, jobManagerAddress);
            } else {
                if (oldJobManagerRegistration != null) {
                    // tell old job manager that he is no longer the job leader
                    disconnectJobManager(oldJobManagerRegistration.getJobID(), new Exception("New job leader for job " + jobId + " found."));
                }

                // store the (new) registration for the job
                JobManagerRegistration jobManagerRegistration = new JobManagerRegistration(jobId, jobLeaderId, jobMasterGateway);
                jobManagerRegistrations.put(jobId, jobManagerRegistration);
            }

            log.info("Registered job manager {}@{} for job {}.", jobManagerLeaderId, jobManagerAddress, jobId);

            return new JobMasterRegistrationSuccess(resourceManagerConfiguration.getHeartbeatInterval().toMilliseconds(), getLeaderSessionId());
        }
    }, getMainThreadExecutor());

    // handle exceptions which might have occurred in one of the futures inputs of combine
    return registrationResponseFuture.handleAsync(new BiFunction<RegistrationResponse, Throwable, RegistrationResponse>() {
        @Override
        public RegistrationResponse apply(RegistrationResponse registrationResponse, Throwable throwable) {
            if (throwable == null) {
                return registrationResponse;
            }

            // include the stack trace only when debug logging is enabled
            if (log.isDebugEnabled()) {
                log.debug("Registration of job manager {}@{} failed.", jobManagerLeaderId, jobManagerAddress, throwable);
            } else {
                log.info("Registration of job manager {}@{} failed.", jobManagerLeaderId, jobManagerAddress);
            }

            return new RegistrationResponse.Decline(throwable.getMessage());
        }
    }, getRpcService().getExecutor());
}
use of org.apache.flink.runtime.jobmaster.JobMasterGateway in project flink by apache.
the class TaskExecutorTest method testJobLeaderDetection.
/**
* Tests that a TaskManager detects the job leader of a job for which it has reserved slots.
* Upon detecting the job leader, it is expected to offer all reserved slots to the JobManager.
*
* <p>The test requests one slot for the job, then announces the job leader and verifies that
* {@code offerSlots} was invoked on the mocked {@link JobMasterGateway} with the matching
* slot offer and leader id.
*/
@Test
public void testJobLeaderDetection() throws Exception {
// --- basic services and TaskExecutor components ---
final JobID jobId = new JobID();
final TestingSerialRpcService rpc = new TestingSerialRpcService();
final Configuration configuration = new Configuration();
final TaskManagerConfiguration taskManagerConfiguration = TaskManagerConfiguration.fromConfiguration(configuration);
final ResourceID resourceId = new ResourceID("foobar");
final TaskManagerLocation taskManagerLocation = new TaskManagerLocation(resourceId, InetAddress.getLoopbackAddress(), 1234);
final TestingHighAvailabilityServices haServices = new TestingHighAvailabilityServices();
final TimerService<AllocationID> timerService = mock(TimerService.class);
// slot table built from a single mocked resource profile (presumably one slot - confirm)
final TaskSlotTable taskSlotTable = new TaskSlotTable(Arrays.asList(mock(ResourceProfile.class)), timerService);
final JobManagerTable jobManagerTable = new JobManagerTable();
final JobLeaderService jobLeaderService = new JobLeaderService(taskManagerLocation);
final TestingFatalErrorHandler testingFatalErrorHandler = new TestingFatalErrorHandler();
// leader retrieval services are created without a leader; the leaders are announced
// explicitly further below via notifyListener
final TestingLeaderRetrievalService resourceManagerLeaderRetrievalService = new TestingLeaderRetrievalService();
final TestingLeaderRetrievalService jobManagerLeaderRetrievalService = new TestingLeaderRetrievalService();
haServices.setResourceManagerLeaderRetriever(resourceManagerLeaderRetrievalService);
haServices.setJobMasterLeaderRetriever(jobId, jobManagerLeaderRetrievalService);
// --- mocked resource manager which answers the registration with a success response ---
final String resourceManagerAddress = "rm";
final UUID resourceManagerLeaderId = UUID.randomUUID();
final ResourceManagerGateway resourceManagerGateway = mock(ResourceManagerGateway.class);
final InstanceID registrationId = new InstanceID();
when(resourceManagerGateway.registerTaskExecutor(eq(resourceManagerLeaderId), any(String.class), eq(resourceId), any(SlotReport.class), any(Time.class))).thenReturn(FlinkCompletableFuture.<RegistrationResponse>completed(new TaskExecutorRegistrationSuccess(registrationId, 1000L)));
// --- mocked job master which answers the registration with a success response ---
final String jobManagerAddress = "jm";
final UUID jobManagerLeaderId = UUID.randomUUID();
final ResourceID jmResourceId = new ResourceID(jobManagerAddress);
final int blobPort = 42;
final JobMasterGateway jobMasterGateway = mock(JobMasterGateway.class);
when(jobMasterGateway.registerTaskManager(any(String.class), eq(taskManagerLocation), eq(jobManagerLeaderId), any(Time.class))).thenReturn(FlinkCompletableFuture.<RegistrationResponse>completed(new JMTMRegistrationSuccess(jmResourceId, blobPort)));
when(jobMasterGateway.getHostname()).thenReturn(jobManagerAddress);
// make both mocked gateways reachable under their addresses
rpc.registerGateway(resourceManagerAddress, resourceManagerGateway);
rpc.registerGateway(jobManagerAddress, jobMasterGateway);
// the slot to request and the offer expected to be sent for it
final AllocationID allocationId = new AllocationID();
final SlotID slotId = new SlotID(resourceId, 0);
final SlotOffer slotOffer = new SlotOffer(allocationId, 0, ResourceProfile.UNKNOWN);
try {
TaskExecutor taskManager = new TaskExecutor(taskManagerConfiguration, taskManagerLocation, rpc, mock(MemoryManager.class), mock(IOManager.class), mock(NetworkEnvironment.class), haServices, mock(HeartbeatServices.class, RETURNS_MOCKS), mock(MetricRegistry.class), mock(TaskManagerMetricGroup.class), mock(BroadcastVariableManager.class), mock(FileCache.class), taskSlotTable, jobManagerTable, jobLeaderService, testingFatalErrorHandler);
taskManager.start();
// tell the task manager about the rm leader
resourceManagerLeaderRetrievalService.notifyListener(resourceManagerAddress, resourceManagerLeaderId);
// request slots from the task manager under the given allocation id
TMSlotRequestReply reply = taskManager.requestSlot(slotId, jobId, allocationId, jobManagerAddress, resourceManagerLeaderId);
// the slot request is expected to be acknowledged as registered
assertTrue(reply instanceof TMSlotRequestRegistered);
// now inform the task manager about the new job leader
jobManagerLeaderRetrievalService.notifyListener(jobManagerAddress, jobManagerLeaderId);
// the job leader should get the allocation id offered
verify(jobMasterGateway).offerSlots(any(ResourceID.class), (Iterable<SlotOffer>) Matchers.argThat(contains(slotOffer)), eq(jobManagerLeaderId), any(Time.class));
// check if a concurrent error occurred
testingFatalErrorHandler.rethrowError();
} finally {
// always stop the rpc service, also when an assertion failed
rpc.stopService();
}
}
use of org.apache.flink.runtime.jobmaster.JobMasterGateway in project flink by apache.
the class TaskExecutorTest method testSubmitTaskBeforeAcceptSlot.
/**
* Tests that the task executor can handle a task submission which arrives before the
* response to its slot offer.
*
* <p>Two slots are allocated for the job and offered to the mocked job master, whose
* {@code offerSlots} answer is held back in an uncompleted future. A task is submitted for the
* first allocation, and only afterwards the future is completed with the first offer alone.
* The test then checks that the first slot is active, while the second slot is free and was
* reported as available to the resource manager.
*/
@Test
public void testSubmitTaskBeforeAcceptSlot() throws Exception {
// --- basic services and TaskExecutor components ---
final JobID jobId = new JobID();
final TestingSerialRpcService rpc = new TestingSerialRpcService();
final Configuration configuration = new Configuration();
final TaskManagerConfiguration taskManagerConfiguration = TaskManagerConfiguration.fromConfiguration(configuration);
final ResourceID resourceId = new ResourceID("foobar");
final TaskManagerLocation taskManagerLocation = new TaskManagerLocation(resourceId, InetAddress.getLoopbackAddress(), 1234);
final TestingHighAvailabilityServices haServices = new TestingHighAvailabilityServices();
final TimerService<AllocationID> timerService = mock(TimerService.class);
// slot table built from two mocked resource profiles; slot indexes 0 and 1 are used below
final TaskSlotTable taskSlotTable = new TaskSlotTable(Arrays.asList(mock(ResourceProfile.class), mock(ResourceProfile.class)), timerService);
final JobManagerTable jobManagerTable = new JobManagerTable();
final JobLeaderService jobLeaderService = new JobLeaderService(taskManagerLocation);
final TestingFatalErrorHandler testingFatalErrorHandler = new TestingFatalErrorHandler();
final String resourceManagerAddress = "rm";
final UUID resourceManagerLeaderId = UUID.randomUUID();
final String jobManagerAddress = "jm";
final UUID jobManagerLeaderId = UUID.randomUUID();
// leader retrieval services are created with the leader address and id already set
final LeaderRetrievalService resourceManagerLeaderRetrievalService = new TestingLeaderRetrievalService(resourceManagerAddress, resourceManagerLeaderId);
final LeaderRetrievalService jobManagerLeaderRetrievalService = new TestingLeaderRetrievalService(jobManagerAddress, jobManagerLeaderId);
haServices.setResourceManagerLeaderRetriever(resourceManagerLeaderRetrievalService);
haServices.setJobMasterLeaderRetriever(jobId, jobManagerLeaderRetrievalService);
// --- mocked resource manager which answers the registration with a success response ---
final ResourceManagerGateway resourceManagerGateway = mock(ResourceManagerGateway.class);
final InstanceID registrationId = new InstanceID();
when(resourceManagerGateway.registerTaskExecutor(eq(resourceManagerLeaderId), any(String.class), eq(resourceId), any(SlotReport.class), any(Time.class))).thenReturn(FlinkCompletableFuture.<RegistrationResponse>completed(new TaskExecutorRegistrationSuccess(registrationId, 1000L)));
final ResourceID jmResourceId = new ResourceID(jobManagerAddress);
final int blobPort = 42;
// two allocations; only the offer for the first one is accepted at the end of the test
final AllocationID allocationId1 = new AllocationID();
final AllocationID allocationId2 = new AllocationID();
final SlotOffer offer1 = new SlotOffer(allocationId1, 0, ResourceProfile.UNKNOWN);
// --- mocked job master which answers the registration with a success response ---
final JobMasterGateway jobMasterGateway = mock(JobMasterGateway.class);
when(jobMasterGateway.registerTaskManager(any(String.class), eq(taskManagerLocation), eq(jobManagerLeaderId), any(Time.class))).thenReturn(FlinkCompletableFuture.<RegistrationResponse>completed(new JMTMRegistrationSuccess(jmResourceId, blobPort)));
when(jobMasterGateway.getHostname()).thenReturn(jobManagerAddress);
// make both mocked gateways reachable under their addresses
rpc.registerGateway(resourceManagerAddress, resourceManagerGateway);
rpc.registerGateway(jobManagerAddress, jobMasterGateway);
// library cache manager handing out the test's own class loader for the job
final LibraryCacheManager libraryCacheManager = mock(LibraryCacheManager.class);
when(libraryCacheManager.getClassLoader(eq(jobId))).thenReturn(getClass().getClassLoader());
// put a job manager connection for the job into the table before the TaskExecutor is created
final JobManagerConnection jobManagerConnection = new JobManagerConnection(jobId, jmResourceId, jobMasterGateway, jobManagerLeaderId, mock(TaskManagerActions.class), mock(CheckpointResponder.class), libraryCacheManager, mock(ResultPartitionConsumableNotifier.class), mock(PartitionProducerStateChecker.class));
jobManagerTable.put(jobId, jobManagerConnection);
try {
final TaskExecutor taskManager = new TaskExecutor(taskManagerConfiguration, taskManagerLocation, rpc, mock(MemoryManager.class), mock(IOManager.class), mock(NetworkEnvironment.class), haServices, mock(HeartbeatServices.class, RETURNS_MOCKS), mock(MetricRegistry.class), mock(TaskManagerMetricGroup.class), mock(BroadcastVariableManager.class), mock(FileCache.class), taskSlotTable, jobManagerTable, jobLeaderService, testingFatalErrorHandler);
taskManager.start();
// allocate both slots for the job directly in the slot table
taskSlotTable.allocateSlot(0, jobId, allocationId1, Time.milliseconds(10000L));
taskSlotTable.allocateSlot(1, jobId, allocationId2, Time.milliseconds(10000L));
// build the deployment descriptor of a task that targets the first allocation
final JobVertexID jobVertexId = new JobVertexID();
JobInformation jobInformation = new JobInformation(jobId, name.getMethodName(), new SerializedValue<>(new ExecutionConfig()), new Configuration(), Collections.<BlobKey>emptyList(), Collections.<URL>emptyList());
TaskInformation taskInformation = new TaskInformation(jobVertexId, "test task", 1, 1, TestInvokable.class.getName(), new Configuration());
SerializedValue<JobInformation> serializedJobInformation = new SerializedValue<>(jobInformation);
SerializedValue<TaskInformation> serializedJobVertexInformation = new SerializedValue<>(taskInformation);
final TaskDeploymentDescriptor tdd = new TaskDeploymentDescriptor(serializedJobInformation, serializedJobVertexInformation, new ExecutionAttemptID(), allocationId1, 0, 0, 0, null, Collections.<ResultPartitionDeploymentDescriptor>emptyList(), Collections.<InputGateDeploymentDescriptor>emptyList());
// future which stays uncompleted until the test explicitly acknowledges the offer
CompletableFuture<Iterable<SlotOffer>> offerResultFuture = new FlinkCompletableFuture<>();
// submit task first and then return acceptance response
when(jobMasterGateway.offerSlots(any(ResourceID.class), any(Iterable.class), eq(jobManagerLeaderId), any(Time.class))).thenReturn(offerResultFuture);
// we have to add the job after the TaskExecutor, because otherwise the service has not
// been properly started. This will also offer the slots to the job master
jobLeaderService.addJob(jobId, jobManagerAddress);
verify(jobMasterGateway).offerSlots(any(ResourceID.class), any(Iterable.class), eq(jobManagerLeaderId), any(Time.class));
// submit the task before the offered slots have been acknowledged
taskManager.submitTask(tdd, jobManagerLeaderId);
// acknowledge the offered slots; only the offer for the first allocation is accepted
offerResultFuture.complete(Collections.singleton(offer1));
// the slot with index 1 was not accepted, so it is expected to be reported as available
verify(resourceManagerGateway).notifySlotAvailable(eq(resourceManagerLeaderId), eq(registrationId), eq(new SlotID(resourceId, 1)));
// the accepted slot is active, the other allocation is gone and its slot is free again
assertTrue(taskSlotTable.existsActiveSlot(jobId, allocationId1));
assertFalse(taskSlotTable.existsActiveSlot(jobId, allocationId2));
assertTrue(taskSlotTable.isSlotFree(1));
// check if a concurrent error occurred
testingFatalErrorHandler.rethrowError();
} finally {
// always stop the rpc service, also when an assertion failed
rpc.stopService();
}
}
use of org.apache.flink.runtime.jobmaster.JobMasterGateway in project flink by apache.
the class ResourceManager method disconnectJobManager.
/**
 * Removes the job manager registration of the given job and, if one existed, notifies the
 * registered job manager that it has been disconnected from this resource manager.
 *
 * @param jobId identifying the job whose leader shall be disconnected
 * @param cause reason for the disconnect, forwarded to the job manager
 */
protected void disconnectJobManager(JobID jobId, Exception cause) {
    final JobManagerRegistration registration = jobManagerRegistrations.remove(jobId);

    if (registration == null) {
        log.debug("There was no registered job manager for job {}.", jobId);
        return;
    }

    final JobMasterGateway gateway = registration.getJobManagerGateway();

    log.info("Disconnect job manager {}@{} for job {} from the resource manager.", registration.getLeaderID(), gateway.getAddress(), jobId);

    // let the job manager know that the resource manager dropped the connection
    gateway.disconnectResourceManager(registration.getLeaderID(), getLeaderSessionId(), cause);
}
Aggregations