use of org.apache.flink.runtime.clusterframework.types.ResourceID in project flink by apache.
the class ResourceManager method notifySlotAvailable.
/**
 * Handles the notification from a TaskExecutor that one of its slots has become available.
 *
 * <p>The notification is forwarded to the slot manager only if all of the following hold:
 * the given leader id equals the current leader session id, a registration exists for the
 * resource id of the slot, and the instance id of that registration equals the given
 * instance id. In every other case the message is discarded and the reason is logged at
 * debug level.
 *
 * @param resourceManagerLeaderId TaskExecutor's resource manager leader id
 * @param instanceID TaskExecutor's instance id
 * @param slotId The slot id of the available slot
 */
@RpcMethod
public void notifySlotAvailable(final UUID resourceManagerLeaderId, final InstanceID instanceID, final SlotID slotId) {
    // Messages addressed to a different leader session are dropped.
    if (!resourceManagerLeaderId.equals(leaderSessionId)) {
        log.debug("Discarding notify slot available message for slot {}, because the leader id {} did not match the expected leader id {}.", slotId, resourceManagerLeaderId, leaderSessionId);
        return;
    }

    final ResourceID resourceId = slotId.getResourceID();
    final WorkerRegistration<WorkerType> registration = taskExecutors.get(resourceId);

    // No registration is known for the resource that owns the slot.
    if (registration == null) {
        // The space before "message" was missing in the original concatenation.
        log.debug("Could not find registration for resource id {}. Discarding the slot available message {}.", resourceId, slotId);
        return;
    }

    // The sender's instance id must match the one stored with the registration.
    if (!registration.getInstanceID().equals(instanceID)) {
        log.debug("Invalid registration id for slot available message. This indicates an outdated request.");
        return;
    }

    slotManager.notifySlotAvailable(resourceId, slotId);
}
use of org.apache.flink.runtime.clusterframework.types.ResourceID in project flink by apache.
the class SlotManager method addFreeSlot.
/**
 * Records the given slot as registered for its resource and places it directly into the
 * free pool. This will not trigger pending requests allocation.
 *
 * @param slot The resource slot
 */
@VisibleForTesting
void addFreeSlot(final ResourceSlot slot) {
    final ResourceID owner = slot.getResourceID();
    final SlotID freeSlotId = slot.getSlotId();

    if (registeredSlots.containsKey(owner)) {
        // the resource already has a slot map: just add the slot to it
        registeredSlots.get(owner).put(slot.getSlotId(), slot);
    } else {
        // first slot seen for this resource: start its slot map with this slot
        final HashMap<SlotID, ResourceSlot> slotsOfOwner = new HashMap<SlotID, ResourceSlot>();
        slotsOfOwner.put(slot.getSlotId(), slot);
        registeredSlots.put(owner, slotsOfOwner);
    }

    freeSlots.put(freeSlotId, slot);
}
use of org.apache.flink.runtime.clusterframework.types.ResourceID in project flink by apache.
the class SlotManager method registerNewSlot.
/**
 * Adds the given slot to the registered slots, keyed by the resource id taken from the
 * slot's id and then by the slot id itself.
 *
 * @param slot The ResourceSlot which will be registered
 */
private void registerNewSlot(final ResourceSlot slot) {
    final SlotID newSlotId = slot.getSlotId();
    final ResourceID owner = newSlotId.getResourceID();

    if (registeredSlots.containsKey(owner)) {
        // the resource already has a slot map: just add the slot to it
        registeredSlots.get(owner).put(newSlotId, slot);
    } else {
        // first slot seen for this resource: start its slot map with this slot
        final HashMap<SlotID, ResourceSlot> slotsOfOwner = new HashMap<SlotID, ResourceSlot>();
        slotsOfOwner.put(newSlotId, slot);
        registeredSlots.put(owner, slotsOfOwner);
    }
}
use of org.apache.flink.runtime.clusterframework.types.ResourceID in project flink by apache.
the class FlinkResourceManager method jobManagerLeaderConnected.
/**
 * Callback invoked when a new leading JobManager has been announced.
 *
 * <p>If no JobManager is currently associated, the new leader is adopted, the workers it
 * reports are consolidated into the local bookkeeping, and the worker pool check is
 * triggered. If a JobManager is still associated, a fatal error is raised instead.
 *
 * @param newJobManagerLeader The ActorRef of the new jobManager
 * @param workers The existing workers the JobManager has registered.
 */
private void jobManagerLeaderConnected(ActorRef newJobManagerLeader, Collection<ResourceID> workers) {
    // a still-associated leader must be disassociated before a new one is accepted
    if (jobManager != null) {
        String msg = "Attempting to associate with new JobManager leader " + newJobManagerLeader + " without previously disassociating from current leader " + jobManager;
        fatalError(msg, new Exception(msg));
        return;
    }

    LOG.info("Resource Manager associating with leading JobManager {} - leader session {}", newJobManagerLeader, leaderSessionID);
    jobManager = newJobManagerLeader;

    if (!workers.isEmpty()) {
        LOG.info("Received TaskManagers that were registered at the leader JobManager. Trying to consolidate.");

        // workers that have not been taken over into the bookkeeping yet
        final Set<ResourceID> unhandled = new HashSet<>(workers.size());
        unhandled.addAll(workers);

        try {
            // let the framework decide which of the reported workers are kept for now
            final Collection<WorkerType> reaccepted = reacceptRegisteredWorkers(workers);
            LOG.info("Consolidated {} TaskManagers", reaccepted.size());

            // move every kept worker into the bookkeeping and mark it as handled
            for (WorkerType reacceptedWorker : reaccepted) {
                final ResourceID workerId = reacceptedWorker.getResourceID();
                startedWorkers.put(workerId, reacceptedWorker);
                unhandled.remove(workerId);
            }
        } catch (Throwable t) {
            LOG.error("Error during consolidation of known TaskManagers", t);

            // on failure, hand every worker that is still unhandled back for release
            for (ResourceID unhandledId : unhandled) {
                releasePendingWorker(unhandledId);
            }
        }
    }

    // trigger initial check for requesting new workers
    checkWorkersPool();
}
use of org.apache.flink.runtime.clusterframework.types.ResourceID in project flink by apache.
the class TaskExecutorTest method testJobLeaderDetection.
/**
* Tests that a TaskManager detects a job leader for which it has reserved slots. Upon detecting
* the job leader, it will offer all reserved slots to the JobManager.
*
* <p>Flow of the test: start a TaskExecutor against mocked gateways, announce the resource
* manager leader, request a slot for the job, announce the job manager leader, and finally
* verify that the slot offer for the requested allocation reaches the job master gateway.
*/
@Test
public void testJobLeaderDetection() throws Exception {
// --- basic fixtures: job id, rpc service, configuration and task manager location ---
final JobID jobId = new JobID();
final TestingSerialRpcService rpc = new TestingSerialRpcService();
final Configuration configuration = new Configuration();
final TaskManagerConfiguration taskManagerConfiguration = TaskManagerConfiguration.fromConfiguration(configuration);
final ResourceID resourceId = new ResourceID("foobar");
final TaskManagerLocation taskManagerLocation = new TaskManagerLocation(resourceId, InetAddress.getLoopbackAddress(), 1234);
final TestingHighAvailabilityServices haServices = new TestingHighAvailabilityServices();
final TimerService<AllocationID> timerService = mock(TimerService.class);
// slot table built from a single (mocked) resource profile
final TaskSlotTable taskSlotTable = new TaskSlotTable(Arrays.asList(mock(ResourceProfile.class)), timerService);
final JobManagerTable jobManagerTable = new JobManagerTable();
final JobLeaderService jobLeaderService = new JobLeaderService(taskManagerLocation);
final TestingFatalErrorHandler testingFatalErrorHandler = new TestingFatalErrorHandler();
// --- leader retrieval services the test uses to announce the RM and JM leaders ---
final TestingLeaderRetrievalService resourceManagerLeaderRetrievalService = new TestingLeaderRetrievalService();
final TestingLeaderRetrievalService jobManagerLeaderRetrievalService = new TestingLeaderRetrievalService();
haServices.setResourceManagerLeaderRetriever(resourceManagerLeaderRetrievalService);
haServices.setJobMasterLeaderRetriever(jobId, jobManagerLeaderRetrievalService);
// --- resource manager gateway mock: answers the task executor registration with success ---
final String resourceManagerAddress = "rm";
final UUID resourceManagerLeaderId = UUID.randomUUID();
final ResourceManagerGateway resourceManagerGateway = mock(ResourceManagerGateway.class);
final InstanceID registrationId = new InstanceID();
when(resourceManagerGateway.registerTaskExecutor(eq(resourceManagerLeaderId), any(String.class), eq(resourceId), any(SlotReport.class), any(Time.class))).thenReturn(FlinkCompletableFuture.<RegistrationResponse>completed(new TaskExecutorRegistrationSuccess(registrationId, 1000L)));
// --- job master gateway mock: answers the task manager registration with success ---
final String jobManagerAddress = "jm";
final UUID jobManagerLeaderId = UUID.randomUUID();
final ResourceID jmResourceId = new ResourceID(jobManagerAddress);
final int blobPort = 42;
final JobMasterGateway jobMasterGateway = mock(JobMasterGateway.class);
when(jobMasterGateway.registerTaskManager(any(String.class), eq(taskManagerLocation), eq(jobManagerLeaderId), any(Time.class))).thenReturn(FlinkCompletableFuture.<RegistrationResponse>completed(new JMTMRegistrationSuccess(jmResourceId, blobPort)));
when(jobMasterGateway.getHostname()).thenReturn(jobManagerAddress);
// register both mocked gateways with the rpc service under their addresses
rpc.registerGateway(resourceManagerAddress, resourceManagerGateway);
rpc.registerGateway(jobManagerAddress, jobMasterGateway);
// --- the slot that is requested below and the offer expected at the job master ---
final AllocationID allocationId = new AllocationID();
final SlotID slotId = new SlotID(resourceId, 0);
final SlotOffer slotOffer = new SlotOffer(allocationId, 0, ResourceProfile.UNKNOWN);
try {
// every collaborator not prepared above is passed in as a plain mock
TaskExecutor taskManager = new TaskExecutor(taskManagerConfiguration, taskManagerLocation, rpc, mock(MemoryManager.class), mock(IOManager.class), mock(NetworkEnvironment.class), haServices, mock(HeartbeatServices.class, RETURNS_MOCKS), mock(MetricRegistry.class), mock(TaskManagerMetricGroup.class), mock(BroadcastVariableManager.class), mock(FileCache.class), taskSlotTable, jobManagerTable, jobLeaderService, testingFatalErrorHandler);
taskManager.start();
// tell the task manager about the rm leader
resourceManagerLeaderRetrievalService.notifyListener(resourceManagerAddress, resourceManagerLeaderId);
// request slots from the task manager under the given allocation id
TMSlotRequestReply reply = taskManager.requestSlot(slotId, jobId, allocationId, jobManagerAddress, resourceManagerLeaderId);
// the request is expected to be acknowledged as registered
assertTrue(reply instanceof TMSlotRequestRegistered);
// now inform the task manager about the new job leader
jobManagerLeaderRetrievalService.notifyListener(jobManagerAddress, jobManagerLeaderId);
// the job leader should get the allocation id offered
verify(jobMasterGateway).offerSlots(any(ResourceID.class), (Iterable<SlotOffer>) Matchers.argThat(contains(slotOffer)), eq(jobManagerLeaderId), any(Time.class));
// check if a concurrent error occurred
testingFatalErrorHandler.rethrowError();
} finally {
// always stop the rpc service, even if an assertion above failed
rpc.stopService();
}
}
Aggregations