Search in sources :

Example 1 with ResourceID

use of org.apache.flink.runtime.clusterframework.types.ResourceID in project flink by apache.

the class ResourceManager method notifySlotAvailable.

/**
 * Notification from a TaskExecutor that a slot has become available.
 *
 * <p>The notification is forwarded to the slot manager only when the given leader id equals
 * the current leader session id, a registration exists for the slot's resource id, and that
 * registration's instance id equals the given instance id. In every other case the message is
 * logged at debug level and dropped.
 *
 * @param resourceManagerLeaderId TaskExecutor's resource manager leader id
 * @param instanceID TaskExecutor's instance id
 * @param slotId The slot id of the available slot
 */
@RpcMethod
public void notifySlotAvailable(final UUID resourceManagerLeaderId, final InstanceID instanceID, final SlotID slotId) {
    if (resourceManagerLeaderId.equals(leaderSessionId)) {
        final ResourceID resourceId = slotId.getResourceID();
        WorkerRegistration<WorkerType> registration = taskExecutors.get(resourceId);
        if (registration != null) {
            InstanceID registrationId = registration.getInstanceID();
            if (registrationId.equals(instanceID)) {
                slotManager.notifySlotAvailable(resourceId, slotId);
            } else {
                // The registered instance id differs from the sender's instance id.
                log.debug("Invalid registration id for slot available message. This indicates an outdated request.");
            }
        } else {
            // Fixed: the two concatenated literals previously ran together as "availablemessage".
            log.debug("Could not find registration for resource id {}. Discarding the slot available message {}.", resourceId, slotId);
        }
    } else {
        log.debug("Discarding notify slot available message for slot {}, because the leader id {} did not match the expected leader id {}.", slotId, resourceManagerLeaderId, leaderSessionId);
    }
}
Also used : ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) InstanceID(org.apache.flink.runtime.instance.InstanceID) RpcMethod(org.apache.flink.runtime.rpc.RpcMethod)

Example 2 with ResourceID

use of org.apache.flink.runtime.clusterframework.types.ResourceID in project flink by apache.

the class SlotManager method addFreeSlot.

/**
 * Puts the given slot straight into the free pool and into the per-resource slot registry.
 * Pending slot requests are not matched against the slot by this method.
 *
 * @param slot The resource slot to add
 */
@VisibleForTesting
void addFreeSlot(final ResourceSlot slot) {
    final ResourceID ownerId = slot.getResourceID();
    final SlotID freeSlotId = slot.getSlotId();
    // Lazily create the per-resource slot map the first time this resource is seen.
    final boolean ownerUnknown = !registeredSlots.containsKey(ownerId);
    if (ownerUnknown) {
        registeredSlots.put(ownerId, new HashMap<SlotID, ResourceSlot>());
    }
    registeredSlots.get(ownerId).put(freeSlotId, slot);
    freeSlots.put(freeSlotId, slot);
}
Also used : SlotID(org.apache.flink.runtime.clusterframework.types.SlotID) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) ResourceSlot(org.apache.flink.runtime.clusterframework.types.ResourceSlot) VisibleForTesting(org.apache.flink.annotation.VisibleForTesting)

Example 3 with ResourceID

use of org.apache.flink.runtime.clusterframework.types.ResourceID in project flink by apache.

the class SlotManager method registerNewSlot.

/**
 * Records a new slot in the per-resource slot registry of the SlotManager.
 *
 * @param slot The ResourceSlot which will be registered
 */
private void registerNewSlot(final ResourceSlot slot) {
    final SlotID newSlotId = slot.getSlotId();
    // The owning resource is derived from the slot id itself.
    final ResourceID ownerId = newSlotId.getResourceID();
    final boolean ownerUnknown = !registeredSlots.containsKey(ownerId);
    if (ownerUnknown) {
        // First slot seen for this resource: create its slot map.
        registeredSlots.put(ownerId, new HashMap<SlotID, ResourceSlot>());
    }
    registeredSlots.get(ownerId).put(newSlotId, slot);
}
Also used : SlotID(org.apache.flink.runtime.clusterframework.types.SlotID) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) ResourceSlot(org.apache.flink.runtime.clusterframework.types.ResourceSlot)

Example 4 with ResourceID

use of org.apache.flink.runtime.clusterframework.types.ResourceID in project flink by apache.

the class FlinkResourceManager method jobManagerLeaderConnected.

/**
 * Callback invoked when a new leading JobManager is announced.
 *
 * <p>If no JobManager is currently associated, the new leader is adopted, the workers it
 * reports are consolidated into the local bookkeeping, and the worker pool check is triggered.
 * If a JobManager is already associated, a fatal error is raised instead.
 *
 * @param newJobManagerLeader The ActorRef of the new jobManager
 * @param workers The existing workers the JobManager has registered.
 */
private void jobManagerLeaderConnected(ActorRef newJobManagerLeader, Collection<ResourceID> workers) {
    // Guard: associating with a new leader while still attached to another one is fatal.
    if (jobManager != null) {
        String msg = "Attempting to associate with new JobManager leader " + newJobManagerLeader + " without previously disassociating from current leader " + jobManager;
        fatalError(msg, new Exception(msg));
        return;
    }
    LOG.info("Resource Manager associating with leading JobManager {} - leader session {}", newJobManagerLeader, leaderSessionID);
    jobManager = newJobManagerLeader;
    if (!workers.isEmpty()) {
        LOG.info("Received TaskManagers that were registered at the leader JobManager. Trying to consolidate.");
        // Ids that have not been accepted into the bookkeeping yet.
        Set<ResourceID> unresolved = new HashSet<>(workers);
        try {
            // Let the framework decide which of the reported workers are kept for now.
            Collection<WorkerType> accepted = reacceptRegisteredWorkers(workers);
            LOG.info("Consolidated {} TaskManagers", accepted.size());
            for (WorkerType acceptedWorker : accepted) {
                ResourceID acceptedId = acceptedWorker.getResourceID();
                startedWorkers.put(acceptedId, acceptedWorker);
                unresolved.remove(acceptedId);
            }
        } catch (Throwable t) {
            LOG.error("Error during consolidation of known TaskManagers", t);
            // On failure, hand every id that was not consolidated back for release.
            for (ResourceID pendingId : unresolved) {
                releasePendingWorker(pendingId);
            }
        }
    }
    // Trigger the initial check for requesting new workers.
    checkWorkersPool();
}
Also used : ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) HashSet(java.util.HashSet)

Example 5 with ResourceID

use of org.apache.flink.runtime.clusterframework.types.ResourceID in project flink by apache.

the class TaskExecutorTest method testJobLeaderDetection.

/**
	 * Tests that a TaskManager detects a job leader for which it has reserved slots. Upon detecting
	 * the job leader, it will offer all reserved slots to the JobManager.
	 *
	 * <p>Flow of the test: wire up mocked resource manager and job master gateways, start a
	 * TaskExecutor, announce the resource manager leader, request a slot for the job, announce
	 * the job manager leader, and finally verify that the slot was offered to the job master.
	 *
	 * @throws Exception if any step of the test fails, including an error recorded by the
	 *                   testing fatal error handler
	 */
@Test
public void testJobLeaderDetection() throws Exception {
    // --- basic task manager setup ---
    final JobID jobId = new JobID();
    final TestingSerialRpcService rpc = new TestingSerialRpcService();
    final Configuration configuration = new Configuration();
    final TaskManagerConfiguration taskManagerConfiguration = TaskManagerConfiguration.fromConfiguration(configuration);
    final ResourceID resourceId = new ResourceID("foobar");
    final TaskManagerLocation taskManagerLocation = new TaskManagerLocation(resourceId, InetAddress.getLoopbackAddress(), 1234);
    final TestingHighAvailabilityServices haServices = new TestingHighAvailabilityServices();
    final TimerService<AllocationID> timerService = mock(TimerService.class);
    // slot table built from a single mocked resource profile
    final TaskSlotTable taskSlotTable = new TaskSlotTable(Arrays.asList(mock(ResourceProfile.class)), timerService);
    final JobManagerTable jobManagerTable = new JobManagerTable();
    final JobLeaderService jobLeaderService = new JobLeaderService(taskManagerLocation);
    final TestingFatalErrorHandler testingFatalErrorHandler = new TestingFatalErrorHandler();
    // --- leader retrieval services, driven manually further below ---
    final TestingLeaderRetrievalService resourceManagerLeaderRetrievalService = new TestingLeaderRetrievalService();
    final TestingLeaderRetrievalService jobManagerLeaderRetrievalService = new TestingLeaderRetrievalService();
    haServices.setResourceManagerLeaderRetriever(resourceManagerLeaderRetrievalService);
    haServices.setJobMasterLeaderRetriever(jobId, jobManagerLeaderRetrievalService);
    // --- mocked resource manager gateway: registration of this task executor succeeds ---
    final String resourceManagerAddress = "rm";
    final UUID resourceManagerLeaderId = UUID.randomUUID();
    final ResourceManagerGateway resourceManagerGateway = mock(ResourceManagerGateway.class);
    final InstanceID registrationId = new InstanceID();
    when(resourceManagerGateway.registerTaskExecutor(eq(resourceManagerLeaderId), any(String.class), eq(resourceId), any(SlotReport.class), any(Time.class))).thenReturn(FlinkCompletableFuture.<RegistrationResponse>completed(new TaskExecutorRegistrationSuccess(registrationId, 1000L)));
    // --- mocked job master gateway: registration of this task manager succeeds ---
    final String jobManagerAddress = "jm";
    final UUID jobManagerLeaderId = UUID.randomUUID();
    final ResourceID jmResourceId = new ResourceID(jobManagerAddress);
    final int blobPort = 42;
    final JobMasterGateway jobMasterGateway = mock(JobMasterGateway.class);
    when(jobMasterGateway.registerTaskManager(any(String.class), eq(taskManagerLocation), eq(jobManagerLeaderId), any(Time.class))).thenReturn(FlinkCompletableFuture.<RegistrationResponse>completed(new JMTMRegistrationSuccess(jmResourceId, blobPort)));
    when(jobMasterGateway.getHostname()).thenReturn(jobManagerAddress);
    // make both mocked gateways reachable under their addresses
    rpc.registerGateway(resourceManagerAddress, resourceManagerGateway);
    rpc.registerGateway(jobManagerAddress, jobMasterGateway);
    // --- the slot to request and the offer expected for it ---
    final AllocationID allocationId = new AllocationID();
    final SlotID slotId = new SlotID(resourceId, 0);
    final SlotOffer slotOffer = new SlotOffer(allocationId, 0, ResourceProfile.UNKNOWN);
    try {
        TaskExecutor taskManager = new TaskExecutor(taskManagerConfiguration, taskManagerLocation, rpc, mock(MemoryManager.class), mock(IOManager.class), mock(NetworkEnvironment.class), haServices, mock(HeartbeatServices.class, RETURNS_MOCKS), mock(MetricRegistry.class), mock(TaskManagerMetricGroup.class), mock(BroadcastVariableManager.class), mock(FileCache.class), taskSlotTable, jobManagerTable, jobLeaderService, testingFatalErrorHandler);
        taskManager.start();
        // tell the task manager about the rm leader
        resourceManagerLeaderRetrievalService.notifyListener(resourceManagerAddress, resourceManagerLeaderId);
        // request slots from the task manager under the given allocation id
        TMSlotRequestReply reply = taskManager.requestSlot(slotId, jobId, allocationId, jobManagerAddress, resourceManagerLeaderId);
        // the slot request is expected to be acknowledged as registered
        assertTrue(reply instanceof TMSlotRequestRegistered);
        // now inform the task manager about the new job leader
        jobManagerLeaderRetrievalService.notifyListener(jobManagerAddress, jobManagerLeaderId);
        // the job leader should get the allocation id offered
        verify(jobMasterGateway).offerSlots(any(ResourceID.class), (Iterable<SlotOffer>) Matchers.argThat(contains(slotOffer)), eq(jobManagerLeaderId), any(Time.class));
        // check if a concurrent error occurred
        testingFatalErrorHandler.rethrowError();
    } finally {
        // always stop the RPC service, even when an assertion above failed
        rpc.stopService();
    }
}
Also used : Configuration(org.apache.flink.configuration.Configuration) TMSlotRequestReply(org.apache.flink.runtime.resourcemanager.messages.taskexecutor.TMSlotRequestReply) TestingLeaderRetrievalService(org.apache.flink.runtime.leaderelection.TestingLeaderRetrievalService) InstanceID(org.apache.flink.runtime.instance.InstanceID) Time(org.apache.flink.api.common.time.Time) JobMasterGateway(org.apache.flink.runtime.jobmaster.JobMasterGateway) ResourceManagerGateway(org.apache.flink.runtime.resourcemanager.ResourceManagerGateway) TestingHighAvailabilityServices(org.apache.flink.runtime.highavailability.TestingHighAvailabilityServices) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) BroadcastVariableManager(org.apache.flink.runtime.broadcast.BroadcastVariableManager) TestingSerialRpcService(org.apache.flink.runtime.rpc.TestingSerialRpcService) TMSlotRequestRegistered(org.apache.flink.runtime.resourcemanager.messages.taskexecutor.TMSlotRequestRegistered) UUID(java.util.UUID) TestingFatalErrorHandler(org.apache.flink.runtime.util.TestingFatalErrorHandler) HeartbeatServices(org.apache.flink.runtime.heartbeat.HeartbeatServices) JMTMRegistrationSuccess(org.apache.flink.runtime.jobmaster.JMTMRegistrationSuccess) SlotOffer(org.apache.flink.runtime.taskexecutor.slot.SlotOffer) IOManager(org.apache.flink.runtime.io.disk.iomanager.IOManager) TaskManagerLocation(org.apache.flink.runtime.taskmanager.TaskManagerLocation) AllocationID(org.apache.flink.runtime.clusterframework.types.AllocationID) MetricRegistry(org.apache.flink.runtime.metrics.MetricRegistry) TaskManagerMetricGroup(org.apache.flink.runtime.metrics.groups.TaskManagerMetricGroup) MemoryManager(org.apache.flink.runtime.memory.MemoryManager) FileCache(org.apache.flink.runtime.filecache.FileCache) SlotID(org.apache.flink.runtime.clusterframework.types.SlotID) TaskSlotTable(org.apache.flink.runtime.taskexecutor.slot.TaskSlotTable) 
NetworkEnvironment(org.apache.flink.runtime.io.network.NetworkEnvironment) JobID(org.apache.flink.api.common.JobID) Test(org.junit.Test)

Aggregations

ResourceID (org.apache.flink.runtime.clusterframework.types.ResourceID)74 Test (org.junit.Test)48 TaskManagerLocation (org.apache.flink.runtime.taskmanager.TaskManagerLocation)25 Time (org.apache.flink.api.common.time.Time)18 UUID (java.util.UUID)16 JobID (org.apache.flink.api.common.JobID)16 Configuration (org.apache.flink.configuration.Configuration)14 AllocationID (org.apache.flink.runtime.clusterframework.types.AllocationID)13 JavaTestKit (akka.testkit.JavaTestKit)12 MetricRegistry (org.apache.flink.runtime.metrics.MetricRegistry)12 InetAddress (java.net.InetAddress)11 SlotID (org.apache.flink.runtime.clusterframework.types.SlotID)10 HeartbeatServices (org.apache.flink.runtime.heartbeat.HeartbeatServices)10 TestingHighAvailabilityServices (org.apache.flink.runtime.highavailability.TestingHighAvailabilityServices)10 SlotRequest (org.apache.flink.runtime.resourcemanager.SlotRequest)10 IOManager (org.apache.flink.runtime.io.disk.iomanager.IOManager)9 NetworkEnvironment (org.apache.flink.runtime.io.network.NetworkEnvironment)9 ActorTaskManagerGateway (org.apache.flink.runtime.jobmanager.slots.ActorTaskManagerGateway)9 MemoryManager (org.apache.flink.runtime.memory.MemoryManager)9 TestingSerialRpcService (org.apache.flink.runtime.rpc.TestingSerialRpcService)9