Search in sources :

Example 61 with TaskManagerLocation

use of org.apache.flink.runtime.taskmanager.TaskManagerLocation in project flink by apache.

the class TaskExecutorITCase method testSlotAllocation.

@Test
public void testSlotAllocation() throws Exception {
    TestingFatalErrorHandler testingFatalErrorHandler = new TestingFatalErrorHandler();
    TestingHighAvailabilityServices testingHAServices = new TestingHighAvailabilityServices();
    final Configuration configuration = new Configuration();
    final ScheduledExecutorService scheduledExecutorService = new ScheduledThreadPoolExecutor(1);
    final ResourceID taskManagerResourceId = new ResourceID("foobar");
    final UUID rmLeaderId = UUID.randomUUID();
    final TestingLeaderElectionService rmLeaderElectionService = new TestingLeaderElectionService();
    final TestingLeaderRetrievalService rmLeaderRetrievalService = new TestingLeaderRetrievalService();
    final String rmAddress = "rm";
    final String jmAddress = "jm";
    final UUID jmLeaderId = UUID.randomUUID();
    final JobID jobId = new JobID();
    final ResourceProfile resourceProfile = new ResourceProfile(1.0, 1);
    testingHAServices.setResourceManagerLeaderElectionService(rmLeaderElectionService);
    testingHAServices.setResourceManagerLeaderRetriever(rmLeaderRetrievalService);
    testingHAServices.setJobMasterLeaderRetriever(jobId, new TestingLeaderRetrievalService(jmAddress, jmLeaderId));
    TestingSerialRpcService rpcService = new TestingSerialRpcService();
    ResourceManagerConfiguration resourceManagerConfiguration = new ResourceManagerConfiguration(Time.milliseconds(500L), Time.milliseconds(500L), Time.minutes(5L));
    SlotManagerFactory slotManagerFactory = new DefaultSlotManager.Factory();
    JobLeaderIdService jobLeaderIdService = new JobLeaderIdService(testingHAServices, rpcService.getScheduledExecutor(), resourceManagerConfiguration.getJobTimeout());
    MetricRegistry metricRegistry = mock(MetricRegistry.class);
    HeartbeatServices heartbeatServices = mock(HeartbeatServices.class, RETURNS_MOCKS);
    final TaskManagerConfiguration taskManagerConfiguration = TaskManagerConfiguration.fromConfiguration(configuration);
    final TaskManagerLocation taskManagerLocation = new TaskManagerLocation(taskManagerResourceId, InetAddress.getLocalHost(), 1234);
    final MemoryManager memoryManager = mock(MemoryManager.class);
    final IOManager ioManager = mock(IOManager.class);
    final NetworkEnvironment networkEnvironment = mock(NetworkEnvironment.class);
    final TaskManagerMetricGroup taskManagerMetricGroup = mock(TaskManagerMetricGroup.class);
    final BroadcastVariableManager broadcastVariableManager = mock(BroadcastVariableManager.class);
    final FileCache fileCache = mock(FileCache.class);
    final TaskSlotTable taskSlotTable = new TaskSlotTable(Arrays.asList(resourceProfile), new TimerService<AllocationID>(scheduledExecutorService, 100L));
    final JobManagerTable jobManagerTable = new JobManagerTable();
    final JobLeaderService jobLeaderService = new JobLeaderService(taskManagerLocation);
    ResourceManager<ResourceID> resourceManager = new StandaloneResourceManager(rpcService, resourceManagerConfiguration, testingHAServices, slotManagerFactory, metricRegistry, jobLeaderIdService, testingFatalErrorHandler);
    TaskExecutor taskExecutor = new TaskExecutor(taskManagerConfiguration, taskManagerLocation, rpcService, memoryManager, ioManager, networkEnvironment, testingHAServices, heartbeatServices, metricRegistry, taskManagerMetricGroup, broadcastVariableManager, fileCache, taskSlotTable, jobManagerTable, jobLeaderService, testingFatalErrorHandler);
    JobMasterGateway jmGateway = mock(JobMasterGateway.class);
    when(jmGateway.registerTaskManager(any(String.class), any(TaskManagerLocation.class), eq(jmLeaderId), any(Time.class))).thenReturn(FlinkCompletableFuture.<RegistrationResponse>completed(new JMTMRegistrationSuccess(taskManagerResourceId, 1234)));
    when(jmGateway.getHostname()).thenReturn(jmAddress);
    rpcService.registerGateway(rmAddress, resourceManager.getSelf());
    rpcService.registerGateway(jmAddress, jmGateway);
    final AllocationID allocationId = new AllocationID();
    final SlotRequest slotRequest = new SlotRequest(jobId, allocationId, resourceProfile);
    final SlotOffer slotOffer = new SlotOffer(allocationId, 0, resourceProfile);
    try {
        resourceManager.start();
        taskExecutor.start();
        // notify the RM that it is the leader
        rmLeaderElectionService.isLeader(rmLeaderId);
        // notify the TM about the new RM leader
        rmLeaderRetrievalService.notifyListener(rmAddress, rmLeaderId);
        Future<RegistrationResponse> registrationResponseFuture = resourceManager.registerJobManager(rmLeaderId, jmLeaderId, jmAddress, jobId);
        RegistrationResponse registrationResponse = registrationResponseFuture.get();
        assertTrue(registrationResponse instanceof JobMasterRegistrationSuccess);
        resourceManager.requestSlot(jmLeaderId, rmLeaderId, slotRequest);
        verify(jmGateway).offerSlots(eq(taskManagerResourceId), (Iterable<SlotOffer>) argThat(Matchers.contains(slotOffer)), eq(jmLeaderId), any(Time.class));
    } finally {
        if (testingFatalErrorHandler.hasExceptionOccurred()) {
            testingFatalErrorHandler.rethrowError();
        }
    }
}
Also used : ResourceManagerConfiguration(org.apache.flink.runtime.resourcemanager.ResourceManagerConfiguration) Configuration(org.apache.flink.configuration.Configuration) TestingLeaderRetrievalService(org.apache.flink.runtime.leaderelection.TestingLeaderRetrievalService) ScheduledThreadPoolExecutor(java.util.concurrent.ScheduledThreadPoolExecutor) JobLeaderIdService(org.apache.flink.runtime.resourcemanager.JobLeaderIdService) SlotManagerFactory(org.apache.flink.runtime.resourcemanager.slotmanager.SlotManagerFactory) Time(org.apache.flink.api.common.time.Time) StandaloneResourceManager(org.apache.flink.runtime.resourcemanager.StandaloneResourceManager) JobMasterGateway(org.apache.flink.runtime.jobmaster.JobMasterGateway) SlotRequest(org.apache.flink.runtime.resourcemanager.SlotRequest) TestingHighAvailabilityServices(org.apache.flink.runtime.highavailability.TestingHighAvailabilityServices) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) BroadcastVariableManager(org.apache.flink.runtime.broadcast.BroadcastVariableManager) TestingSerialRpcService(org.apache.flink.runtime.rpc.TestingSerialRpcService) UUID(java.util.UUID) RegistrationResponse(org.apache.flink.runtime.registration.RegistrationResponse) TestingFatalErrorHandler(org.apache.flink.runtime.util.TestingFatalErrorHandler) HeartbeatServices(org.apache.flink.runtime.heartbeat.HeartbeatServices) ResourceProfile(org.apache.flink.runtime.clusterframework.types.ResourceProfile) JMTMRegistrationSuccess(org.apache.flink.runtime.jobmaster.JMTMRegistrationSuccess) ScheduledExecutorService(java.util.concurrent.ScheduledExecutorService) TestingLeaderElectionService(org.apache.flink.runtime.leaderelection.TestingLeaderElectionService) SlotOffer(org.apache.flink.runtime.taskexecutor.slot.SlotOffer) JobMasterRegistrationSuccess(org.apache.flink.runtime.jobmaster.JobMasterRegistrationSuccess) IOManager(org.apache.flink.runtime.io.disk.iomanager.IOManager) TaskManagerLocation(org.apache.flink.runtime.taskmanager.TaskManagerLocation) MetricRegistry(org.apache.flink.runtime.metrics.MetricRegistry) TaskManagerMetricGroup(org.apache.flink.runtime.metrics.groups.TaskManagerMetricGroup) AllocationID(org.apache.flink.runtime.clusterframework.types.AllocationID) ResourceManagerConfiguration(org.apache.flink.runtime.resourcemanager.ResourceManagerConfiguration) MemoryManager(org.apache.flink.runtime.memory.MemoryManager) FileCache(org.apache.flink.runtime.filecache.FileCache) SlotManagerFactory(org.apache.flink.runtime.resourcemanager.slotmanager.SlotManagerFactory) TaskSlotTable(org.apache.flink.runtime.taskexecutor.slot.TaskSlotTable) NetworkEnvironment(org.apache.flink.runtime.io.network.NetworkEnvironment) JobID(org.apache.flink.api.common.JobID) Test(org.junit.Test)

Example 62 with TaskManagerLocation

use of org.apache.flink.runtime.taskmanager.TaskManagerLocation in project flink by apache.

the class TaskExecutorTest method testRejectAllocationRequestsForOutOfSyncSlots.

/**
	 * Tests that all allocation requests for slots are ignored if the slot has been reported as
	 * free by the TaskExecutor but this report hasn't been confirmed by the ResourceManager.
	 *
	 * This is essential for the correctness of the state of the ResourceManager.
	 */
@Ignore
@Test
public void testRejectAllocationRequestsForOutOfSyncSlots() throws Exception {
    final ResourceID resourceID = ResourceID.generate();
    final String address1 = "/resource/manager/address/one";
    final UUID leaderId = UUID.randomUUID();
    final JobID jobId = new JobID();
    final String jobManagerAddress = "foobar";
    final TestingSerialRpcService rpc = new TestingSerialRpcService();
    try {
        // register the mock resource manager gateways
        ResourceManagerGateway rmGateway1 = mock(ResourceManagerGateway.class);
        rpc.registerGateway(address1, rmGateway1);
        TestingLeaderRetrievalService testLeaderService = new TestingLeaderRetrievalService();
        TestingHighAvailabilityServices haServices = new TestingHighAvailabilityServices();
        haServices.setResourceManagerLeaderRetriever(testLeaderService);
        TaskManagerConfiguration taskManagerServicesConfiguration = mock(TaskManagerConfiguration.class);
        when(taskManagerServicesConfiguration.getNumberSlots()).thenReturn(1);
        TaskManagerLocation taskManagerLocation = mock(TaskManagerLocation.class);
        when(taskManagerLocation.getResourceID()).thenReturn(resourceID);
        final TestingFatalErrorHandler testingFatalErrorHandler = new TestingFatalErrorHandler();
        TaskExecutor taskManager = new TaskExecutor(taskManagerServicesConfiguration, taskManagerLocation, rpc, mock(MemoryManager.class), mock(IOManager.class), mock(NetworkEnvironment.class), haServices, mock(HeartbeatServices.class, RETURNS_MOCKS), mock(MetricRegistry.class), mock(TaskManagerMetricGroup.class), mock(BroadcastVariableManager.class), mock(FileCache.class), mock(TaskSlotTable.class), mock(JobManagerTable.class), mock(JobLeaderService.class), testingFatalErrorHandler);
        taskManager.start();
        String taskManagerAddress = taskManager.getAddress();
        // no connection initially, since there is no leader
        assertNull(taskManager.getResourceManagerConnection());
        // define a leader and see that a registration happens
        testLeaderService.notifyListener(address1, leaderId);
        verify(rmGateway1).registerTaskExecutor(eq(leaderId), eq(taskManagerAddress), eq(resourceID), any(SlotReport.class), any(Time.class));
        assertNotNull(taskManager.getResourceManagerConnection());
        // test that allocating a slot works
        final SlotID slotID = new SlotID(resourceID, 0);
        TMSlotRequestReply tmSlotRequestReply = taskManager.requestSlot(slotID, jobId, new AllocationID(), jobManagerAddress, leaderId);
        assertTrue(tmSlotRequestReply instanceof TMSlotRequestRegistered);
        // TODO: Figure out the concrete allocation behaviour between RM and TM. Maybe we don't need the SlotID...
        // test that we can't allocate slots which are blacklisted due to pending confirmation of the RM
        final SlotID unconfirmedFreeSlotID = new SlotID(resourceID, 1);
        TMSlotRequestReply tmSlotRequestReply2 = taskManager.requestSlot(unconfirmedFreeSlotID, jobId, new AllocationID(), jobManagerAddress, leaderId);
        assertTrue(tmSlotRequestReply2 instanceof TMSlotRequestRejected);
        // re-register
        verify(rmGateway1).registerTaskExecutor(eq(leaderId), eq(taskManagerAddress), eq(resourceID), any(SlotReport.class), any(Time.class));
        testLeaderService.notifyListener(address1, leaderId);
        // now we should be successful because the slots status has been synced
        // test that we can't allocate slots which are blacklisted due to pending confirmation of the RM
        TMSlotRequestReply tmSlotRequestReply3 = taskManager.requestSlot(unconfirmedFreeSlotID, jobId, new AllocationID(), jobManagerAddress, leaderId);
        assertTrue(tmSlotRequestReply3 instanceof TMSlotRequestRegistered);
        // check if a concurrent error occurred
        testingFatalErrorHandler.rethrowError();
    } finally {
        rpc.stopService();
    }
}
Also used : TMSlotRequestReply(org.apache.flink.runtime.resourcemanager.messages.taskexecutor.TMSlotRequestReply) TestingLeaderRetrievalService(org.apache.flink.runtime.leaderelection.TestingLeaderRetrievalService) Time(org.apache.flink.api.common.time.Time) ResourceManagerGateway(org.apache.flink.runtime.resourcemanager.ResourceManagerGateway) TMSlotRequestRejected(org.apache.flink.runtime.resourcemanager.messages.taskexecutor.TMSlotRequestRejected) TestingHighAvailabilityServices(org.apache.flink.runtime.highavailability.TestingHighAvailabilityServices) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) BroadcastVariableManager(org.apache.flink.runtime.broadcast.BroadcastVariableManager) TestingSerialRpcService(org.apache.flink.runtime.rpc.TestingSerialRpcService) TMSlotRequestRegistered(org.apache.flink.runtime.resourcemanager.messages.taskexecutor.TMSlotRequestRegistered) UUID(java.util.UUID) TestingFatalErrorHandler(org.apache.flink.runtime.util.TestingFatalErrorHandler) HeartbeatServices(org.apache.flink.runtime.heartbeat.HeartbeatServices) IOManager(org.apache.flink.runtime.io.disk.iomanager.IOManager) TaskManagerLocation(org.apache.flink.runtime.taskmanager.TaskManagerLocation) MetricRegistry(org.apache.flink.runtime.metrics.MetricRegistry) TaskManagerMetricGroup(org.apache.flink.runtime.metrics.groups.TaskManagerMetricGroup) AllocationID(org.apache.flink.runtime.clusterframework.types.AllocationID) MemoryManager(org.apache.flink.runtime.memory.MemoryManager) FileCache(org.apache.flink.runtime.filecache.FileCache) SlotID(org.apache.flink.runtime.clusterframework.types.SlotID) TaskSlotTable(org.apache.flink.runtime.taskexecutor.slot.TaskSlotTable) NetworkEnvironment(org.apache.flink.runtime.io.network.NetworkEnvironment) JobID(org.apache.flink.api.common.JobID) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 63 with TaskManagerLocation

use of org.apache.flink.runtime.taskmanager.TaskManagerLocation in project flink by apache.

the class Scheduler method getHostnamesFromInstances.

// ------------------------------------------------------------------------
//  Utilities
// ------------------------------------------------------------------------
private static String getHostnamesFromInstances(Iterable<TaskManagerLocation> locations) {
    StringBuilder bld = new StringBuilder();
    boolean successive = false;
    for (TaskManagerLocation loc : locations) {
        if (successive) {
            bld.append(", ");
        } else {
            successive = true;
        }
        bld.append(loc.getHostname());
    }
    return bld.toString();
}
Also used : TaskManagerLocation(org.apache.flink.runtime.taskmanager.TaskManagerLocation)

Example 64 with TaskManagerLocation

use of org.apache.flink.runtime.taskmanager.TaskManagerLocation in project flink by apache.

the class Scheduler method findInstance.

/**
	 * Tries to find a requested instance. If no such instance is available it will return a non-
	 * local instance. If no such instance exists (all slots occupied), then return null.
	 * 
	 * <p><b>NOTE:</b> This method is not thread-safe, it needs to be synchronized by the caller.</p>
	 *
	 * @param requestedLocations The list of preferred instances. May be null or empty, which indicates that
	 *                           no locality preference exists.   
	 * @param localOnly Flag to indicate whether only one of the exact local instances can be chosen.  
	 */
private Pair<Instance, Locality> findInstance(Iterable<TaskManagerLocation> requestedLocations, boolean localOnly) {
    // drain the queue of newly available instances
    while (this.newlyAvailableInstances.size() > 0) {
        Instance queuedInstance = this.newlyAvailableInstances.poll();
        if (queuedInstance != null) {
            this.instancesWithAvailableResources.put(queuedInstance.getTaskManagerID(), queuedInstance);
        }
    }
    // if nothing is available at all, return null
    if (this.instancesWithAvailableResources.isEmpty()) {
        return null;
    }
    Iterator<TaskManagerLocation> locations = requestedLocations == null ? null : requestedLocations.iterator();
    if (locations != null && locations.hasNext()) {
        while (locations.hasNext()) {
            TaskManagerLocation location = locations.next();
            if (location != null) {
                Instance instance = instancesWithAvailableResources.remove(location.getResourceID());
                if (instance != null) {
                    return new ImmutablePair<Instance, Locality>(instance, Locality.LOCAL);
                }
            }
        }
        // no local instance available
        if (localOnly) {
            return null;
        } else {
            // take the first instance from the instances with resources
            Iterator<Instance> instances = instancesWithAvailableResources.values().iterator();
            Instance instanceToUse = instances.next();
            instances.remove();
            return new ImmutablePair<>(instanceToUse, Locality.NON_LOCAL);
        }
    } else {
        // no location preference, so use some instance
        Iterator<Instance> instances = instancesWithAvailableResources.values().iterator();
        Instance instanceToUse = instances.next();
        instances.remove();
        return new ImmutablePair<>(instanceToUse, Locality.UNCONSTRAINED);
    }
}
Also used : ImmutablePair(org.apache.commons.lang3.tuple.ImmutablePair) Instance(org.apache.flink.runtime.instance.Instance) TaskManagerLocation(org.apache.flink.runtime.taskmanager.TaskManagerLocation)

Example 65 with TaskManagerLocation

use of org.apache.flink.runtime.taskmanager.TaskManagerLocation in project flink by apache.

the class ExecutionGraphSchedulingTest method createSlot.

private SimpleSlot createSlot(TaskManagerGateway taskManager, JobID jobId, SlotOwner slotOwner) {
    TaskManagerLocation location = new TaskManagerLocation(ResourceID.generate(), InetAddress.getLoopbackAddress(), 12345);
    AllocatedSlot slot = new AllocatedSlot(new AllocationID(), jobId, location, 0, ResourceProfile.UNKNOWN, taskManager);
    return new SimpleSlot(slot, slotOwner, 0);
}
Also used : AllocatedSlot(org.apache.flink.runtime.jobmanager.slots.AllocatedSlot) TaskManagerLocation(org.apache.flink.runtime.taskmanager.TaskManagerLocation) AllocationID(org.apache.flink.runtime.clusterframework.types.AllocationID) SimpleSlot(org.apache.flink.runtime.instance.SimpleSlot)

Aggregations

TaskManagerLocation (org.apache.flink.runtime.taskmanager.TaskManagerLocation)84 Test (org.junit.Test)42 ResourceID (org.apache.flink.runtime.clusterframework.types.ResourceID)25 JobVertexID (org.apache.flink.runtime.jobgraph.JobVertexID)18 AccessExecutionVertex (org.apache.flink.runtime.executiongraph.AccessExecutionVertex)15 SimpleSlot (org.apache.flink.runtime.instance.SimpleSlot)15 ArrayList (java.util.ArrayList)14 JobID (org.apache.flink.api.common.JobID)13 InetAddress (java.net.InetAddress)12 ExecutionException (java.util.concurrent.ExecutionException)12 AllocationID (org.apache.flink.runtime.clusterframework.types.AllocationID)12 ExecutionState (org.apache.flink.runtime.execution.ExecutionState)12 Instance (org.apache.flink.runtime.instance.Instance)12 LocalTaskManagerLocation (org.apache.flink.runtime.taskmanager.LocalTaskManagerLocation)11 JobGraph (org.apache.flink.runtime.jobgraph.JobGraph)10 HashMap (java.util.HashMap)9 ActorTaskManagerGateway (org.apache.flink.runtime.jobmanager.slots.ActorTaskManagerGateway)9 Collection (java.util.Collection)8 SchedulerTestUtils.getRandomInstance (org.apache.flink.runtime.jobmanager.scheduler.SchedulerTestUtils.getRandomInstance)8 List (java.util.List)7