Search in sources :

Example 1 with SlotOffer

use of org.apache.flink.runtime.taskexecutor.slot.SlotOffer in project flink by apache.

the class TaskExecutor method offerSlotsToJobManager.

// ------------------------------------------------------------------------
//  Internal job manager connection methods
// ------------------------------------------------------------------------
/**
 * Offers all slots that are currently allocated for the given job to the leader of that job.
 *
 * <p>Every allocated slot is marked active before it is offered. A slot that cannot be marked
 * active is reported to the job leader via {@code failSlot} and is left out of the offer.
 * When the response arrives, slots the job leader did not accept are freed. If the offer times
 * out it is retried; on any other failure all offered slots are freed.
 *
 * @param jobId id of the job whose allocated slots should be offered
 */
private void offerSlotsToJobManager(final JobID jobId) {
    final JobManagerConnection jobManagerConnection = jobManagerTable.get(jobId);
    if (jobManagerConnection == null) {
        log.debug("There is no job manager connection to the leader of job {}.", jobId);
        return;
    }
    if (!taskSlotTable.hasAllocatedSlots(jobId)) {
        log.debug("There are no unassigned slots for the job {}.", jobId);
        return;
    }

    log.info("Offer reserved slots to the leader of job {}.", jobId);
    final JobMasterGateway jobMasterGateway = jobManagerConnection.getJobManagerGateway();
    final Iterator<TaskSlot> reservedSlotsIterator = taskSlotTable.getAllocatedSlots(jobId);
    final UUID leaderId = jobManagerConnection.getLeaderId();
    final Collection<SlotOffer> reservedSlots = new HashSet<>(2);

    while (reservedSlotsIterator.hasNext()) {
        final SlotOffer offer = reservedSlotsIterator.next().generateSlotOffer();

        boolean markedActive;
        Exception cause = null;
        try {
            // false means the slot is either free or releasing at the moment
            markedActive = taskSlotTable.markSlotActive(offer.getAllocationId());
        } catch (SlotNotFoundException e) {
            markedActive = false;
            cause = e;
        }

        if (!markedActive) {
            // A slot that was just reported as failed must not be offered as well.
            final String message = "Could not mark slot " + offer.getAllocationId() + " of job " + jobId + " active.";
            log.debug(message);
            jobMasterGateway.failSlot(getResourceID(), offer.getAllocationId(), leaderId, new Exception(message, cause));
            continue;
        }
        reservedSlots.add(offer);
    }

    Future<Iterable<SlotOffer>> acceptedSlotsFuture = jobMasterGateway.offerSlots(getResourceID(), reservedSlots, leaderId, taskManagerConfiguration.getTimeout());

    acceptedSlotsFuture.thenAcceptAsync(new AcceptFunction<Iterable<SlotOffer>>() {

        @Override
        public void accept(Iterable<SlotOffer> acceptedSlots) {
            // check if the response is still valid
            if (isJobManagerConnectionValid(jobId, leaderId)) {
                // drop the accepted slots; whatever remains afterwards was rejected
                for (SlotOffer acceptedSlot : acceptedSlots) {
                    reservedSlots.remove(acceptedSlot);
                }
                final Exception e = new Exception("The slot was rejected by the JobManager.");
                for (SlotOffer rejectedSlot : reservedSlots) {
                    freeSlot(rejectedSlot.getAllocationId(), e);
                }
            } else {
                // discard the response since there is a new leader for the job
                log.debug("Discard offer slot response since there is a new leader " + "for the job {}.", jobId);
            }
        }
    }, getMainThreadExecutor());

    acceptedSlotsFuture.exceptionallyAsync(new ApplyFunction<Throwable, Void>() {

        @Override
        public Void apply(Throwable throwable) {
            if (throwable instanceof TimeoutException) {
                // We ran into a timeout. Try again.
                offerSlotsToJobManager(jobId);
            } else {
                // We encountered an exception. Free the slots and return them to the RM.
                for (SlotOffer reservedSlot : reservedSlots) {
                    freeSlot(reservedSlot.getAllocationId(), throwable);
                }
            }
            return null;
        }
    }, getMainThreadExecutor());
}
Also used : SlotNotFoundException(org.apache.flink.runtime.taskexecutor.slot.SlotNotFoundException) SlotOffer(org.apache.flink.runtime.taskexecutor.slot.SlotOffer) TaskSlot(org.apache.flink.runtime.taskexecutor.slot.TaskSlot) JobMasterGateway(org.apache.flink.runtime.jobmaster.JobMasterGateway) TimeoutException(java.util.concurrent.TimeoutException) PartitionException(org.apache.flink.runtime.taskexecutor.exceptions.PartitionException) CheckpointException(org.apache.flink.runtime.taskexecutor.exceptions.CheckpointException) SlotAllocationException(org.apache.flink.runtime.taskexecutor.exceptions.SlotAllocationException) TaskSubmissionException(org.apache.flink.runtime.taskexecutor.exceptions.TaskSubmissionException) TaskException(org.apache.flink.runtime.taskexecutor.exceptions.TaskException) SlotNotActiveException(org.apache.flink.runtime.taskexecutor.slot.SlotNotActiveException) SlotNotFoundException(org.apache.flink.runtime.taskexecutor.slot.SlotNotFoundException) IOException(java.io.IOException) UUID(java.util.UUID) HashSet(java.util.HashSet) TimeoutException(java.util.concurrent.TimeoutException)

Example 2 with SlotOffer

use of org.apache.flink.runtime.taskexecutor.slot.SlotOffer in project flink by apache.

the class TaskExecutorTest method testJobLeaderDetection.

/**
 * Tests that a TaskManager detects the leader of a job for which it has reserved slots. Upon
 * detecting the job leader, it offers all reserved slots to the JobManager.
 */
@Test
public void testJobLeaderDetection() throws Exception {
    final JobID jobId = new JobID();
    final TestingSerialRpcService rpc = new TestingSerialRpcService();

    // task manager side services
    final TaskManagerConfiguration tmConfiguration = TaskManagerConfiguration.fromConfiguration(new Configuration());
    final ResourceID resourceId = new ResourceID("foobar");
    final TaskManagerLocation tmLocation = new TaskManagerLocation(resourceId, InetAddress.getLoopbackAddress(), 1234);
    final TestingHighAvailabilityServices haServices = new TestingHighAvailabilityServices();
    final TimerService<AllocationID> timerService = mock(TimerService.class);
    final TaskSlotTable slotTable = new TaskSlotTable(Arrays.asList(mock(ResourceProfile.class)), timerService);
    final JobManagerTable jobManagerTable = new JobManagerTable();
    final JobLeaderService jobLeaderService = new JobLeaderService(tmLocation);
    final TestingFatalErrorHandler fatalErrorHandler = new TestingFatalErrorHandler();

    // leader retrieval for the resource manager and for the job
    final TestingLeaderRetrievalService rmLeaderRetrieval = new TestingLeaderRetrievalService();
    final TestingLeaderRetrievalService jmLeaderRetrieval = new TestingLeaderRetrievalService();
    haServices.setResourceManagerLeaderRetriever(rmLeaderRetrieval);
    haServices.setJobMasterLeaderRetriever(jobId, jmLeaderRetrieval);

    // resource manager gateway that accepts the task executor registration
    final String rmAddress = "rm";
    final UUID rmLeaderId = UUID.randomUUID();
    final ResourceManagerGateway rmGateway = mock(ResourceManagerGateway.class);
    when(rmGateway.registerTaskExecutor(
            eq(rmLeaderId), any(String.class), eq(resourceId), any(SlotReport.class), any(Time.class)))
        .thenReturn(FlinkCompletableFuture.<RegistrationResponse>completed(
            new TaskExecutorRegistrationSuccess(new InstanceID(), 1000L)));

    // job master gateway that accepts the task manager registration
    final String jmAddress = "jm";
    final UUID jmLeaderId = UUID.randomUUID();
    final ResourceID jmResourceId = new ResourceID(jmAddress);
    final JobMasterGateway jmGateway = mock(JobMasterGateway.class);
    when(jmGateway.registerTaskManager(
            any(String.class), eq(tmLocation), eq(jmLeaderId), any(Time.class)))
        .thenReturn(FlinkCompletableFuture.<RegistrationResponse>completed(
            new JMTMRegistrationSuccess(jmResourceId, 42)));
    when(jmGateway.getHostname()).thenReturn(jmAddress);

    rpc.registerGateway(rmAddress, rmGateway);
    rpc.registerGateway(jmAddress, jmGateway);

    final AllocationID allocationId = new AllocationID();
    final SlotID slotId = new SlotID(resourceId, 0);
    final SlotOffer expectedOffer = new SlotOffer(allocationId, 0, ResourceProfile.UNKNOWN);

    try {
        TaskExecutor taskManager = new TaskExecutor(
            tmConfiguration,
            tmLocation,
            rpc,
            mock(MemoryManager.class),
            mock(IOManager.class),
            mock(NetworkEnvironment.class),
            haServices,
            mock(HeartbeatServices.class, RETURNS_MOCKS),
            mock(MetricRegistry.class),
            mock(TaskManagerMetricGroup.class),
            mock(BroadcastVariableManager.class),
            mock(FileCache.class),
            slotTable,
            jobManagerTable,
            jobLeaderService,
            fatalErrorHandler);
        taskManager.start();

        // announce the resource manager leader to the task manager
        rmLeaderRetrieval.notifyListener(rmAddress, rmLeaderId);

        // reserve a slot on the task manager under the given allocation id
        TMSlotRequestReply reply = taskManager.requestSlot(slotId, jobId, allocationId, jmAddress, rmLeaderId);
        assertTrue(reply instanceof TMSlotRequestRegistered);

        // announce the job leader; this should trigger the slot offer
        jmLeaderRetrieval.notifyListener(jmAddress, jmLeaderId);

        // the job leader must have been offered the reserved slot
        verify(jmGateway).offerSlots(
            any(ResourceID.class),
            (Iterable<SlotOffer>) Matchers.argThat(contains(expectedOffer)),
            eq(jmLeaderId),
            any(Time.class));

        // surface any error that was reported concurrently
        fatalErrorHandler.rethrowError();
    } finally {
        rpc.stopService();
    }
}
Also used : Configuration(org.apache.flink.configuration.Configuration) TMSlotRequestReply(org.apache.flink.runtime.resourcemanager.messages.taskexecutor.TMSlotRequestReply) TestingLeaderRetrievalService(org.apache.flink.runtime.leaderelection.TestingLeaderRetrievalService) InstanceID(org.apache.flink.runtime.instance.InstanceID) Time(org.apache.flink.api.common.time.Time) JobMasterGateway(org.apache.flink.runtime.jobmaster.JobMasterGateway) ResourceManagerGateway(org.apache.flink.runtime.resourcemanager.ResourceManagerGateway) TestingHighAvailabilityServices(org.apache.flink.runtime.highavailability.TestingHighAvailabilityServices) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) BroadcastVariableManager(org.apache.flink.runtime.broadcast.BroadcastVariableManager) TestingSerialRpcService(org.apache.flink.runtime.rpc.TestingSerialRpcService) TMSlotRequestRegistered(org.apache.flink.runtime.resourcemanager.messages.taskexecutor.TMSlotRequestRegistered) UUID(java.util.UUID) TestingFatalErrorHandler(org.apache.flink.runtime.util.TestingFatalErrorHandler) HeartbeatServices(org.apache.flink.runtime.heartbeat.HeartbeatServices) JMTMRegistrationSuccess(org.apache.flink.runtime.jobmaster.JMTMRegistrationSuccess) SlotOffer(org.apache.flink.runtime.taskexecutor.slot.SlotOffer) IOManager(org.apache.flink.runtime.io.disk.iomanager.IOManager) TaskManagerLocation(org.apache.flink.runtime.taskmanager.TaskManagerLocation) AllocationID(org.apache.flink.runtime.clusterframework.types.AllocationID) MetricRegistry(org.apache.flink.runtime.metrics.MetricRegistry) TaskManagerMetricGroup(org.apache.flink.runtime.metrics.groups.TaskManagerMetricGroup) MemoryManager(org.apache.flink.runtime.memory.MemoryManager) FileCache(org.apache.flink.runtime.filecache.FileCache) SlotID(org.apache.flink.runtime.clusterframework.types.SlotID) TaskSlotTable(org.apache.flink.runtime.taskexecutor.slot.TaskSlotTable) 
NetworkEnvironment(org.apache.flink.runtime.io.network.NetworkEnvironment) JobID(org.apache.flink.api.common.JobID) Test(org.junit.Test)

Example 3 with SlotOffer

use of org.apache.flink.runtime.taskexecutor.slot.SlotOffer in project flink by apache.

the class TaskExecutorTest method testSubmitTaskBeforeAcceptSlot.

/**
 * Tests that the task executor handles a SubmitTask call that arrives before the response to
 * its slot offer. Two slots are allocated and offered; the job master accepts only the first
 * one, so the second slot must be freed and reported to the resource manager as available.
 */
@Test
public void testSubmitTaskBeforeAcceptSlot() throws Exception {
    final JobID jobId = new JobID();
    final TestingSerialRpcService rpc = new TestingSerialRpcService();

    // task manager side services
    final TaskManagerConfiguration tmConfiguration = TaskManagerConfiguration.fromConfiguration(new Configuration());
    final ResourceID resourceId = new ResourceID("foobar");
    final TaskManagerLocation tmLocation = new TaskManagerLocation(resourceId, InetAddress.getLoopbackAddress(), 1234);
    final TestingHighAvailabilityServices haServices = new TestingHighAvailabilityServices();
    final TimerService<AllocationID> timerService = mock(TimerService.class);
    final TaskSlotTable slotTable = new TaskSlotTable(
        Arrays.asList(mock(ResourceProfile.class), mock(ResourceProfile.class)),
        timerService);
    final JobManagerTable jobManagerTable = new JobManagerTable();
    final JobLeaderService jobLeaderService = new JobLeaderService(tmLocation);
    final TestingFatalErrorHandler fatalErrorHandler = new TestingFatalErrorHandler();

    // leaders are known up front in this test
    final String rmAddress = "rm";
    final UUID rmLeaderId = UUID.randomUUID();
    final String jmAddress = "jm";
    final UUID jmLeaderId = UUID.randomUUID();
    final LeaderRetrievalService rmLeaderRetrieval = new TestingLeaderRetrievalService(rmAddress, rmLeaderId);
    final LeaderRetrievalService jmLeaderRetrieval = new TestingLeaderRetrievalService(jmAddress, jmLeaderId);
    haServices.setResourceManagerLeaderRetriever(rmLeaderRetrieval);
    haServices.setJobMasterLeaderRetriever(jobId, jmLeaderRetrieval);

    // resource manager gateway that accepts the task executor registration
    final ResourceManagerGateway rmGateway = mock(ResourceManagerGateway.class);
    final InstanceID registrationId = new InstanceID();
    when(rmGateway.registerTaskExecutor(
            eq(rmLeaderId), any(String.class), eq(resourceId), any(SlotReport.class), any(Time.class)))
        .thenReturn(FlinkCompletableFuture.<RegistrationResponse>completed(
            new TaskExecutorRegistrationSuccess(registrationId, 1000L)));

    final ResourceID jmResourceId = new ResourceID(jmAddress);
    final AllocationID acceptedAllocation = new AllocationID();
    final AllocationID rejectedAllocation = new AllocationID();
    final SlotOffer acceptedOffer = new SlotOffer(acceptedAllocation, 0, ResourceProfile.UNKNOWN);

    // job master gateway that accepts the task manager registration
    final JobMasterGateway jmGateway = mock(JobMasterGateway.class);
    when(jmGateway.registerTaskManager(
            any(String.class), eq(tmLocation), eq(jmLeaderId), any(Time.class)))
        .thenReturn(FlinkCompletableFuture.<RegistrationResponse>completed(
            new JMTMRegistrationSuccess(jmResourceId, 42)));
    when(jmGateway.getHostname()).thenReturn(jmAddress);

    rpc.registerGateway(rmAddress, rmGateway);
    rpc.registerGateway(jmAddress, jmGateway);

    final LibraryCacheManager libraryCacheManager = mock(LibraryCacheManager.class);
    when(libraryCacheManager.getClassLoader(eq(jobId))).thenReturn(getClass().getClassLoader());

    // register the job manager connection before the task executor is created
    final JobManagerConnection connection = new JobManagerConnection(
        jobId,
        jmResourceId,
        jmGateway,
        jmLeaderId,
        mock(TaskManagerActions.class),
        mock(CheckpointResponder.class),
        libraryCacheManager,
        mock(ResultPartitionConsumableNotifier.class),
        mock(PartitionProducerStateChecker.class));
    jobManagerTable.put(jobId, connection);

    try {
        final TaskExecutor taskManager = new TaskExecutor(
            tmConfiguration,
            tmLocation,
            rpc,
            mock(MemoryManager.class),
            mock(IOManager.class),
            mock(NetworkEnvironment.class),
            haServices,
            mock(HeartbeatServices.class, RETURNS_MOCKS),
            mock(MetricRegistry.class),
            mock(TaskManagerMetricGroup.class),
            mock(BroadcastVariableManager.class),
            mock(FileCache.class),
            slotTable,
            jobManagerTable,
            jobLeaderService,
            fatalErrorHandler);
        taskManager.start();

        slotTable.allocateSlot(0, jobId, acceptedAllocation, Time.milliseconds(10000L));
        slotTable.allocateSlot(1, jobId, rejectedAllocation, Time.milliseconds(10000L));

        // deployment descriptor for a task that runs in the first slot
        final JobVertexID jobVertexId = new JobVertexID();
        JobInformation jobInfo = new JobInformation(
            jobId,
            name.getMethodName(),
            new SerializedValue<>(new ExecutionConfig()),
            new Configuration(),
            Collections.<BlobKey>emptyList(),
            Collections.<URL>emptyList());
        TaskInformation taskInfo = new TaskInformation(
            jobVertexId, "test task", 1, 1, TestInvokable.class.getName(), new Configuration());
        SerializedValue<JobInformation> serializedJobInfo = new SerializedValue<>(jobInfo);
        SerializedValue<TaskInformation> serializedTaskInfo = new SerializedValue<>(taskInfo);
        final TaskDeploymentDescriptor tdd = new TaskDeploymentDescriptor(
            serializedJobInfo,
            serializedTaskInfo,
            new ExecutionAttemptID(),
            acceptedAllocation,
            0,
            0,
            0,
            null,
            Collections.<ResultPartitionDeploymentDescriptor>emptyList(),
            Collections.<InputGateDeploymentDescriptor>emptyList());

        // the offer response stays pending so that the task can be submitted first
        CompletableFuture<Iterable<SlotOffer>> pendingOfferResponse = new FlinkCompletableFuture<>();
        when(jmGateway.offerSlots(
                any(ResourceID.class), any(Iterable.class), eq(jmLeaderId), any(Time.class)))
            .thenReturn(pendingOfferResponse);

        // The job has to be added after the TaskExecutor was started, because otherwise the
        // service has not been properly started. Adding it also offers the slots to the job master.
        jobLeaderService.addJob(jobId, jmAddress);
        verify(jmGateway).offerSlots(
            any(ResourceID.class), any(Iterable.class), eq(jmLeaderId), any(Time.class));

        // submit the task while the slot offer is still unacknowledged
        taskManager.submitTask(tdd, jmLeaderId);

        // now acknowledge the offer, accepting only the first slot
        pendingOfferResponse.complete(Collections.singleton(acceptedOffer));

        verify(rmGateway).notifySlotAvailable(
            eq(rmLeaderId), eq(registrationId), eq(new SlotID(resourceId, 1)));
        assertTrue(slotTable.existsActiveSlot(jobId, acceptedAllocation));
        assertFalse(slotTable.existsActiveSlot(jobId, rejectedAllocation));
        assertTrue(slotTable.isSlotFree(1));

        // surface any error that was reported concurrently
        fatalErrorHandler.rethrowError();
    } finally {
        rpc.stopService();
    }
}
Also used : Configuration(org.apache.flink.configuration.Configuration) TestingLeaderRetrievalService(org.apache.flink.runtime.leaderelection.TestingLeaderRetrievalService) InstanceID(org.apache.flink.runtime.instance.InstanceID) ResourceManagerGateway(org.apache.flink.runtime.resourcemanager.ResourceManagerGateway) TaskManagerActions(org.apache.flink.runtime.taskmanager.TaskManagerActions) TestingHighAvailabilityServices(org.apache.flink.runtime.highavailability.TestingHighAvailabilityServices) BroadcastVariableManager(org.apache.flink.runtime.broadcast.BroadcastVariableManager) TestingSerialRpcService(org.apache.flink.runtime.rpc.TestingSerialRpcService) PartitionProducerStateChecker(org.apache.flink.runtime.io.network.netty.PartitionProducerStateChecker) TaskDeploymentDescriptor(org.apache.flink.runtime.deployment.TaskDeploymentDescriptor) UUID(java.util.UUID) TestingFatalErrorHandler(org.apache.flink.runtime.util.TestingFatalErrorHandler) SlotOffer(org.apache.flink.runtime.taskexecutor.slot.SlotOffer) TaskInformation(org.apache.flink.runtime.executiongraph.TaskInformation) IOManager(org.apache.flink.runtime.io.disk.iomanager.IOManager) TaskManagerLocation(org.apache.flink.runtime.taskmanager.TaskManagerLocation) MetricRegistry(org.apache.flink.runtime.metrics.MetricRegistry) TaskManagerMetricGroup(org.apache.flink.runtime.metrics.groups.TaskManagerMetricGroup) FileCache(org.apache.flink.runtime.filecache.FileCache) TaskSlotTable(org.apache.flink.runtime.taskexecutor.slot.TaskSlotTable) LeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService) TestingLeaderRetrievalService(org.apache.flink.runtime.leaderelection.TestingLeaderRetrievalService) NetworkEnvironment(org.apache.flink.runtime.io.network.NetworkEnvironment) JobID(org.apache.flink.api.common.JobID) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) Time(org.apache.flink.api.common.time.Time) ExecutionConfig(org.apache.flink.api.common.ExecutionConfig) 
JobMasterGateway(org.apache.flink.runtime.jobmaster.JobMasterGateway) FlinkCompletableFuture(org.apache.flink.runtime.concurrent.impl.FlinkCompletableFuture) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) ResultPartitionConsumableNotifier(org.apache.flink.runtime.io.network.partition.ResultPartitionConsumableNotifier) HeartbeatServices(org.apache.flink.runtime.heartbeat.HeartbeatServices) JMTMRegistrationSuccess(org.apache.flink.runtime.jobmaster.JMTMRegistrationSuccess) JobInformation(org.apache.flink.runtime.executiongraph.JobInformation) ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) CheckpointResponder(org.apache.flink.runtime.taskmanager.CheckpointResponder) AllocationID(org.apache.flink.runtime.clusterframework.types.AllocationID) LibraryCacheManager(org.apache.flink.runtime.execution.librarycache.LibraryCacheManager) MemoryManager(org.apache.flink.runtime.memory.MemoryManager) SerializedValue(org.apache.flink.util.SerializedValue) SlotID(org.apache.flink.runtime.clusterframework.types.SlotID) Test(org.junit.Test)

Example 4 with SlotOffer

use of org.apache.flink.runtime.taskexecutor.slot.SlotOffer in project flink by apache.

the class JobMaster method offerSlots.

/**
 * Turns the slot offers of a registered TaskManager into {@link AllocatedSlot}s and forwards
 * them, paired with their offers, to the slot pool.
 *
 * @param taskManagerId resource id of the offering TaskManager
 * @param slots slots offered by the TaskManager
 * @param leaderId leader session id, checked with {@code validateLeaderSessionId}
 * @return the future returned by the slot pool's {@code offerSlots} call
 * @throws Exception if the TaskManager is not registered, or propagated from the leader
 *                   session id validation
 */
@RpcMethod
public Future<Iterable<SlotOffer>> offerSlots(final ResourceID taskManagerId, final Iterable<SlotOffer> slots, final UUID leaderId) throws Exception {
    validateLeaderSessionId(leaderId);

    final Tuple2<TaskManagerLocation, TaskExecutorGateway> registration = registeredTaskManagers.get(taskManagerId);
    if (registration == null) {
        throw new Exception("Unknown TaskManager " + taskManagerId);
    }

    final JobID jobId = jobGraph.getJobID();
    final TaskManagerLocation location = registration.f0;
    final ArrayList<Tuple2<AllocatedSlot, SlotOffer>> offeredSlots = new ArrayList<>();
    // a single gateway wrapper is shared by all slots of this offer
    final RpcTaskManagerGateway gateway = new RpcTaskManagerGateway(registration.f1, leaderId);

    for (SlotOffer offer : slots) {
        final AllocatedSlot allocatedSlot = new AllocatedSlot(
            offer.getAllocationId(),
            jobId,
            location,
            offer.getSlotIndex(),
            offer.getResourceProfile(),
            gateway);
        offeredSlots.add(new Tuple2<>(allocatedSlot, offer));
    }

    return slotPoolGateway.offerSlots(offeredSlots);
}
Also used : AllocatedSlot(org.apache.flink.runtime.jobmanager.slots.AllocatedSlot) SlotOffer(org.apache.flink.runtime.taskexecutor.slot.SlotOffer) TaskManagerLocation(org.apache.flink.runtime.taskmanager.TaskManagerLocation) Tuple2(org.apache.flink.api.java.tuple.Tuple2) ArrayList(java.util.ArrayList) TaskExecutorGateway(org.apache.flink.runtime.taskexecutor.TaskExecutorGateway) TimeoutException(java.util.concurrent.TimeoutException) CheckpointException(org.apache.flink.runtime.checkpoint.CheckpointException) LeaderIdMismatchException(org.apache.flink.runtime.highavailability.LeaderIdMismatchException) PartitionProducerDisposedException(org.apache.flink.runtime.jobmanager.PartitionProducerDisposedException) JobExecutionException(org.apache.flink.runtime.client.JobExecutionException) IOException(java.io.IOException) JobID(org.apache.flink.api.common.JobID) RpcMethod(org.apache.flink.runtime.rpc.RpcMethod)

Example 5 with SlotOffer

use of org.apache.flink.runtime.taskexecutor.slot.SlotOffer in project flink by apache.

the class TaskExecutorITCase method testSlotAllocation.

/**
 * Tests the slot allocation path between a real {@link StandaloneResourceManager} and a
 * {@link TaskExecutor}: after the job manager has registered at the resource manager and
 * requested a slot, the task executor is expected to offer the matching slot to the job
 * master gateway.
 */
@Test
public void testSlotAllocation() throws Exception {
    TestingFatalErrorHandler testingFatalErrorHandler = new TestingFatalErrorHandler();
    TestingHighAvailabilityServices testingHAServices = new TestingHighAvailabilityServices();
    final Configuration configuration = new Configuration();
    final ScheduledExecutorService scheduledExecutorService = new ScheduledThreadPoolExecutor(1);
    final ResourceID taskManagerResourceId = new ResourceID("foobar");
    final UUID rmLeaderId = UUID.randomUUID();
    final TestingLeaderElectionService rmLeaderElectionService = new TestingLeaderElectionService();
    final TestingLeaderRetrievalService rmLeaderRetrievalService = new TestingLeaderRetrievalService();
    final String rmAddress = "rm";
    final String jmAddress = "jm";
    final UUID jmLeaderId = UUID.randomUUID();
    final JobID jobId = new JobID();
    final ResourceProfile resourceProfile = new ResourceProfile(1.0, 1);

    testingHAServices.setResourceManagerLeaderElectionService(rmLeaderElectionService);
    testingHAServices.setResourceManagerLeaderRetriever(rmLeaderRetrievalService);
    testingHAServices.setJobMasterLeaderRetriever(jobId, new TestingLeaderRetrievalService(jmAddress, jmLeaderId));

    TestingSerialRpcService rpcService = new TestingSerialRpcService();
    ResourceManagerConfiguration resourceManagerConfiguration = new ResourceManagerConfiguration(Time.milliseconds(500L), Time.milliseconds(500L), Time.minutes(5L));
    SlotManagerFactory slotManagerFactory = new DefaultSlotManager.Factory();
    JobLeaderIdService jobLeaderIdService = new JobLeaderIdService(testingHAServices, rpcService.getScheduledExecutor(), resourceManagerConfiguration.getJobTimeout());
    MetricRegistry metricRegistry = mock(MetricRegistry.class);
    HeartbeatServices heartbeatServices = mock(HeartbeatServices.class, RETURNS_MOCKS);

    final TaskManagerConfiguration taskManagerConfiguration = TaskManagerConfiguration.fromConfiguration(configuration);
    final TaskManagerLocation taskManagerLocation = new TaskManagerLocation(taskManagerResourceId, InetAddress.getLocalHost(), 1234);
    final MemoryManager memoryManager = mock(MemoryManager.class);
    final IOManager ioManager = mock(IOManager.class);
    final NetworkEnvironment networkEnvironment = mock(NetworkEnvironment.class);
    final TaskManagerMetricGroup taskManagerMetricGroup = mock(TaskManagerMetricGroup.class);
    final BroadcastVariableManager broadcastVariableManager = mock(BroadcastVariableManager.class);
    final FileCache fileCache = mock(FileCache.class);
    // the timer service runs on scheduledExecutorService, which is shut down at the end of the test
    final TaskSlotTable taskSlotTable = new TaskSlotTable(Arrays.asList(resourceProfile), new TimerService<AllocationID>(scheduledExecutorService, 100L));
    final JobManagerTable jobManagerTable = new JobManagerTable();
    final JobLeaderService jobLeaderService = new JobLeaderService(taskManagerLocation);

    ResourceManager<ResourceID> resourceManager = new StandaloneResourceManager(rpcService, resourceManagerConfiguration, testingHAServices, slotManagerFactory, metricRegistry, jobLeaderIdService, testingFatalErrorHandler);
    TaskExecutor taskExecutor = new TaskExecutor(taskManagerConfiguration, taskManagerLocation, rpcService, memoryManager, ioManager, networkEnvironment, testingHAServices, heartbeatServices, metricRegistry, taskManagerMetricGroup, broadcastVariableManager, fileCache, taskSlotTable, jobManagerTable, jobLeaderService, testingFatalErrorHandler);

    JobMasterGateway jmGateway = mock(JobMasterGateway.class);
    when(jmGateway.registerTaskManager(any(String.class), any(TaskManagerLocation.class), eq(jmLeaderId), any(Time.class))).thenReturn(FlinkCompletableFuture.<RegistrationResponse>completed(new JMTMRegistrationSuccess(taskManagerResourceId, 1234)));
    when(jmGateway.getHostname()).thenReturn(jmAddress);

    rpcService.registerGateway(rmAddress, resourceManager.getSelf());
    rpcService.registerGateway(jmAddress, jmGateway);

    final AllocationID allocationId = new AllocationID();
    final SlotRequest slotRequest = new SlotRequest(jobId, allocationId, resourceProfile);
    final SlotOffer slotOffer = new SlotOffer(allocationId, 0, resourceProfile);

    try {
        resourceManager.start();
        taskExecutor.start();

        // notify the RM that it is the leader
        rmLeaderElectionService.isLeader(rmLeaderId);

        // notify the TM about the new RM leader
        rmLeaderRetrievalService.notifyListener(rmAddress, rmLeaderId);

        Future<RegistrationResponse> registrationResponseFuture = resourceManager.registerJobManager(rmLeaderId, jmLeaderId, jmAddress, jobId);
        RegistrationResponse registrationResponse = registrationResponseFuture.get();
        assertTrue(registrationResponse instanceof JobMasterRegistrationSuccess);

        resourceManager.requestSlot(jmLeaderId, rmLeaderId, slotRequest);

        // the requested slot must end up being offered to the job master
        verify(jmGateway).offerSlots(eq(taskManagerResourceId), (Iterable<SlotOffer>) argThat(Matchers.contains(slotOffer)), eq(jmLeaderId), any(Time.class));
    } finally {
        try {
            if (testingFatalErrorHandler.hasExceptionOccurred()) {
                testingFatalErrorHandler.rethrowError();
            }
        } finally {
            // Release the threads owned by the test services even if an error is rethrown;
            // otherwise the timer executor's worker thread outlives the test.
            rpcService.stopService();
            scheduledExecutorService.shutdownNow();
        }
    }
}
Also used : ResourceManagerConfiguration(org.apache.flink.runtime.resourcemanager.ResourceManagerConfiguration) Configuration(org.apache.flink.configuration.Configuration) TestingLeaderRetrievalService(org.apache.flink.runtime.leaderelection.TestingLeaderRetrievalService) ScheduledThreadPoolExecutor(java.util.concurrent.ScheduledThreadPoolExecutor) JobLeaderIdService(org.apache.flink.runtime.resourcemanager.JobLeaderIdService) SlotManagerFactory(org.apache.flink.runtime.resourcemanager.slotmanager.SlotManagerFactory) Time(org.apache.flink.api.common.time.Time) StandaloneResourceManager(org.apache.flink.runtime.resourcemanager.StandaloneResourceManager) JobMasterGateway(org.apache.flink.runtime.jobmaster.JobMasterGateway) SlotRequest(org.apache.flink.runtime.resourcemanager.SlotRequest) TestingHighAvailabilityServices(org.apache.flink.runtime.highavailability.TestingHighAvailabilityServices) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) BroadcastVariableManager(org.apache.flink.runtime.broadcast.BroadcastVariableManager) TestingSerialRpcService(org.apache.flink.runtime.rpc.TestingSerialRpcService) UUID(java.util.UUID) RegistrationResponse(org.apache.flink.runtime.registration.RegistrationResponse) TestingFatalErrorHandler(org.apache.flink.runtime.util.TestingFatalErrorHandler) HeartbeatServices(org.apache.flink.runtime.heartbeat.HeartbeatServices) ResourceProfile(org.apache.flink.runtime.clusterframework.types.ResourceProfile) JMTMRegistrationSuccess(org.apache.flink.runtime.jobmaster.JMTMRegistrationSuccess) ScheduledExecutorService(java.util.concurrent.ScheduledExecutorService) TestingLeaderElectionService(org.apache.flink.runtime.leaderelection.TestingLeaderElectionService) SlotOffer(org.apache.flink.runtime.taskexecutor.slot.SlotOffer) JobMasterRegistrationSuccess(org.apache.flink.runtime.jobmaster.JobMasterRegistrationSuccess) IOManager(org.apache.flink.runtime.io.disk.iomanager.IOManager) 
TaskManagerLocation(org.apache.flink.runtime.taskmanager.TaskManagerLocation) MetricRegistry(org.apache.flink.runtime.metrics.MetricRegistry) TaskManagerMetricGroup(org.apache.flink.runtime.metrics.groups.TaskManagerMetricGroup) AllocationID(org.apache.flink.runtime.clusterframework.types.AllocationID) ResourceManagerConfiguration(org.apache.flink.runtime.resourcemanager.ResourceManagerConfiguration) MemoryManager(org.apache.flink.runtime.memory.MemoryManager) FileCache(org.apache.flink.runtime.filecache.FileCache) SlotManagerFactory(org.apache.flink.runtime.resourcemanager.slotmanager.SlotManagerFactory) TaskSlotTable(org.apache.flink.runtime.taskexecutor.slot.TaskSlotTable) NetworkEnvironment(org.apache.flink.runtime.io.network.NetworkEnvironment) JobID(org.apache.flink.api.common.JobID) Test(org.junit.Test)

Aggregations

SlotOffer (org.apache.flink.runtime.taskexecutor.slot.SlotOffer)6 UUID (java.util.UUID)5 JobID (org.apache.flink.api.common.JobID)5 JobMasterGateway (org.apache.flink.runtime.jobmaster.JobMasterGateway)5 TaskManagerLocation (org.apache.flink.runtime.taskmanager.TaskManagerLocation)5 Time (org.apache.flink.api.common.time.Time)4 Configuration (org.apache.flink.configuration.Configuration)4 BroadcastVariableManager (org.apache.flink.runtime.broadcast.BroadcastVariableManager)4 AllocationID (org.apache.flink.runtime.clusterframework.types.AllocationID)4 ResourceID (org.apache.flink.runtime.clusterframework.types.ResourceID)4 FileCache (org.apache.flink.runtime.filecache.FileCache)4 HeartbeatServices (org.apache.flink.runtime.heartbeat.HeartbeatServices)4 TestingHighAvailabilityServices (org.apache.flink.runtime.highavailability.TestingHighAvailabilityServices)4 IOManager (org.apache.flink.runtime.io.disk.iomanager.IOManager)4 NetworkEnvironment (org.apache.flink.runtime.io.network.NetworkEnvironment)4 JMTMRegistrationSuccess (org.apache.flink.runtime.jobmaster.JMTMRegistrationSuccess)4 TestingLeaderRetrievalService (org.apache.flink.runtime.leaderelection.TestingLeaderRetrievalService)4 MemoryManager (org.apache.flink.runtime.memory.MemoryManager)4 MetricRegistry (org.apache.flink.runtime.metrics.MetricRegistry)4 TaskManagerMetricGroup (org.apache.flink.runtime.metrics.groups.TaskManagerMetricGroup)4