Search in sources :

Example 11 with TestingFatalErrorHandler

use of org.apache.flink.runtime.util.TestingFatalErrorHandler in project flink by apache.

the class JobMasterPartitionReleaseTest method testPartitionReleaseOrPromotionOnJobTermination.

private void testPartitionReleaseOrPromotionOnJobTermination(Function<TestSetup, CompletableFuture<Collection<ResultPartitionID>>> callSelector, ExecutionState finalExecutionState) throws Exception {
    final CompletableFuture<TaskDeploymentDescriptor> taskDeploymentDescriptorFuture = new CompletableFuture<>();
    final TestingTaskExecutorGateway testingTaskExecutorGateway = new TestingTaskExecutorGatewayBuilder().setSubmitTaskConsumer((tdd, ignored) -> {
        taskDeploymentDescriptorFuture.complete(tdd);
        return CompletableFuture.completedFuture(Acknowledge.get());
    }).createTestingTaskExecutorGateway();
    try (final TestSetup testSetup = new TestSetup(rpcService, testingFatalErrorHandler, testingTaskExecutorGateway)) {
        ResultPartitionID partitionID0 = new ResultPartitionID();
        ResultPartitionID partitionID1 = new ResultPartitionID();
        testSetup.getPartitionTracker().setGetAllTrackedPartitionsSupplier(() -> {
            ResultPartitionDeploymentDescriptor partitionDesc0 = AbstractPartitionTrackerTest.createResultPartitionDeploymentDescriptor(partitionID0, true);
            ResultPartitionDeploymentDescriptor partitionDesc1 = AbstractPartitionTrackerTest.createResultPartitionDeploymentDescriptor(partitionID1, false);
            return Arrays.asList(partitionDesc0, partitionDesc1);
        });
        final JobMasterGateway jobMasterGateway = testSetup.getJobMasterGateway();
        // update the execution state of the only execution to target state
        // this should trigger the job to finish
        final TaskDeploymentDescriptor taskDeploymentDescriptor = taskDeploymentDescriptorFuture.get();
        jobMasterGateway.updateTaskExecutionState(new TaskExecutionState(taskDeploymentDescriptor.getExecutionAttemptId(), finalExecutionState));
        assertThat(callSelector.apply(testSetup).get(), containsInAnyOrder(partitionID0, partitionID1));
    }
}
Also used : TestingTaskExecutorGateway(org.apache.flink.runtime.taskexecutor.TestingTaskExecutorGateway) Arrays(java.util.Arrays) TestingRpcService(org.apache.flink.runtime.rpc.TestingRpcService) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) JobMasterBuilder(org.apache.flink.runtime.jobmaster.utils.JobMasterBuilder) TaskExecutorGateway(org.apache.flink.runtime.taskexecutor.TaskExecutorGateway) ResultPartitionID(org.apache.flink.runtime.io.network.partition.ResultPartitionID) SettableLeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.SettableLeaderRetrievalService) TestingFatalErrorHandler(org.apache.flink.runtime.util.TestingFatalErrorHandler) After(org.junit.After) TestLogger(org.apache.flink.util.TestLogger) ClassRule(org.junit.ClassRule) BlobServerOptions(org.apache.flink.configuration.BlobServerOptions) AfterClass(org.junit.AfterClass) Collection(java.util.Collection) AbstractPartitionTrackerTest(org.apache.flink.runtime.io.network.partition.AbstractPartitionTrackerTest) Acknowledge(org.apache.flink.runtime.messages.Acknowledge) ResourceProfile(org.apache.flink.runtime.clusterframework.types.ResourceProfile) HeartbeatServices(org.apache.flink.runtime.heartbeat.HeartbeatServices) SlotOffer(org.apache.flink.runtime.taskexecutor.slot.SlotOffer) TestingUtils(org.apache.flink.testutils.TestingUtils) Matchers.containsInAnyOrder(org.hamcrest.Matchers.containsInAnyOrder) ResultPartitionDeploymentDescriptor(org.apache.flink.runtime.deployment.ResultPartitionDeploymentDescriptor) Matchers.equalTo(org.hamcrest.Matchers.equalTo) Time(org.apache.flink.api.common.time.Time) AllocationID(org.apache.flink.runtime.clusterframework.types.AllocationID) BeforeClass(org.junit.BeforeClass) CompletableFuture(java.util.concurrent.CompletableFuture) Function(java.util.function.Function) TaskDeploymentDescriptor(org.apache.flink.runtime.deployment.TaskDeploymentDescriptor) TestingJobMasterPartitionTracker(org.apache.flink.runtime.io.network.partition.TestingJobMasterPartitionTracker) JobGraphTestUtils(org.apache.flink.runtime.jobgraph.JobGraphTestUtils) FatalErrorHandler(org.apache.flink.runtime.rpc.FatalErrorHandler) MatcherAssert.assertThat(org.hamcrest.MatcherAssert.assertThat) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) Before(org.junit.Before) LocalUnresolvedTaskManagerLocation(org.apache.flink.runtime.taskmanager.LocalUnresolvedTaskManagerLocation) Configuration(org.apache.flink.configuration.Configuration) ExecutionState(org.apache.flink.runtime.execution.ExecutionState) Test(org.junit.Test) IOException(java.io.IOException) RpcUtils(org.apache.flink.runtime.rpc.RpcUtils) ExecutionException(java.util.concurrent.ExecutionException) JobID(org.apache.flink.api.common.JobID) StandaloneCheckpointRecoveryFactory(org.apache.flink.runtime.checkpoint.StandaloneCheckpointRecoveryFactory) TestingTaskExecutorGatewayBuilder(org.apache.flink.runtime.taskexecutor.TestingTaskExecutorGatewayBuilder) TaskExecutionState(org.apache.flink.runtime.taskmanager.TaskExecutionState) TestingHighAvailabilityServices(org.apache.flink.runtime.highavailability.TestingHighAvailabilityServices) Collections(java.util.Collections) TemporaryFolder(org.junit.rules.TemporaryFolder) CompletableFuture(java.util.concurrent.CompletableFuture) ResultPartitionDeploymentDescriptor(org.apache.flink.runtime.deployment.ResultPartitionDeploymentDescriptor) TaskDeploymentDescriptor(org.apache.flink.runtime.deployment.TaskDeploymentDescriptor) ResultPartitionID(org.apache.flink.runtime.io.network.partition.ResultPartitionID) TestingTaskExecutorGatewayBuilder(org.apache.flink.runtime.taskexecutor.TestingTaskExecutorGatewayBuilder) TestingTaskExecutorGateway(org.apache.flink.runtime.taskexecutor.TestingTaskExecutorGateway) TaskExecutionState(org.apache.flink.runtime.taskmanager.TaskExecutionState)

Example 12 with TestingFatalErrorHandler

use of org.apache.flink.runtime.util.TestingFatalErrorHandler in project flink by apache.

the class TaskExecutorITCase method testSlotAllocation.

@Test
public void testSlotAllocation() throws Exception {
    TestingFatalErrorHandler testingFatalErrorHandler = new TestingFatalErrorHandler();
    TestingHighAvailabilityServices testingHAServices = new TestingHighAvailabilityServices();
    final Configuration configuration = new Configuration();
    final ScheduledExecutorService scheduledExecutorService = new ScheduledThreadPoolExecutor(1);
    final ResourceID taskManagerResourceId = new ResourceID("foobar");
    final UUID rmLeaderId = UUID.randomUUID();
    final TestingLeaderElectionService rmLeaderElectionService = new TestingLeaderElectionService();
    final TestingLeaderRetrievalService rmLeaderRetrievalService = new TestingLeaderRetrievalService();
    final String rmAddress = "rm";
    final String jmAddress = "jm";
    final UUID jmLeaderId = UUID.randomUUID();
    final JobID jobId = new JobID();
    final ResourceProfile resourceProfile = new ResourceProfile(1.0, 1);
    testingHAServices.setResourceManagerLeaderElectionService(rmLeaderElectionService);
    testingHAServices.setResourceManagerLeaderRetriever(rmLeaderRetrievalService);
    testingHAServices.setJobMasterLeaderRetriever(jobId, new TestingLeaderRetrievalService(jmAddress, jmLeaderId));
    TestingSerialRpcService rpcService = new TestingSerialRpcService();
    ResourceManagerConfiguration resourceManagerConfiguration = new ResourceManagerConfiguration(Time.milliseconds(500L), Time.milliseconds(500L), Time.minutes(5L));
    SlotManagerFactory slotManagerFactory = new DefaultSlotManager.Factory();
    JobLeaderIdService jobLeaderIdService = new JobLeaderIdService(testingHAServices, rpcService.getScheduledExecutor(), resourceManagerConfiguration.getJobTimeout());
    MetricRegistry metricRegistry = mock(MetricRegistry.class);
    HeartbeatServices heartbeatServices = mock(HeartbeatServices.class, RETURNS_MOCKS);
    final TaskManagerConfiguration taskManagerConfiguration = TaskManagerConfiguration.fromConfiguration(configuration);
    final TaskManagerLocation taskManagerLocation = new TaskManagerLocation(taskManagerResourceId, InetAddress.getLocalHost(), 1234);
    final MemoryManager memoryManager = mock(MemoryManager.class);
    final IOManager ioManager = mock(IOManager.class);
    final NetworkEnvironment networkEnvironment = mock(NetworkEnvironment.class);
    final TaskManagerMetricGroup taskManagerMetricGroup = mock(TaskManagerMetricGroup.class);
    final BroadcastVariableManager broadcastVariableManager = mock(BroadcastVariableManager.class);
    final FileCache fileCache = mock(FileCache.class);
    final TaskSlotTable taskSlotTable = new TaskSlotTable(Arrays.asList(resourceProfile), new TimerService<AllocationID>(scheduledExecutorService, 100L));
    final JobManagerTable jobManagerTable = new JobManagerTable();
    final JobLeaderService jobLeaderService = new JobLeaderService(taskManagerLocation);
    ResourceManager<ResourceID> resourceManager = new StandaloneResourceManager(rpcService, resourceManagerConfiguration, testingHAServices, slotManagerFactory, metricRegistry, jobLeaderIdService, testingFatalErrorHandler);
    TaskExecutor taskExecutor = new TaskExecutor(taskManagerConfiguration, taskManagerLocation, rpcService, memoryManager, ioManager, networkEnvironment, testingHAServices, heartbeatServices, metricRegistry, taskManagerMetricGroup, broadcastVariableManager, fileCache, taskSlotTable, jobManagerTable, jobLeaderService, testingFatalErrorHandler);
    JobMasterGateway jmGateway = mock(JobMasterGateway.class);
    when(jmGateway.registerTaskManager(any(String.class), any(TaskManagerLocation.class), eq(jmLeaderId), any(Time.class))).thenReturn(FlinkCompletableFuture.<RegistrationResponse>completed(new JMTMRegistrationSuccess(taskManagerResourceId, 1234)));
    when(jmGateway.getHostname()).thenReturn(jmAddress);
    rpcService.registerGateway(rmAddress, resourceManager.getSelf());
    rpcService.registerGateway(jmAddress, jmGateway);
    final AllocationID allocationId = new AllocationID();
    final SlotRequest slotRequest = new SlotRequest(jobId, allocationId, resourceProfile);
    final SlotOffer slotOffer = new SlotOffer(allocationId, 0, resourceProfile);
    try {
        resourceManager.start();
        taskExecutor.start();
        // notify the RM that it is the leader
        rmLeaderElectionService.isLeader(rmLeaderId);
        // notify the TM about the new RM leader
        rmLeaderRetrievalService.notifyListener(rmAddress, rmLeaderId);
        Future<RegistrationResponse> registrationResponseFuture = resourceManager.registerJobManager(rmLeaderId, jmLeaderId, jmAddress, jobId);
        RegistrationResponse registrationResponse = registrationResponseFuture.get();
        assertTrue(registrationResponse instanceof JobMasterRegistrationSuccess);
        resourceManager.requestSlot(jmLeaderId, rmLeaderId, slotRequest);
        verify(jmGateway).offerSlots(eq(taskManagerResourceId), (Iterable<SlotOffer>) argThat(Matchers.contains(slotOffer)), eq(jmLeaderId), any(Time.class));
    } finally {
        if (testingFatalErrorHandler.hasExceptionOccurred()) {
            testingFatalErrorHandler.rethrowError();
        }
    }
}
Also used : ResourceManagerConfiguration(org.apache.flink.runtime.resourcemanager.ResourceManagerConfiguration) Configuration(org.apache.flink.configuration.Configuration) TestingLeaderRetrievalService(org.apache.flink.runtime.leaderelection.TestingLeaderRetrievalService) ScheduledThreadPoolExecutor(java.util.concurrent.ScheduledThreadPoolExecutor) JobLeaderIdService(org.apache.flink.runtime.resourcemanager.JobLeaderIdService) SlotManagerFactory(org.apache.flink.runtime.resourcemanager.slotmanager.SlotManagerFactory) Time(org.apache.flink.api.common.time.Time) StandaloneResourceManager(org.apache.flink.runtime.resourcemanager.StandaloneResourceManager) JobMasterGateway(org.apache.flink.runtime.jobmaster.JobMasterGateway) SlotRequest(org.apache.flink.runtime.resourcemanager.SlotRequest) TestingHighAvailabilityServices(org.apache.flink.runtime.highavailability.TestingHighAvailabilityServices) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) BroadcastVariableManager(org.apache.flink.runtime.broadcast.BroadcastVariableManager) TestingSerialRpcService(org.apache.flink.runtime.rpc.TestingSerialRpcService) UUID(java.util.UUID) RegistrationResponse(org.apache.flink.runtime.registration.RegistrationResponse) TestingFatalErrorHandler(org.apache.flink.runtime.util.TestingFatalErrorHandler) HeartbeatServices(org.apache.flink.runtime.heartbeat.HeartbeatServices) ResourceProfile(org.apache.flink.runtime.clusterframework.types.ResourceProfile) JMTMRegistrationSuccess(org.apache.flink.runtime.jobmaster.JMTMRegistrationSuccess) ScheduledExecutorService(java.util.concurrent.ScheduledExecutorService) TestingLeaderElectionService(org.apache.flink.runtime.leaderelection.TestingLeaderElectionService) SlotOffer(org.apache.flink.runtime.taskexecutor.slot.SlotOffer) JobMasterRegistrationSuccess(org.apache.flink.runtime.jobmaster.JobMasterRegistrationSuccess) IOManager(org.apache.flink.runtime.io.disk.iomanager.IOManager) TaskManagerLocation(org.apache.flink.runtime.taskmanager.TaskManagerLocation) MetricRegistry(org.apache.flink.runtime.metrics.MetricRegistry) TaskManagerMetricGroup(org.apache.flink.runtime.metrics.groups.TaskManagerMetricGroup) AllocationID(org.apache.flink.runtime.clusterframework.types.AllocationID) ResourceManagerConfiguration(org.apache.flink.runtime.resourcemanager.ResourceManagerConfiguration) MemoryManager(org.apache.flink.runtime.memory.MemoryManager) FileCache(org.apache.flink.runtime.filecache.FileCache) SlotManagerFactory(org.apache.flink.runtime.resourcemanager.slotmanager.SlotManagerFactory) TaskSlotTable(org.apache.flink.runtime.taskexecutor.slot.TaskSlotTable) NetworkEnvironment(org.apache.flink.runtime.io.network.NetworkEnvironment) JobID(org.apache.flink.api.common.JobID) Test(org.junit.Test)

Example 13 with TestingFatalErrorHandler

use of org.apache.flink.runtime.util.TestingFatalErrorHandler in project flink by apache.

the class TaskExecutorTest method testTaskSubmission.

/**
	 * Tests that we can submit a task to the TaskManager given that we've allocated a slot there.
	 */
@Test(timeout = 1000L)
public void testTaskSubmission() throws Exception {
    final Configuration configuration = new Configuration();
    final TestingSerialRpcService rpc = new TestingSerialRpcService();
    final TaskManagerConfiguration taskManagerConfiguration = TaskManagerConfiguration.fromConfiguration(configuration);
    final JobID jobId = new JobID();
    final AllocationID allocationId = new AllocationID();
    final UUID jobManagerLeaderId = UUID.randomUUID();
    final JobVertexID jobVertexId = new JobVertexID();
    JobInformation jobInformation = new JobInformation(jobId, name.getMethodName(), new SerializedValue<>(new ExecutionConfig()), new Configuration(), Collections.<BlobKey>emptyList(), Collections.<URL>emptyList());
    TaskInformation taskInformation = new TaskInformation(jobVertexId, "test task", 1, 1, TestInvokable.class.getName(), new Configuration());
    SerializedValue<JobInformation> serializedJobInformation = new SerializedValue<>(jobInformation);
    SerializedValue<TaskInformation> serializedJobVertexInformation = new SerializedValue<>(taskInformation);
    final TaskDeploymentDescriptor tdd = new TaskDeploymentDescriptor(serializedJobInformation, serializedJobVertexInformation, new ExecutionAttemptID(), allocationId, 0, 0, 0, null, Collections.<ResultPartitionDeploymentDescriptor>emptyList(), Collections.<InputGateDeploymentDescriptor>emptyList());
    final LibraryCacheManager libraryCacheManager = mock(LibraryCacheManager.class);
    when(libraryCacheManager.getClassLoader(eq(jobId))).thenReturn(getClass().getClassLoader());
    final JobManagerConnection jobManagerConnection = new JobManagerConnection(jobId, ResourceID.generate(), mock(JobMasterGateway.class), jobManagerLeaderId, mock(TaskManagerActions.class), mock(CheckpointResponder.class), libraryCacheManager, mock(ResultPartitionConsumableNotifier.class), mock(PartitionProducerStateChecker.class));
    final JobManagerTable jobManagerTable = new JobManagerTable();
    jobManagerTable.put(jobId, jobManagerConnection);
    final TaskSlotTable taskSlotTable = mock(TaskSlotTable.class);
    when(taskSlotTable.existsActiveSlot(eq(jobId), eq(allocationId))).thenReturn(true);
    when(taskSlotTable.addTask(any(Task.class))).thenReturn(true);
    final NetworkEnvironment networkEnvironment = mock(NetworkEnvironment.class);
    when(networkEnvironment.createKvStateTaskRegistry(eq(jobId), eq(jobVertexId))).thenReturn(mock(TaskKvStateRegistry.class));
    final TaskManagerMetricGroup taskManagerMetricGroup = mock(TaskManagerMetricGroup.class);
    when(taskManagerMetricGroup.addTaskForJob(any(JobID.class), anyString(), any(JobVertexID.class), any(ExecutionAttemptID.class), anyString(), anyInt(), anyInt())).thenReturn(mock(TaskMetricGroup.class));
    final HighAvailabilityServices haServices = mock(HighAvailabilityServices.class);
    when(haServices.getResourceManagerLeaderRetriever()).thenReturn(mock(LeaderRetrievalService.class));
    try {
        final TestingFatalErrorHandler testingFatalErrorHandler = new TestingFatalErrorHandler();
        TaskExecutor taskManager = new TaskExecutor(taskManagerConfiguration, mock(TaskManagerLocation.class), rpc, mock(MemoryManager.class), mock(IOManager.class), networkEnvironment, haServices, mock(HeartbeatServices.class, RETURNS_MOCKS), mock(MetricRegistry.class), taskManagerMetricGroup, mock(BroadcastVariableManager.class), mock(FileCache.class), taskSlotTable, jobManagerTable, mock(JobLeaderService.class), testingFatalErrorHandler);
        taskManager.start();
        taskManager.submitTask(tdd, jobManagerLeaderId);
        Future<Boolean> completionFuture = TestInvokable.completableFuture;
        completionFuture.get();
        // check if a concurrent error occurred
        testingFatalErrorHandler.rethrowError();
    } finally {
        rpc.stopService();
    }
}
Also used : Task(org.apache.flink.runtime.taskmanager.Task) Configuration(org.apache.flink.configuration.Configuration) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) TaskKvStateRegistry(org.apache.flink.runtime.query.TaskKvStateRegistry) ExecutionConfig(org.apache.flink.api.common.ExecutionConfig) JobMasterGateway(org.apache.flink.runtime.jobmaster.JobMasterGateway) TaskManagerActions(org.apache.flink.runtime.taskmanager.TaskManagerActions) BroadcastVariableManager(org.apache.flink.runtime.broadcast.BroadcastVariableManager) TestingSerialRpcService(org.apache.flink.runtime.rpc.TestingSerialRpcService) TaskDeploymentDescriptor(org.apache.flink.runtime.deployment.TaskDeploymentDescriptor) PartitionProducerStateChecker(org.apache.flink.runtime.io.network.netty.PartitionProducerStateChecker) UUID(java.util.UUID) ResultPartitionConsumableNotifier(org.apache.flink.runtime.io.network.partition.ResultPartitionConsumableNotifier) TestingFatalErrorHandler(org.apache.flink.runtime.util.TestingFatalErrorHandler) HeartbeatServices(org.apache.flink.runtime.heartbeat.HeartbeatServices) JobInformation(org.apache.flink.runtime.executiongraph.JobInformation) TaskInformation(org.apache.flink.runtime.executiongraph.TaskInformation) ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) IOManager(org.apache.flink.runtime.io.disk.iomanager.IOManager) CheckpointResponder(org.apache.flink.runtime.taskmanager.CheckpointResponder) TaskMetricGroup(org.apache.flink.runtime.metrics.groups.TaskMetricGroup) TaskManagerLocation(org.apache.flink.runtime.taskmanager.TaskManagerLocation) AllocationID(org.apache.flink.runtime.clusterframework.types.AllocationID) TaskManagerMetricGroup(org.apache.flink.runtime.metrics.groups.TaskManagerMetricGroup) MetricRegistry(org.apache.flink.runtime.metrics.MetricRegistry) LibraryCacheManager(org.apache.flink.runtime.execution.librarycache.LibraryCacheManager) SerializedValue(org.apache.flink.util.SerializedValue) MemoryManager(org.apache.flink.runtime.memory.MemoryManager) FileCache(org.apache.flink.runtime.filecache.FileCache) TaskSlotTable(org.apache.flink.runtime.taskexecutor.slot.TaskSlotTable) HighAvailabilityServices(org.apache.flink.runtime.highavailability.HighAvailabilityServices) TestingHighAvailabilityServices(org.apache.flink.runtime.highavailability.TestingHighAvailabilityServices) LeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService) TestingLeaderRetrievalService(org.apache.flink.runtime.leaderelection.TestingLeaderRetrievalService) NetworkEnvironment(org.apache.flink.runtime.io.network.NetworkEnvironment) JobID(org.apache.flink.api.common.JobID) Test(org.junit.Test)

Example 14 with TestingFatalErrorHandler

use of org.apache.flink.runtime.util.TestingFatalErrorHandler in project flink by apache.

the class TaskExecutorTest method testRejectAllocationRequestsForOutOfSyncSlots.

/**
	 * Tests that all allocation requests for slots are ignored if the slot has been reported as
	 * free by the TaskExecutor but this report hasn't been confirmed by the ResourceManager.
	 *
	 * This is essential for the correctness of the state of the ResourceManager.
	 */
@Ignore
@Test
public void testRejectAllocationRequestsForOutOfSyncSlots() throws Exception {
    final ResourceID resourceID = ResourceID.generate();
    final String address1 = "/resource/manager/address/one";
    final UUID leaderId = UUID.randomUUID();
    final JobID jobId = new JobID();
    final String jobManagerAddress = "foobar";
    final TestingSerialRpcService rpc = new TestingSerialRpcService();
    try {
        // register the mock resource manager gateways
        ResourceManagerGateway rmGateway1 = mock(ResourceManagerGateway.class);
        rpc.registerGateway(address1, rmGateway1);
        TestingLeaderRetrievalService testLeaderService = new TestingLeaderRetrievalService();
        TestingHighAvailabilityServices haServices = new TestingHighAvailabilityServices();
        haServices.setResourceManagerLeaderRetriever(testLeaderService);
        TaskManagerConfiguration taskManagerServicesConfiguration = mock(TaskManagerConfiguration.class);
        when(taskManagerServicesConfiguration.getNumberSlots()).thenReturn(1);
        TaskManagerLocation taskManagerLocation = mock(TaskManagerLocation.class);
        when(taskManagerLocation.getResourceID()).thenReturn(resourceID);
        final TestingFatalErrorHandler testingFatalErrorHandler = new TestingFatalErrorHandler();
        TaskExecutor taskManager = new TaskExecutor(taskManagerServicesConfiguration, taskManagerLocation, rpc, mock(MemoryManager.class), mock(IOManager.class), mock(NetworkEnvironment.class), haServices, mock(HeartbeatServices.class, RETURNS_MOCKS), mock(MetricRegistry.class), mock(TaskManagerMetricGroup.class), mock(BroadcastVariableManager.class), mock(FileCache.class), mock(TaskSlotTable.class), mock(JobManagerTable.class), mock(JobLeaderService.class), testingFatalErrorHandler);
        taskManager.start();
        String taskManagerAddress = taskManager.getAddress();
        // no connection initially, since there is no leader
        assertNull(taskManager.getResourceManagerConnection());
        // define a leader and see that a registration happens
        testLeaderService.notifyListener(address1, leaderId);
        verify(rmGateway1).registerTaskExecutor(eq(leaderId), eq(taskManagerAddress), eq(resourceID), any(SlotReport.class), any(Time.class));
        assertNotNull(taskManager.getResourceManagerConnection());
        // test that allocating a slot works
        final SlotID slotID = new SlotID(resourceID, 0);
        TMSlotRequestReply tmSlotRequestReply = taskManager.requestSlot(slotID, jobId, new AllocationID(), jobManagerAddress, leaderId);
        assertTrue(tmSlotRequestReply instanceof TMSlotRequestRegistered);
        // TODO: Figure out the concrete allocation behaviour between RM and TM. Maybe we don't need the SlotID...
        // test that we can't allocate slots which are blacklisted due to pending confirmation of the RM
        final SlotID unconfirmedFreeSlotID = new SlotID(resourceID, 1);
        TMSlotRequestReply tmSlotRequestReply2 = taskManager.requestSlot(unconfirmedFreeSlotID, jobId, new AllocationID(), jobManagerAddress, leaderId);
        assertTrue(tmSlotRequestReply2 instanceof TMSlotRequestRejected);
        // re-register
        verify(rmGateway1).registerTaskExecutor(eq(leaderId), eq(taskManagerAddress), eq(resourceID), any(SlotReport.class), any(Time.class));
        testLeaderService.notifyListener(address1, leaderId);
        // now we should be successful because the slots status has been synced
        // test that we can't allocate slots which are blacklisted due to pending confirmation of the RM
        TMSlotRequestReply tmSlotRequestReply3 = taskManager.requestSlot(unconfirmedFreeSlotID, jobId, new AllocationID(), jobManagerAddress, leaderId);
        assertTrue(tmSlotRequestReply3 instanceof TMSlotRequestRegistered);
        // check if a concurrent error occurred
        testingFatalErrorHandler.rethrowError();
    } finally {
        rpc.stopService();
    }
}
Also used : TMSlotRequestReply(org.apache.flink.runtime.resourcemanager.messages.taskexecutor.TMSlotRequestReply) TestingLeaderRetrievalService(org.apache.flink.runtime.leaderelection.TestingLeaderRetrievalService) Time(org.apache.flink.api.common.time.Time) ResourceManagerGateway(org.apache.flink.runtime.resourcemanager.ResourceManagerGateway) TMSlotRequestRejected(org.apache.flink.runtime.resourcemanager.messages.taskexecutor.TMSlotRequestRejected) TestingHighAvailabilityServices(org.apache.flink.runtime.highavailability.TestingHighAvailabilityServices) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) BroadcastVariableManager(org.apache.flink.runtime.broadcast.BroadcastVariableManager) TestingSerialRpcService(org.apache.flink.runtime.rpc.TestingSerialRpcService) TMSlotRequestRegistered(org.apache.flink.runtime.resourcemanager.messages.taskexecutor.TMSlotRequestRegistered) UUID(java.util.UUID) TestingFatalErrorHandler(org.apache.flink.runtime.util.TestingFatalErrorHandler) HeartbeatServices(org.apache.flink.runtime.heartbeat.HeartbeatServices) IOManager(org.apache.flink.runtime.io.disk.iomanager.IOManager) TaskManagerLocation(org.apache.flink.runtime.taskmanager.TaskManagerLocation) MetricRegistry(org.apache.flink.runtime.metrics.MetricRegistry) TaskManagerMetricGroup(org.apache.flink.runtime.metrics.groups.TaskManagerMetricGroup) AllocationID(org.apache.flink.runtime.clusterframework.types.AllocationID) MemoryManager(org.apache.flink.runtime.memory.MemoryManager) FileCache(org.apache.flink.runtime.filecache.FileCache) SlotID(org.apache.flink.runtime.clusterframework.types.SlotID) TaskSlotTable(org.apache.flink.runtime.taskexecutor.slot.TaskSlotTable) NetworkEnvironment(org.apache.flink.runtime.io.network.NetworkEnvironment) JobID(org.apache.flink.api.common.JobID) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 15 with TestingFatalErrorHandler

use of org.apache.flink.runtime.util.TestingFatalErrorHandler in project flink by apache.

the class JobMasterTest method testHeartbeatTimeoutWithTaskManager.

@Test
public void testHeartbeatTimeoutWithTaskManager() throws Exception {
    final TestingHighAvailabilityServices haServices = new TestingHighAvailabilityServices();
    final TestingLeaderRetrievalService rmLeaderRetrievalService = new TestingLeaderRetrievalService();
    haServices.setResourceManagerLeaderRetriever(rmLeaderRetrievalService);
    haServices.setCheckpointRecoveryFactory(mock(CheckpointRecoveryFactory.class));
    final TestingFatalErrorHandler testingFatalErrorHandler = new TestingFatalErrorHandler();
    final String jobManagerAddress = "jm";
    final UUID jmLeaderId = UUID.randomUUID();
    final ResourceID jmResourceId = new ResourceID(jobManagerAddress);
    final String taskManagerAddress = "tm";
    final ResourceID tmResourceId = new ResourceID(taskManagerAddress);
    final TaskManagerLocation taskManagerLocation = new TaskManagerLocation(tmResourceId, InetAddress.getLoopbackAddress(), 1234);
    final TaskExecutorGateway taskExecutorGateway = mock(TaskExecutorGateway.class);
    final TestingSerialRpcService rpc = new TestingSerialRpcService();
    rpc.registerGateway(taskManagerAddress, taskExecutorGateway);
    final long heartbeatInterval = 1L;
    final long heartbeatTimeout = 5L;
    final ScheduledExecutor scheduledExecutor = mock(ScheduledExecutor.class);
    final HeartbeatServices heartbeatServices = new TestingHeartbeatServices(heartbeatInterval, heartbeatTimeout, scheduledExecutor);
    final JobGraph jobGraph = new JobGraph();
    try {
        final JobMaster jobMaster = new JobMaster(jmResourceId, jobGraph, new Configuration(), rpc, haServices, heartbeatServices, Executors.newScheduledThreadPool(1), mock(BlobLibraryCacheManager.class), mock(RestartStrategyFactory.class), Time.of(10, TimeUnit.SECONDS), null, mock(OnCompletionActions.class), testingFatalErrorHandler, new FlinkUserCodeClassLoader(new URL[0]));
        // also start the heartbeat manager in job manager
        jobMaster.start(jmLeaderId);
        // register task manager will trigger monitoring heartbeat target, schedule heartbeat request in interval time
        jobMaster.registerTaskManager(taskManagerAddress, taskManagerLocation, jmLeaderId);
        ArgumentCaptor<Runnable> heartbeatRunnableCaptor = ArgumentCaptor.forClass(Runnable.class);
        verify(scheduledExecutor, times(1)).scheduleAtFixedRate(heartbeatRunnableCaptor.capture(), eq(0L), eq(heartbeatInterval), eq(TimeUnit.MILLISECONDS));
        Runnable heartbeatRunnable = heartbeatRunnableCaptor.getValue();
        ArgumentCaptor<Runnable> timeoutRunnableCaptor = ArgumentCaptor.forClass(Runnable.class);
        verify(scheduledExecutor).schedule(timeoutRunnableCaptor.capture(), eq(heartbeatTimeout), eq(TimeUnit.MILLISECONDS));
        Runnable timeoutRunnable = timeoutRunnableCaptor.getValue();
        // run the first heartbeat request
        heartbeatRunnable.run();
        verify(taskExecutorGateway, times(1)).heartbeatFromJobManager(eq(jmResourceId));
        // run the timeout runnable to simulate a heartbeat timeout
        timeoutRunnable.run();
        verify(taskExecutorGateway).disconnectJobManager(eq(jobGraph.getJobID()), any(TimeoutException.class));
        // check if a concurrent error occurred
        testingFatalErrorHandler.rethrowError();
    } finally {
        rpc.stopService();
    }
}
Also used : BlobLibraryCacheManager(org.apache.flink.runtime.execution.librarycache.BlobLibraryCacheManager) Configuration(org.apache.flink.configuration.Configuration) TestingLeaderRetrievalService(org.apache.flink.runtime.leaderelection.TestingLeaderRetrievalService) FlinkUserCodeClassLoader(org.apache.flink.runtime.execution.librarycache.FlinkUserCodeClassLoader) URL(java.net.URL) ScheduledExecutor(org.apache.flink.runtime.concurrent.ScheduledExecutor) TestingHighAvailabilityServices(org.apache.flink.runtime.highavailability.TestingHighAvailabilityServices) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) TestingSerialRpcService(org.apache.flink.runtime.rpc.TestingSerialRpcService) UUID(java.util.UUID) TimeoutException(java.util.concurrent.TimeoutException) TestingFatalErrorHandler(org.apache.flink.runtime.util.TestingFatalErrorHandler) HeartbeatServices(org.apache.flink.runtime.heartbeat.HeartbeatServices) TaskManagerLocation(org.apache.flink.runtime.taskmanager.TaskManagerLocation) TaskExecutorGateway(org.apache.flink.runtime.taskexecutor.TaskExecutorGateway) CheckpointRecoveryFactory(org.apache.flink.runtime.checkpoint.CheckpointRecoveryFactory) OnCompletionActions(org.apache.flink.runtime.jobmanager.OnCompletionActions) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) RestartStrategyFactory(org.apache.flink.runtime.executiongraph.restart.RestartStrategyFactory) PrepareForTest(org.powermock.core.classloader.annotations.PrepareForTest) Test(org.junit.Test)

Aggregations

TestingFatalErrorHandler (org.apache.flink.runtime.util.TestingFatalErrorHandler)24 Test (org.junit.Test)15 Configuration (org.apache.flink.configuration.Configuration)12 TestingHighAvailabilityServices (org.apache.flink.runtime.highavailability.TestingHighAvailabilityServices)11 Before (org.junit.Before)10 JobID (org.apache.flink.api.common.JobID)8 HeartbeatServices (org.apache.flink.runtime.heartbeat.HeartbeatServices)8 TestingLeaderElectionService (org.apache.flink.runtime.leaderelection.TestingLeaderElectionService)7 AllocationID (org.apache.flink.runtime.clusterframework.types.AllocationID)6 ResourceID (org.apache.flink.runtime.clusterframework.types.ResourceID)6 UUID (java.util.UUID)5 CompletableFuture (java.util.concurrent.CompletableFuture)5 Time (org.apache.flink.api.common.time.Time)5 SettableLeaderRetrievalService (org.apache.flink.runtime.leaderretrieval.SettableLeaderRetrievalService)5 TaskManagerLocation (org.apache.flink.runtime.taskmanager.TaskManagerLocation)5 StandaloneCheckpointRecoveryFactory (org.apache.flink.runtime.checkpoint.StandaloneCheckpointRecoveryFactory)4 ResourceProfile (org.apache.flink.runtime.clusterframework.types.ResourceProfile)4 TaskDeploymentDescriptor (org.apache.flink.runtime.deployment.TaskDeploymentDescriptor)4 JobGraph (org.apache.flink.runtime.jobgraph.JobGraph)4 TestingLeaderRetrievalService (org.apache.flink.runtime.leaderelection.TestingLeaderRetrievalService)4