Search in sources :

Example 11 with ResourceManagerId

use of org.apache.flink.runtime.resourcemanager.ResourceManagerId in project flink by apache.

the class TaskExecutorTest method testRemoveJobFromJobLeaderService.

/**
 * Verifies that the JobLeaderService drops a job as soon as the TaskExecutor no longer holds any
 * slot that is assigned to that job.
 *
 * <p>See FLINK-8504
 */
@Test
public void testRemoveJobFromJobLeaderService() throws Exception {
    final TaskSlotTable<Task> slotTable = TaskSlotUtils.createTaskSlotTable(1);
    final TaskExecutorLocalStateStoresManager stateStoresManager =
            createTaskExecutorLocalStateStoresManager();
    final TaskManagerServices services =
            new TaskManagerServicesBuilder()
                    .setUnresolvedTaskManagerLocation(unresolvedTaskManagerLocation)
                    .setTaskSlotTable(slotTable)
                    .setTaskStateManager(stateStoresManager)
                    .build();
    final TestingTaskExecutor taskExecutor = createTestingTaskExecutor(services);

    try {
        // ResourceManager stub that signals when the first slot report has been sent to it
        final TestingResourceManagerGateway rmGateway = new TestingResourceManagerGateway();
        final CompletableFuture<Void> firstSlotReportSeen = new CompletableFuture<>();
        rmGateway.setSendSlotReportFunction(
                ignoredReport -> {
                    firstSlotReportSeen.complete(null);
                    return CompletableFuture.completedFuture(Acknowledge.get());
                });

        final ResourceManagerId rmId = rmGateway.getFencingToken();
        rpc.registerGateway(rmGateway.getAddress(), rmGateway);
        resourceManagerLeaderRetriever.notifyListener(rmGateway.getAddress(), rmId.toUUID());

        // leader retrieval service for jobId that exposes its start and stop as futures
        final CompletableFuture<LeaderRetrievalListener> retrieverStarted =
                new CompletableFuture<>();
        final CompletableFuture<Void> retrieverStopped = new CompletableFuture<>();
        final StartStopNotifyingLeaderRetrievalService jobLeaderRetriever =
                new StartStopNotifyingLeaderRetrievalService(retrieverStarted, retrieverStopped);
        haServices.setJobMasterLeaderRetriever(jobId, jobLeaderRetriever);

        taskExecutor.start();
        taskExecutor.waitUntilStarted();

        final TaskExecutorGateway executorGateway =
                taskExecutor.getSelfGateway(TaskExecutorGateway.class);
        final SlotID requestedSlot = buildSlotID(0);
        final AllocationID allocation = new AllocationID();

        // nothing has been requested for jobId yet, so the job must still be unknown
        assertThat(retrieverStarted.isDone(), is(false));
        final JobLeaderService leaderService = services.getJobLeaderService();
        assertThat(leaderService.containsJob(jobId), is(false));

        // block until the initial slot report has been sent
        firstSlotReportSeen.get();

        requestSlot(
                executorGateway,
                jobId,
                allocation,
                requestedSlot,
                ResourceProfile.ZERO,
                "foobar",
                rmId);

        // block until the leader retrieval service for jobId has been started
        retrieverStarted.get();
        assertThat(leaderService.containsJob(jobId), is(true));

        executorGateway.freeSlot(allocation, new FlinkException("Test exception"), timeout).get();

        // block until the leader retrieval service for jobId has been stopped, because the job
        // should get removed once its only slot is freed
        retrieverStopped.get();
        assertThat(leaderService.containsJob(jobId), is(false));
    } finally {
        RpcUtils.terminateRpcEndpoint(taskExecutor, timeout);
    }
}
Also used : Task(org.apache.flink.runtime.taskmanager.Task) AllocationID(org.apache.flink.runtime.clusterframework.types.AllocationID) TaskExecutorLocalStateStoresManager(org.apache.flink.runtime.state.TaskExecutorLocalStateStoresManager) FlinkException(org.apache.flink.util.FlinkException) SlotID(org.apache.flink.runtime.clusterframework.types.SlotID) CompletableFuture(java.util.concurrent.CompletableFuture) ResourceManagerId(org.apache.flink.runtime.resourcemanager.ResourceManagerId) TestingResourceManagerGateway(org.apache.flink.runtime.resourcemanager.utils.TestingResourceManagerGateway) LeaderRetrievalListener(org.apache.flink.runtime.leaderretrieval.LeaderRetrievalListener) Test(org.junit.Test)

Example 12 with ResourceManagerId

use of org.apache.flink.runtime.resourcemanager.ResourceManagerId in project flink by apache.

the class TaskExecutorTest method testDynamicSlotAllocation.

/**
 * Requests a slot with a dynamic slot id and checks that the slot report created afterwards
 * lists the slots at index 0 and 1 with the default profile plus a slot at index 2 that carries
 * the requested profile, job id and allocation id.
 */
@Test
public void testDynamicSlotAllocation() throws Exception {
    final AllocationID allocationId = new AllocationID();
    try (TaskExecutorTestingContext context = createTaskExecutorTestingContext(2)) {
        context.start();

        final CompletableFuture<Tuple3<ResourceID, InstanceID, SlotReport>> firstSlotReport =
                new CompletableFuture<>();
        final ResourceManagerId rmId = createAndRegisterResourceManager(firstSlotReport);
        // block until the initial slot report future has been completed
        firstSlotReport.get();

        // the requested profile is the default profile merged with an extra 0.1 CPU cores
        final ResourceProfile requestedProfile =
                DEFAULT_RESOURCE_PROFILE.merge(
                        ResourceProfile.newBuilder().setCpuCores(0.1).build());

        final TaskExecutorGateway executorGateway =
                context.taskExecutor.getSelfGateway(TaskExecutorGateway.class);
        final SlotID dynamicSlotId = SlotID.getDynamicSlotID(ResourceID.generate());
        requestSlot(
                executorGateway,
                jobId,
                allocationId,
                dynamicSlotId,
                requestedProfile,
                context.jobMasterGateway.getAddress(),
                rmId);

        final ResourceID reportOwner = ResourceID.generate();
        final SlotReport actualReport = context.taskSlotTable.createSlotReport(reportOwner);

        final SlotStatus slotZero =
                new SlotStatus(new SlotID(reportOwner, 0), DEFAULT_RESOURCE_PROFILE);
        final SlotStatus slotOne =
                new SlotStatus(new SlotID(reportOwner, 1), DEFAULT_RESOURCE_PROFILE);
        final SlotStatus allocatedSlot =
                new SlotStatus(
                        new SlotID(reportOwner, 2), requestedProfile, jobId, allocationId);
        assertThat(actualReport, containsInAnyOrder(slotZero, slotOne, allocatedSlot));
    }
}
Also used : ResourceProfile(org.apache.flink.runtime.clusterframework.types.ResourceProfile) TaskSlotUtils.createTotalResourceProfile(org.apache.flink.runtime.taskexecutor.slot.TaskSlotUtils.createTotalResourceProfile) SlotID(org.apache.flink.runtime.clusterframework.types.SlotID) CompletableFuture(java.util.concurrent.CompletableFuture) ResourceManagerId(org.apache.flink.runtime.resourcemanager.ResourceManagerId) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) AllocationID(org.apache.flink.runtime.clusterframework.types.AllocationID) Tuple3(org.apache.flink.api.java.tuple.Tuple3) AllocatedSlotReport(org.apache.flink.runtime.jobmaster.AllocatedSlotReport) Test(org.junit.Test)

Example 13 with ResourceManagerId

use of org.apache.flink.runtime.resourcemanager.ResourceManagerId in project flink by apache.

the class TaskExecutorTest method runResourceManagerHeartbeatTest.

/**
 * Runs a ResourceManager heartbeat scenario against a freshly created TaskExecutor.
 *
 * <p>A testing ResourceManager gateway is registered that answers the first registration with
 * success and the second with a future that never completes. After the TaskExecutor has
 * registered, {@code heartbeatAction} is executed; the method then expects the TaskExecutor to
 * disconnect from the ResourceManager and to attempt a second registration, each within {@code
 * timeout}.
 *
 * @param heartbeatServices heartbeat services handed to the TaskExecutor under test
 * @param setupResourceManagerGateway hook applied to the testing ResourceManager gateway before
 *     it is registered with the RPC service
 * @param heartbeatAction action run after the first registration; receives the TaskExecutor
 *     gateway, the ResourceManager's resource id and the future that is completed when the
 *     TaskExecutor disconnects
 * @throws Exception if any step of the scenario fails or times out
 */
private void runResourceManagerHeartbeatTest(HeartbeatServices heartbeatServices, Consumer<TestingResourceManagerGateway> setupResourceManagerGateway, TriConsumerWithException<TaskExecutorGateway, ResourceID, CompletableFuture<ResourceID>, Exception> heartbeatAction) throws Exception {
    final String rmAddress = "rm";
    final ResourceID rmResourceId = new ResourceID(rmAddress);
    final ResourceManagerId rmLeaderId = ResourceManagerId.generate();
    TestingResourceManagerGateway rmGateway = new TestingResourceManagerGateway(rmLeaderId, rmResourceId, rmAddress, rmAddress);
    final TaskExecutorRegistrationSuccess registrationResponse = new TaskExecutorRegistrationSuccess(new InstanceID(), rmResourceId, new ClusterInformation("localhost", 1234));
    // first registration succeeds, the second one is answered with a never-completing future
    final Queue<CompletableFuture<RegistrationResponse>> registrationResponses = new ArrayDeque<>(2);
    registrationResponses.add(CompletableFuture.completedFuture(registrationResponse));
    registrationResponses.add(new CompletableFuture<>());
    final CompletableFuture<ResourceID> taskExecutorRegistrationFuture = new CompletableFuture<>();
    // counts down once per registration attempt; two attempts mean a reconnect happened
    final CountDownLatch registrationAttempts = new CountDownLatch(2);
    rmGateway.setRegisterTaskExecutorFunction(registration -> {
        taskExecutorRegistrationFuture.complete(registration.getResourceId());
        registrationAttempts.countDown();
        return registrationResponses.poll();
    });
    setupResourceManagerGateway.accept(rmGateway);
    final CompletableFuture<ResourceID> taskExecutorDisconnectFuture = new CompletableFuture<>();
    rmGateway.setDisconnectTaskExecutorConsumer(disconnectInfo -> taskExecutorDisconnectFuture.complete(disconnectInfo.f0));
    rpc.registerGateway(rmAddress, rmGateway);
    final TaskManagerServices taskManagerServices = new TaskManagerServicesBuilder().setUnresolvedTaskManagerLocation(unresolvedTaskManagerLocation).build();
    final TaskExecutor taskManager = createTaskExecutor(taskManagerServices, heartbeatServices);
    try {
        taskManager.start();
        final TaskExecutorGateway taskExecutorGateway = taskManager.getSelfGateway(TaskExecutorGateway.class);
        // define a leader and see that a registration happens
        resourceManagerLeaderRetriever.notifyListener(rmAddress, rmLeaderId.toUUID());
        // the first registration attempt must carry the resource id of this TaskExecutor
        assertThat(taskExecutorRegistrationFuture.get(), equalTo(unresolvedTaskManagerLocation.getResourceID()));
        heartbeatAction.accept(taskExecutorGateway, rmGateway.getOwnResourceId(), taskExecutorDisconnectFuture);
        // the heartbeat action is expected to make the TaskManager disconnect from the
        // ResourceManager
        assertThat(taskExecutorDisconnectFuture.get(timeout.toMilliseconds(), TimeUnit.MILLISECONDS), equalTo(unresolvedTaskManagerLocation.getResourceID()));
        // the timeout value is in milliseconds, so the unit passed to await must match it
        assertTrue("The TaskExecutor should try to reconnect to the RM", registrationAttempts.await(timeout.toMilliseconds(), TimeUnit.MILLISECONDS));
    } finally {
        RpcUtils.terminateRpcEndpoint(taskManager, timeout);
    }
}
Also used : InstanceID(org.apache.flink.runtime.instance.InstanceID) Matchers.containsString(org.hamcrest.Matchers.containsString) CountDownLatch(java.util.concurrent.CountDownLatch) ClusterInformation(org.apache.flink.runtime.entrypoint.ClusterInformation) ArrayDeque(java.util.ArrayDeque) CompletableFuture(java.util.concurrent.CompletableFuture) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) ResourceManagerId(org.apache.flink.runtime.resourcemanager.ResourceManagerId) TestingResourceManagerGateway(org.apache.flink.runtime.resourcemanager.utils.TestingResourceManagerGateway)

Example 14 with ResourceManagerId

use of org.apache.flink.runtime.resourcemanager.ResourceManagerId in project flink by apache.

the class JobMasterTest method testResourceManagerBecomesUnreachableTriggersDisconnect.

/**
 * Checks that a JobMaster whose heartbeats to the ResourceManager fail with a {@link
 * RecipientUnreachableException} disconnects from that ResourceManager and afterwards issues a
 * second registration attempt.
 */
@Test
public void testResourceManagerBecomesUnreachableTriggersDisconnect() throws Exception {
    final String rmAddress = "rm";
    final ResourceManagerId rmId = ResourceManagerId.generate();
    final ResourceID rmResourceId = new ResourceID(rmAddress);
    final TestingResourceManagerGateway rmGateway =
            new TestingResourceManagerGateway(rmId, rmResourceId, rmAddress, "localhost");

    final CompletableFuture<JobID> disconnectFuture = new CompletableFuture<>();
    // counts down once per registration attempt of the JobMaster
    final CountDownLatch registrationAttempts = new CountDownLatch(2);

    // the first registration is answered with success, the second one stays pending
    final Queue<CompletableFuture<RegistrationResponse>> pendingResponses = new ArrayDeque<>(2);
    pendingResponses.add(
            CompletableFuture.completedFuture(rmGateway.getJobMasterRegistrationSuccess()));
    pendingResponses.add(new CompletableFuture<>());

    rmGateway.setRegisterJobManagerFunction(
            (ignoredMasterId, ignoredResourceId, ignoredAddress, ignoredJobId) -> {
                registrationAttempts.countDown();
                return pendingResponses.poll();
            });
    rmGateway.setDisconnectJobManagerConsumer(
            disconnectInfo -> disconnectFuture.complete(disconnectInfo.f0));
    // every heartbeat sent to the ResourceManager fails as if the recipient were unreachable
    rmGateway.setJobMasterHeartbeatFunction(
            ignored ->
                    FutureUtils.completedExceptionally(
                            new RecipientUnreachableException(
                                    "sender", "recipient", "resource manager is unreachable")));
    rpcService.registerGateway(rmAddress, rmGateway);

    final JobMaster jobMaster =
            new JobMasterBuilder(jobGraph, rpcService)
                    .withJobMasterId(jobMasterId)
                    .withResourceId(jmResourceId)
                    .withConfiguration(configuration)
                    .withHighAvailabilityServices(haServices)
                    .withHeartbeatServices(heartbeatServices)
                    .createJobMaster();
    jobMaster.start();

    try {
        // announce the ResourceManager leader so that the JobMaster registers with it
        rmLeaderRetrievalService.notifyListener(rmAddress, rmId.toUUID());
        final JobMasterGateway selfGateway = jobMaster.getSelfGateway(JobMasterGateway.class);

        // keep sending ResourceManager heartbeats until the disconnect has been observed
        CommonTestUtils.waitUntilCondition(
                () -> {
                    selfGateway.heartbeatFromResourceManager(rmResourceId);
                    return disconnectFuture.isDone();
                },
                Deadline.fromNow(TimeUtils.toDuration(testingTimeout)),
                50L);

        // the disconnect must have been reported for the job of this JobMaster
        assertThat(disconnectFuture.join(), equalTo(jobGraph.getJobID()));

        // the JobMaster should try to reconnect to the RM
        registrationAttempts.await();
    } finally {
        RpcUtils.terminateRpcEndpoint(jobMaster, testingTimeout);
    }
}
Also used : CountDownLatch(java.util.concurrent.CountDownLatch) ArrayDeque(java.util.ArrayDeque) JobMasterBuilder(org.apache.flink.runtime.jobmaster.utils.JobMasterBuilder) CompletableFuture(java.util.concurrent.CompletableFuture) ResourceManagerId(org.apache.flink.runtime.resourcemanager.ResourceManagerId) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) TestingResourceManagerGateway(org.apache.flink.runtime.resourcemanager.utils.TestingResourceManagerGateway) RecipientUnreachableException(org.apache.flink.runtime.rpc.exceptions.RecipientUnreachableException) JobID(org.apache.flink.api.common.JobID) Test(org.junit.Test)

Example 15 with ResourceManagerId

use of org.apache.flink.runtime.resourcemanager.ResourceManagerId in project flink by apache.

the class AbstractFineGrainedSlotManagerITCase method testRequirementDeclaration.

/**
 * Declares a requirement for one slot and registers a task executor, in the order selected by
 * {@code scenario}, then checks that the task executor received the expected slot request and
 * that the slot is tracked under the allocation id of that request.
 *
 * @param scenario decides whether the task executor is registered before or after the resource
 *     requirements are processed
 * @throws Exception if the test body fails
 */
private void testRequirementDeclaration(RequirementDeclarationScenario scenario) throws Exception {
    final ResourceID taskExecutorResourceId = ResourceID.generate();
    final JobID requestedJobId = new JobID();
    final SlotID expectedSlotId = SlotID.getDynamicSlotID(taskExecutorResourceId);
    final String jobMasterAddress = "localhost";
    final ResourceRequirements declaredRequirements =
            ResourceRequirements.create(
                    requestedJobId,
                    jobMasterAddress,
                    Collections.singleton(
                            ResourceRequirement.create(DEFAULT_SLOT_RESOURCE_PROFILE, 1)));

    final CompletableFuture<
                    Tuple6<SlotID, JobID, AllocationID, ResourceProfile, String, ResourceManagerId>>
            slotRequestFuture = new CompletableFuture<>();

    // task executor stub that records the incoming slot request and acknowledges it
    final TaskExecutorGateway testingGateway =
            new TestingTaskExecutorGatewayBuilder()
                    .setRequestSlotFunction(
                            slotRequest -> {
                                slotRequestFuture.complete(slotRequest);
                                return CompletableFuture.completedFuture(Acknowledge.get());
                            })
                    .createTestingTaskExecutorGateway();
    final TaskExecutorConnection executorConnection =
            new TaskExecutorConnection(taskExecutorResourceId, testingGateway);

    new Context() {

        {
            runTest(
                    () -> {
                        if (scenario
                                == RequirementDeclarationScenario
                                        .TASK_EXECUTOR_REGISTRATION_BEFORE_REQUIREMENT_DECLARATION) {
                            runInMainThread(
                                    () ->
                                            getSlotManager()
                                                    .registerTaskManager(
                                                            executorConnection,
                                                            new SlotReport(),
                                                            DEFAULT_TOTAL_RESOURCE_PROFILE,
                                                            DEFAULT_SLOT_RESOURCE_PROFILE));
                        }

                        runInMainThread(
                                () ->
                                        getSlotManager()
                                                .processResourceRequirements(
                                                        declaredRequirements));

                        if (scenario
                                == RequirementDeclarationScenario
                                        .TASK_EXECUTOR_REGISTRATION_AFTER_REQUIREMENT_DECLARATION) {
                            runInMainThread(
                                    () ->
                                            getSlotManager()
                                                    .registerTaskManager(
                                                            executorConnection,
                                                            new SlotReport(),
                                                            DEFAULT_TOTAL_RESOURCE_PROFILE,
                                                            DEFAULT_SLOT_RESOURCE_PROFILE));
                        }

                        // the allocation id is not known up front, so the expected tuple reuses
                        // the one carried by the received request
                        final Tuple6<
                                        SlotID,
                                        JobID,
                                        AllocationID,
                                        ResourceProfile,
                                        String,
                                        ResourceManagerId>
                                receivedRequest = assertFutureCompleteAndReturn(slotRequestFuture);
                        final Tuple6<
                                        SlotID,
                                        JobID,
                                        AllocationID,
                                        ResourceProfile,
                                        String,
                                        ResourceManagerId>
                                expectedRequest =
                                        Tuple6.of(
                                                expectedSlotId,
                                                requestedJobId,
                                                assertFutureCompleteAndReturn(slotRequestFuture)
                                                        .f2,
                                                DEFAULT_SLOT_RESOURCE_PROFILE,
                                                jobMasterAddress,
                                                getResourceManagerId());
                        assertThat(receivedRequest, is(equalTo(expectedRequest)));

                        final TaskManagerSlotInformation trackedSlot =
                                getTaskManagerTracker()
                                        .getAllocatedOrPendingSlot(
                                                assertFutureCompleteAndReturn(slotRequestFuture)
                                                        .f2)
                                        .get();
                        assertEquals(
                                "The slot has not been allocated to the expected allocation id.",
                                assertFutureCompleteAndReturn(slotRequestFuture).f2,
                                trackedSlot.getAllocationId());
                    });
        }
    };
}
Also used : TestingTaskExecutorGateway(org.apache.flink.runtime.taskexecutor.TestingTaskExecutorGateway) Arrays(java.util.Arrays) WorkerResourceSpec(org.apache.flink.runtime.resourcemanager.WorkerResourceSpec) Tuple6(org.apache.flink.api.java.tuple.Tuple6) ResourceRequirement(org.apache.flink.runtime.slots.ResourceRequirement) CompletableFuture(java.util.concurrent.CompletableFuture) TaskExecutorGateway(org.apache.flink.runtime.taskexecutor.TaskExecutorGateway) ArrayList(java.util.ArrayList) Assert.assertThat(org.junit.Assert.assertThat) FunctionUtils(org.apache.flink.util.function.FunctionUtils) SlotID(org.apache.flink.runtime.clusterframework.types.SlotID) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) ResourceRequirements(org.apache.flink.runtime.slots.ResourceRequirements) Matchers.empty(org.hamcrest.Matchers.empty) Iterator(java.util.Iterator) ResourceManagerId(org.apache.flink.runtime.resourcemanager.ResourceManagerId) SystemExitTrackingSecurityManager(org.apache.flink.runtime.testutils.SystemExitTrackingSecurityManager) Test(org.junit.Test) Acknowledge(org.apache.flink.runtime.messages.Acknowledge) ResourceProfile(org.apache.flink.runtime.clusterframework.types.ResourceProfile) ArrayBlockingQueue(java.util.concurrent.ArrayBlockingQueue) List(java.util.List) JobID(org.apache.flink.api.common.JobID) TaskExecutorConnection(org.apache.flink.runtime.resourcemanager.registration.TaskExecutorConnection) Assert.assertFalse(org.junit.Assert.assertFalse) TestingTaskExecutorGatewayBuilder(org.apache.flink.runtime.taskexecutor.TestingTaskExecutorGatewayBuilder) Matchers.equalTo(org.hamcrest.Matchers.equalTo) Matchers.is(org.hamcrest.Matchers.is) SlotReport(org.apache.flink.runtime.taskexecutor.SlotReport) SlotAllocationException(org.apache.flink.runtime.taskexecutor.exceptions.SlotAllocationException) Collections(java.util.Collections) AllocationID(org.apache.flink.runtime.clusterframework.types.AllocationID) 
Assert.assertEquals(org.junit.Assert.assertEquals) SlotReport(org.apache.flink.runtime.taskexecutor.SlotReport) TestingTaskExecutorGateway(org.apache.flink.runtime.taskexecutor.TestingTaskExecutorGateway) TaskExecutorGateway(org.apache.flink.runtime.taskexecutor.TaskExecutorGateway) TestingTaskExecutorGatewayBuilder(org.apache.flink.runtime.taskexecutor.TestingTaskExecutorGatewayBuilder) ResourceRequirements(org.apache.flink.runtime.slots.ResourceRequirements) SlotID(org.apache.flink.runtime.clusterframework.types.SlotID) Tuple6(org.apache.flink.api.java.tuple.Tuple6) CompletableFuture(java.util.concurrent.CompletableFuture) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) JobID(org.apache.flink.api.common.JobID) TaskExecutorConnection(org.apache.flink.runtime.resourcemanager.registration.TaskExecutorConnection)

Aggregations

ResourceManagerId (org.apache.flink.runtime.resourcemanager.ResourceManagerId)19 Test (org.junit.Test)16 CompletableFuture (java.util.concurrent.CompletableFuture)15 ResourceID (org.apache.flink.runtime.clusterframework.types.ResourceID)14 SlotID (org.apache.flink.runtime.clusterframework.types.SlotID)12 JobID (org.apache.flink.api.common.JobID)11 AllocationID (org.apache.flink.runtime.clusterframework.types.AllocationID)11 ResourceProfile (org.apache.flink.runtime.clusterframework.types.ResourceProfile)10 Acknowledge (org.apache.flink.runtime.messages.Acknowledge)9 SlotReport (org.apache.flink.runtime.taskexecutor.SlotReport)9 Matchers.empty (org.hamcrest.Matchers.empty)9 Assert.assertThat (org.junit.Assert.assertThat)9 ArrayList (java.util.ArrayList)8 Collection (java.util.Collection)8 List (java.util.List)8 Tuple6 (org.apache.flink.api.java.tuple.Tuple6)8 TaskExecutorConnection (org.apache.flink.runtime.resourcemanager.registration.TaskExecutorConnection)8 ResourceRequirement (org.apache.flink.runtime.slots.ResourceRequirement)8 TestingTaskExecutorGatewayBuilder (org.apache.flink.runtime.taskexecutor.TestingTaskExecutorGatewayBuilder)8 Assert.assertFalse (org.junit.Assert.assertFalse)8