Search in sources :

Example 1 with ClusterInformation

use of org.apache.flink.runtime.entrypoint.ClusterInformation in project flink by apache.

the class DefaultDispatcherResourceManagerComponentFactory method create.

/**
 * Creates a {@link DispatcherResourceManagerComponent} by building the REST endpoint, the
 * resource manager service, the dispatcher runner and the two leader retrieval services.
 *
 * <p>If any step throws an {@code Exception}, every component that has already been assigned is
 * stopped or closed before the failure is rethrown wrapped in a {@code FlinkException}.
 *
 * @param configuration read for the REST server thread settings, the metric fetcher update
 *     interval and the async operation store duration, and handed on to the created components
 * @param resourceId passed to the resource manager service
 * @param ioExecutor passed to the resource manager service, the history server archivist, the
 *     partial dispatcher services and the dispatcher runner
 * @param rpcService used for the gateway retrievers and the hostname lookup, and handed on to
 *     the resource manager service and the dispatcher runner
 * @param highAvailabilityServices source of the leader retrieval and leader election services
 * @param blobServer passed to the REST endpoint and the dispatcher services; its port is put
 *     into the {@code ClusterInformation}
 * @param heartbeatServices passed to the resource manager service and the dispatcher services
 * @param metricRegistry passed to the resource manager service; also used for the job manager
 *     metric group and the metric query service gateway RPC address
 * @param executionGraphInfoStore passed to the partial dispatcher services
 * @param metricQueryServiceRetriever passed to the metric fetcher when one is configured
 * @param fatalErrorHandler passed to every created component
 * @return the component bundling the created services
 * @throws Exception a {@code FlinkException} wrapping the original failure if creation fails
 */
@Override
public DispatcherResourceManagerComponent create(Configuration configuration, ResourceID resourceId, Executor ioExecutor, RpcService rpcService, HighAvailabilityServices highAvailabilityServices, BlobServer blobServer, HeartbeatServices heartbeatServices, MetricRegistry metricRegistry, ExecutionGraphInfoStore executionGraphInfoStore, MetricQueryServiceRetriever metricQueryServiceRetriever, FatalErrorHandler fatalErrorHandler) throws Exception {
    // Declared outside the try block so that the catch block can clean up whatever had
    // already been assigned when a failure occurred.
    LeaderRetrievalService dispatcherLeaderRetrievalService = null;
    LeaderRetrievalService resourceManagerRetrievalService = null;
    WebMonitorEndpoint<?> webMonitorEndpoint = null;
    ResourceManagerService resourceManagerService = null;
    DispatcherRunner dispatcherRunner = null;
    try {
        dispatcherLeaderRetrievalService = highAvailabilityServices.getDispatcherLeaderRetriever();
        resourceManagerRetrievalService = highAvailabilityServices.getResourceManagerLeaderRetriever();
        // Both gateway retrievers are configured with identical retry strategy arguments.
        final LeaderGatewayRetriever<DispatcherGateway> dispatcherGatewayRetriever = new RpcGatewayRetriever<>(rpcService, DispatcherGateway.class, DispatcherId::fromUuid, new ExponentialBackoffRetryStrategy(12, Duration.ofMillis(10), Duration.ofMillis(50)));
        final LeaderGatewayRetriever<ResourceManagerGateway> resourceManagerGatewayRetriever = new RpcGatewayRetriever<>(rpcService, ResourceManagerGateway.class, ResourceManagerId::fromUuid, new ExponentialBackoffRetryStrategy(12, Duration.ofMillis(10), Duration.ofMillis(50)));
        // This executor is shared by the metric fetcher and the REST endpoint.
        final ScheduledExecutorService executor = WebMonitorEndpoint.createExecutorService(configuration.getInteger(RestOptions.SERVER_NUM_THREADS), configuration.getInteger(RestOptions.SERVER_THREAD_PRIORITY), "DispatcherRestEndpoint");
        final long updateInterval = configuration.getLong(MetricOptions.METRIC_FETCHER_UPDATE_INTERVAL);
        // An update interval of 0 selects VoidMetricFetcher.INSTANCE instead of a configured
        // MetricFetcherImpl.
        final MetricFetcher metricFetcher = updateInterval == 0 ? VoidMetricFetcher.INSTANCE : MetricFetcherImpl.fromConfiguration(configuration, metricQueryServiceRetriever, dispatcherGatewayRetriever, executor);
        webMonitorEndpoint = restEndpointFactory.createRestEndpoint(configuration, dispatcherGatewayRetriever, resourceManagerGatewayRetriever, blobServer, executor, metricFetcher, highAvailabilityServices.getClusterRestEndpointLeaderElectionService(), fatalErrorHandler);
        log.debug("Starting Dispatcher REST endpoint.");
        webMonitorEndpoint.start();
        // The hostname is reused for the ClusterInformation, the resource manager service and
        // the job manager metric group.
        final String hostname = RpcUtils.getHostname(rpcService);
        // The endpoint was started above, before its REST base URL is read here.
        resourceManagerService = ResourceManagerServiceImpl.create(resourceManagerFactory, configuration, resourceId, rpcService, highAvailabilityServices, heartbeatServices, fatalErrorHandler, new ClusterInformation(hostname, blobServer.getPort()), webMonitorEndpoint.getRestBaseUrl(), metricRegistry, hostname, ioExecutor);
        final HistoryServerArchivist historyServerArchivist = HistoryServerArchivist.createHistoryServerArchivist(configuration, webMonitorEndpoint, ioExecutor);
        final DispatcherOperationCaches dispatcherOperationCaches = new DispatcherOperationCaches(configuration.get(RestOptions.ASYNC_OPERATION_STORE_DURATION));
        // The job manager metric group is handed over as a supplier rather than as an instance.
        final PartialDispatcherServices partialDispatcherServices = new PartialDispatcherServices(configuration, highAvailabilityServices, resourceManagerGatewayRetriever, blobServer, heartbeatServices, () -> JobManagerMetricGroup.createJobManagerMetricGroup(metricRegistry, hostname), executionGraphInfoStore, fatalErrorHandler, historyServerArchivist, metricRegistry.getMetricQueryServiceGatewayRpcAddress(), ioExecutor, dispatcherOperationCaches);
        log.debug("Starting Dispatcher.");
        dispatcherRunner = dispatcherRunnerFactory.createDispatcherRunner(highAvailabilityServices.getDispatcherLeaderElectionService(), fatalErrorHandler, new HaServicesJobPersistenceComponentFactory(highAvailabilityServices), ioExecutor, rpcService, partialDispatcherServices);
        log.debug("Starting ResourceManagerService.");
        resourceManagerService.start();
        // The leader retrieval services are started last, each with its gateway retriever.
        resourceManagerRetrievalService.start(resourceManagerGatewayRetriever);
        dispatcherLeaderRetrievalService.start(dispatcherGatewayRetriever);
        return new DispatcherResourceManagerComponent(dispatcherRunner, resourceManagerService, dispatcherLeaderRetrievalService, resourceManagerRetrievalService, webMonitorEndpoint, fatalErrorHandler, dispatcherOperationCaches);
    } catch (Exception exception) {
        // clean up all started components
        // Failures raised during cleanup are merged with the original exception through
        // ExceptionUtils.firstOrSuppressed so that none of them is lost.
        if (dispatcherLeaderRetrievalService != null) {
            try {
                dispatcherLeaderRetrievalService.stop();
            } catch (Exception e) {
                exception = ExceptionUtils.firstOrSuppressed(e, exception);
            }
        }
        if (resourceManagerRetrievalService != null) {
            try {
                resourceManagerRetrievalService.stop();
            } catch (Exception e) {
                exception = ExceptionUtils.firstOrSuppressed(e, exception);
            }
        }
        // The remaining components are closed asynchronously; their termination futures are
        // collected and awaited together before the failure is rethrown.
        final Collection<CompletableFuture<Void>> terminationFutures = new ArrayList<>(3);
        if (webMonitorEndpoint != null) {
            terminationFutures.add(webMonitorEndpoint.closeAsync());
        }
        if (resourceManagerService != null) {
            terminationFutures.add(resourceManagerService.closeAsync());
        }
        if (dispatcherRunner != null) {
            terminationFutures.add(dispatcherRunner.closeAsync());
        }
        final FutureUtils.ConjunctFuture<Void> terminationFuture = FutureUtils.completeAll(terminationFutures);
        try {
            terminationFuture.get();
        } catch (Exception e) {
            exception = ExceptionUtils.firstOrSuppressed(e, exception);
        }
        throw new FlinkException("Could not create the DispatcherResourceManagerComponent.", exception);
    }
}
Also used : ExponentialBackoffRetryStrategy(org.apache.flink.util.concurrent.ExponentialBackoffRetryStrategy) DispatcherGateway(org.apache.flink.runtime.dispatcher.DispatcherGateway) ResourceManagerGateway(org.apache.flink.runtime.resourcemanager.ResourceManagerGateway) RpcGatewayRetriever(org.apache.flink.runtime.webmonitor.retriever.impl.RpcGatewayRetriever) DispatcherRunner(org.apache.flink.runtime.dispatcher.runner.DispatcherRunner) DispatcherOperationCaches(org.apache.flink.runtime.dispatcher.DispatcherOperationCaches) ResourceManagerId(org.apache.flink.runtime.resourcemanager.ResourceManagerId) HistoryServerArchivist(org.apache.flink.runtime.dispatcher.HistoryServerArchivist) HaServicesJobPersistenceComponentFactory(org.apache.flink.runtime.jobmanager.HaServicesJobPersistenceComponentFactory) ScheduledExecutorService(java.util.concurrent.ScheduledExecutorService) PartialDispatcherServices(org.apache.flink.runtime.dispatcher.PartialDispatcherServices) ResourceManagerService(org.apache.flink.runtime.resourcemanager.ResourceManagerService) DispatcherId(org.apache.flink.runtime.dispatcher.DispatcherId) ClusterInformation(org.apache.flink.runtime.entrypoint.ClusterInformation) VoidMetricFetcher(org.apache.flink.runtime.rest.handler.legacy.metrics.VoidMetricFetcher) MetricFetcher(org.apache.flink.runtime.rest.handler.legacy.metrics.MetricFetcher) FlinkException(org.apache.flink.util.FlinkException) FlinkException(org.apache.flink.util.FlinkException) LeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService) Collection(java.util.Collection)

Example 2 with ClusterInformation

use of org.apache.flink.runtime.entrypoint.ClusterInformation in project flink by apache.

the class StandaloneResourceManagerTest method createResourceManager.

/**
 * Builds a {@link TestingStandaloneResourceManager} on top of mock runtime services, starts it
 * and waits (bounded by {@code TIMEOUT}) until its started-future has completed.
 *
 * @param startupPeriod handed to the resource manager constructor
 * @param slotManager slot manager used by the mock runtime services
 * @return the started resource manager
 * @throws Exception if starting fails or the started-future does not complete within the timeout
 */
private TestingStandaloneResourceManager createResourceManager(Time startupPeriod, SlotManager slotManager) throws Exception {
    final MockResourceManagerRuntimeServices runtimeServices =
            new MockResourceManagerRuntimeServices(
                    RPC_SERVICE.getTestingRpcService(), TIMEOUT, slotManager);
    final ClusterInformation clusterInformation = new ClusterInformation("localhost", 1234);

    final TestingStandaloneResourceManager resourceManager =
            new TestingStandaloneResourceManager(
                    runtimeServices.rpcService,
                    UUID.randomUUID(),
                    ResourceID.generate(),
                    runtimeServices.heartbeatServices,
                    runtimeServices.slotManager,
                    runtimeServices.jobLeaderIdService,
                    clusterInformation,
                    fatalErrorHandler,
                    UnregisteredMetricGroups.createUnregisteredResourceManagerMetricGroup(),
                    startupPeriod);

    resourceManager.start();
    // block until the resource manager reports that it has started
    resourceManager.getStartedFuture().get(TIMEOUT.getSize(), TIMEOUT.getUnit());
    return resourceManager;
}
Also used : MockResourceManagerRuntimeServices(org.apache.flink.runtime.resourcemanager.utils.MockResourceManagerRuntimeServices) ClusterInformation(org.apache.flink.runtime.entrypoint.ClusterInformation)

Example 3 with ClusterInformation

use of org.apache.flink.runtime.entrypoint.ClusterInformation in project flink by apache.

the class TaskExecutorPartitionLifecycleTest method internalTestPartitionRelease.

/**
 * Runs a single task that produces one blocking result partition on a testing task executor,
 * checks that the partition is tracked while the task is running, and then hands control to
 * {@code testAction} once the task has finished.
 *
 * @param partitionTracker tracker handed to the testing task executor
 * @param shuffleEnvironment shuffle environment installed in the task manager services; after
 *     the task executor has been terminated it must no longer report partitions occupying
 *     local resources
 * @param startTrackingFuture future expected to complete with the task's result partition ID
 *     while the task is still running
 * @param testAction invoked with the job ID, the partition descriptor, the task executor and
 *     its gateway after the task has reached {@code FINISHED}
 * @throws Exception if any step of the scenario fails, or if the calling thread is interrupted
 *     while polling for the slot request to succeed
 */
private void internalTestPartitionRelease(TaskExecutorPartitionTracker partitionTracker, ShuffleEnvironment<?, ?> shuffleEnvironment, CompletableFuture<ResultPartitionID> startTrackingFuture, TestAction testAction) throws Exception {
    final ResultPartitionDeploymentDescriptor taskResultPartitionDescriptor = PartitionTestUtils.createPartitionDeploymentDescriptor(ResultPartitionType.BLOCKING);
    final ExecutionAttemptID eid1 = taskResultPartitionDescriptor.getShuffleDescriptor().getResultPartitionID().getProducerId();
    final TaskDeploymentDescriptor taskDeploymentDescriptor = TaskExecutorSubmissionTest.createTaskDeploymentDescriptor(jobId, "job", eid1, new SerializedValue<>(new ExecutionConfig()), "Sender", 1, 0, 1, 0, new Configuration(), new Configuration(), TestingInvokable.class.getName(), Collections.singletonList(taskResultPartitionDescriptor), Collections.emptyList(), Collections.emptyList(), Collections.emptyList());
    final TaskSlotTable<Task> taskSlotTable = createTaskSlotTable();
    final TaskExecutorLocalStateStoresManager localStateStoresManager = new TaskExecutorLocalStateStoresManager(false, Reference.owned(new File[] { tmp.newFolder() }), Executors.directExecutor());
    final TaskManagerServices taskManagerServices = new TaskManagerServicesBuilder().setTaskSlotTable(taskSlotTable).setTaskStateManager(localStateStoresManager).setShuffleEnvironment(shuffleEnvironment).build();
    final CompletableFuture<Void> taskFinishedFuture = new CompletableFuture<>();
    final OneShotLatch slotOfferedLatch = new OneShotLatch();
    // The job master gateway accepts every registration and slot offer, and completes
    // taskFinishedFuture as soon as the task reports the FINISHED state.
    final TestingJobMasterGateway jobMasterGateway = new TestingJobMasterGatewayBuilder().setRegisterTaskManagerFunction((ignoredJobId, ignoredTaskManagerRegistrationInformation) -> CompletableFuture.completedFuture(new JMTMRegistrationSuccess(ResourceID.generate()))).setOfferSlotsFunction((resourceID, slotOffers) -> {
        slotOfferedLatch.trigger();
        return CompletableFuture.completedFuture(slotOffers);
    }).setUpdateTaskExecutionStateFunction(taskExecutionState -> {
        if (taskExecutionState.getExecutionState() == ExecutionState.FINISHED) {
            taskFinishedFuture.complete(null);
        }
        return CompletableFuture.completedFuture(Acknowledge.get());
    }).build();
    final TestingTaskExecutor taskExecutor = createTestingTaskExecutor(taskManagerServices, partitionTracker);
    final CompletableFuture<SlotReport> initialSlotReportFuture = new CompletableFuture<>();
    // The resource manager gateway captures the first slot report and accepts every task
    // executor registration.
    final TestingResourceManagerGateway testingResourceManagerGateway = new TestingResourceManagerGateway();
    testingResourceManagerGateway.setSendSlotReportFunction(resourceIDInstanceIDSlotReportTuple3 -> {
        initialSlotReportFuture.complete(resourceIDInstanceIDSlotReportTuple3.f2);
        return CompletableFuture.completedFuture(Acknowledge.get());
    });
    testingResourceManagerGateway.setRegisterTaskExecutorFunction(input -> CompletableFuture.completedFuture(new TaskExecutorRegistrationSuccess(new InstanceID(), testingResourceManagerGateway.getOwnResourceId(), new ClusterInformation("blobServerHost", 55555))));
    try {
        taskExecutor.start();
        taskExecutor.waitUntilStarted();
        final TaskExecutorGateway taskExecutorGateway = taskExecutor.getSelfGateway(TaskExecutorGateway.class);
        final String jobMasterAddress = "jm";
        rpc.registerGateway(jobMasterAddress, jobMasterGateway);
        rpc.registerGateway(testingResourceManagerGateway.getAddress(), testingResourceManagerGateway);
        // inform the task manager about the job leader
        taskManagerServices.getJobLeaderService().addJob(jobId, jobMasterAddress);
        jobManagerLeaderRetriever.notifyListener(jobMasterAddress, UUID.randomUUID());
        resourceManagerLeaderRetriever.notifyListener(testingResourceManagerGateway.getAddress(), testingResourceManagerGateway.getFencingToken().toUUID());
        final Optional<SlotStatus> slotStatusOptional = StreamSupport.stream(initialSlotReportFuture.get().spliterator(), false).findAny();
        assertTrue(slotStatusOptional.isPresent());
        final SlotStatus slotStatus = slotStatusOptional.get();
        while (true) {
            try {
                taskExecutorGateway.requestSlot(slotStatus.getSlotID(), jobId, taskDeploymentDescriptor.getAllocationId(), ResourceProfile.ZERO, jobMasterAddress, testingResourceManagerGateway.getFencingToken(), timeout).get();
                break;
            } catch (InterruptedException e) {
                // an interrupt asks the test to stop (e.g. a test timeout); it does not mean
                // that the RM connection is still pending, so propagate it instead of retrying
                throw e;
            } catch (Exception e) {
                // the proper establishment of the RM connection is tracked
                // asynchronously, so we have to poll here until it went through
                // until then, slot requests will fail with an exception
                Thread.sleep(50);
            }
        }
        TestingInvokable.sync = new BlockerSync();
        // Wait till the slot has been successfully offered before submitting the task.
        // This ensures TM has been successfully registered to JM.
        slotOfferedLatch.await();
        taskExecutorGateway.submitTask(taskDeploymentDescriptor, jobMasterGateway.getFencingToken(), timeout).get();
        TestingInvokable.sync.awaitBlocker();
        // the task is still running => the partition is in progress and should be tracked
        assertThat(startTrackingFuture.get(), equalTo(taskResultPartitionDescriptor.getShuffleDescriptor().getResultPartitionID()));
        TestingInvokable.sync.releaseBlocker();
        taskFinishedFuture.get(timeout.getSize(), timeout.getUnit());
        testAction.accept(jobId, taskResultPartitionDescriptor, taskExecutor, taskExecutorGateway);
    } finally {
        RpcUtils.terminateRpcEndpoint(taskExecutor, timeout);
    }
    // the shutdown of the backing shuffle environment releases all partitions
    // the book-keeping is not aware of this
    assertTrue(shuffleEnvironment.getPartitionsOccupyingLocalResources().isEmpty());
}
Also used : CoreMatchers.is(org.hamcrest.CoreMatchers.is) TestingRpcService(org.apache.flink.runtime.rpc.TestingRpcService) ShuffleEnvironment(org.apache.flink.runtime.shuffle.ShuffleEnvironment) InetAddress(java.net.InetAddress) ResultPartitionID(org.apache.flink.runtime.io.network.partition.ResultPartitionID) SettableLeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.SettableLeaderRetrievalService) TestingFatalErrorHandler(org.apache.flink.runtime.util.TestingFatalErrorHandler) TaskExecutorPartitionTracker(org.apache.flink.runtime.io.network.partition.TaskExecutorPartitionTracker) After(org.junit.After) TestLogger(org.apache.flink.util.TestLogger) TestingJobMasterGatewayBuilder(org.apache.flink.runtime.jobmaster.utils.TestingJobMasterGatewayBuilder) ClassRule(org.junit.ClassRule) AfterClass(org.junit.AfterClass) Collection(java.util.Collection) AbstractInvokable(org.apache.flink.runtime.jobgraph.tasks.AbstractInvokable) NoOpTaskExecutorBlobService(org.apache.flink.runtime.blob.NoOpTaskExecutorBlobService) TestingTaskExecutorPartitionTracker(org.apache.flink.runtime.io.network.partition.TestingTaskExecutorPartitionTracker) UUID(java.util.UUID) NettyShuffleEnvironment(org.apache.flink.runtime.io.network.NettyShuffleEnvironment) IntermediateDataSetID(org.apache.flink.runtime.jobgraph.IntermediateDataSetID) Acknowledge(org.apache.flink.runtime.messages.Acknowledge) ResourceProfile(org.apache.flink.runtime.clusterframework.types.ResourceProfile) HeartbeatServices(org.apache.flink.runtime.heartbeat.HeartbeatServices) TaskExecutorPartitionInfo(org.apache.flink.runtime.io.network.partition.TaskExecutorPartitionInfo) SerializedValue(org.apache.flink.util.SerializedValue) ResultPartitionDeploymentDescriptor(org.apache.flink.runtime.deployment.ResultPartitionDeploymentDescriptor) Matchers.equalTo(org.hamcrest.Matchers.equalTo) ExecutionConfig(org.apache.flink.api.common.ExecutionConfig) Optional(java.util.Optional) 
ResultPartitionManager(org.apache.flink.runtime.io.network.partition.ResultPartitionManager) Time(org.apache.flink.api.common.time.Time) Environment(org.apache.flink.runtime.execution.Environment) OneShotLatch(org.apache.flink.core.testutils.OneShotLatch) TaskExecutorPartitionTrackerImpl(org.apache.flink.runtime.io.network.partition.TaskExecutorPartitionTrackerImpl) NoOpTaskManagerActions(org.apache.flink.runtime.taskmanager.NoOpTaskManagerActions) TaskExecutorLocalStateStoresManager(org.apache.flink.runtime.state.TaskExecutorLocalStateStoresManager) BeforeClass(org.junit.BeforeClass) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) ResultPartitionType(org.apache.flink.runtime.io.network.partition.ResultPartitionType) JMTMRegistrationSuccess(org.apache.flink.runtime.jobmaster.JMTMRegistrationSuccess) CompletableFuture(java.util.concurrent.CompletableFuture) TestingJobMasterGateway(org.apache.flink.runtime.jobmaster.utils.TestingJobMasterGateway) TaskDeploymentDescriptor(org.apache.flink.runtime.deployment.TaskDeploymentDescriptor) JobMasterGateway(org.apache.flink.runtime.jobmaster.JobMasterGateway) BlockerSync(org.apache.flink.core.testutils.BlockerSync) TaskSlotTable(org.apache.flink.runtime.taskexecutor.slot.TaskSlotTable) ExternalResourceInfoProvider(org.apache.flink.runtime.externalresource.ExternalResourceInfoProvider) ClusterInformation(org.apache.flink.runtime.entrypoint.ClusterInformation) TriConsumer(org.apache.flink.util.function.TriConsumer) StreamSupport(java.util.stream.StreamSupport) MatcherAssert.assertThat(org.hamcrest.MatcherAssert.assertThat) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) TestFileUtils(org.apache.flink.testutils.TestFileUtils) Before(org.junit.Before) TaskSlotUtils(org.apache.flink.runtime.taskexecutor.slot.TaskSlotUtils) TestExecutorResource(org.apache.flink.testutils.executor.TestExecutorResource) NettyShuffleEnvironmentBuilder(org.apache.flink.runtime.io.network.NettyShuffleEnvironmentBuilder) 
CoreMatchers.hasItems(org.hamcrest.CoreMatchers.hasItems) Configuration(org.apache.flink.configuration.Configuration) ExecutionState(org.apache.flink.runtime.execution.ExecutionState) Assert.assertTrue(org.junit.Assert.assertTrue) Test(org.junit.Test) IOException(java.io.IOException) InstanceID(org.apache.flink.runtime.instance.InstanceID) Reference(org.apache.flink.util.Reference) RpcUtils(org.apache.flink.runtime.rpc.RpcUtils) File(java.io.File) ExecutionException(java.util.concurrent.ExecutionException) TimeUnit(java.util.concurrent.TimeUnit) Executors(org.apache.flink.util.concurrent.Executors) ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) TestingResourceManagerGateway(org.apache.flink.runtime.resourcemanager.utils.TestingResourceManagerGateway) JobID(org.apache.flink.api.common.JobID) UnregisteredMetricGroups(org.apache.flink.runtime.metrics.groups.UnregisteredMetricGroups) Task(org.apache.flink.runtime.taskmanager.Task) Rule(org.junit.Rule) PartitionTestUtils(org.apache.flink.runtime.io.network.partition.PartitionTestUtils) TestingHighAvailabilityServices(org.apache.flink.runtime.highavailability.TestingHighAvailabilityServices) Collections(java.util.Collections) TemporaryFolder(org.junit.rules.TemporaryFolder) Task(org.apache.flink.runtime.taskmanager.Task) ResultPartitionDeploymentDescriptor(org.apache.flink.runtime.deployment.ResultPartitionDeploymentDescriptor) Configuration(org.apache.flink.configuration.Configuration) InstanceID(org.apache.flink.runtime.instance.InstanceID) ExecutionConfig(org.apache.flink.api.common.ExecutionConfig) CompletableFuture(java.util.concurrent.CompletableFuture) TestingJobMasterGatewayBuilder(org.apache.flink.runtime.jobmaster.utils.TestingJobMasterGatewayBuilder) OneShotLatch(org.apache.flink.core.testutils.OneShotLatch) TaskDeploymentDescriptor(org.apache.flink.runtime.deployment.TaskDeploymentDescriptor) JMTMRegistrationSuccess(org.apache.flink.runtime.jobmaster.JMTMRegistrationSuccess) 
BlockerSync(org.apache.flink.core.testutils.BlockerSync) ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) TaskExecutorLocalStateStoresManager(org.apache.flink.runtime.state.TaskExecutorLocalStateStoresManager) ClusterInformation(org.apache.flink.runtime.entrypoint.ClusterInformation) IOException(java.io.IOException) ExecutionException(java.util.concurrent.ExecutionException) TestingJobMasterGateway(org.apache.flink.runtime.jobmaster.utils.TestingJobMasterGateway) TestingResourceManagerGateway(org.apache.flink.runtime.resourcemanager.utils.TestingResourceManagerGateway) File(java.io.File)

Example 4 with ClusterInformation

use of org.apache.flink.runtime.entrypoint.ClusterInformation in project flink by apache.

the class TaskExecutorExecutionDeploymentReconciliationTest method setupResourceManagerGateway.

/**
 * Creates a {@link TestingResourceManagerGateway} that forwards the slot report it receives to
 * {@code initialSlotReportFuture} and answers every task executor registration with a
 * {@code TaskExecutorRegistrationSuccess}.
 *
 * @param initialSlotReportFuture completed with the slot report passed to the gateway's
 *     send-slot-report function
 * @return the configured gateway
 */
private static TestingResourceManagerGateway setupResourceManagerGateway(CompletableFuture<SlotReport> initialSlotReportFuture) {
    final TestingResourceManagerGateway gateway = new TestingResourceManagerGateway();

    // hand the reported slots (third tuple field) to the caller and acknowledge
    gateway.setSendSlotReportFunction(
            slotReportTuple -> {
                initialSlotReportFuture.complete(slotReportTuple.f2);
                return CompletableFuture.completedFuture(Acknowledge.get());
            });

    // every registration attempt succeeds immediately
    gateway.setRegisterTaskExecutorFunction(
            ignoredRegistration -> {
                return CompletableFuture.completedFuture(
                        new TaskExecutorRegistrationSuccess(
                                new InstanceID(),
                                gateway.getOwnResourceId(),
                                new ClusterInformation("blobServerHost", 55555)));
            });

    return gateway;
}
Also used : InstanceID(org.apache.flink.runtime.instance.InstanceID) TestingResourceManagerGateway(org.apache.flink.runtime.resourcemanager.utils.TestingResourceManagerGateway) ClusterInformation(org.apache.flink.runtime.entrypoint.ClusterInformation)

Example 5 with ClusterInformation

use of org.apache.flink.runtime.entrypoint.ClusterInformation in project flink by apache.

the class TaskExecutorTest method testHeartbeatReporting.

/**
 * Verifies that the heartbeat response carries the expected slot report and cluster partition
 * report: the initial slot report is sent after registration, and the next one is delivered
 * with the heartbeat payload.
 */
@Test
public void testHeartbeatReporting() throws Exception {
    final String resourceManagerAddress = "rm";
    final UUID resourceManagerLeaderId = UUID.randomUUID();

    // register the mock resource manager gateway
    final TestingResourceManagerGateway resourceManagerGateway = new TestingResourceManagerGateway();
    final CompletableFuture<ResourceID> registeredTaskExecutorFuture = new CompletableFuture<>();
    final ResourceID resourceManagerResourceId = resourceManagerGateway.getOwnResourceId();
    final CompletableFuture<RegistrationResponse> registrationSuccess =
            CompletableFuture.completedFuture(
                    new TaskExecutorRegistrationSuccess(
                            new InstanceID(),
                            resourceManagerResourceId,
                            new ClusterInformation("localhost", 1234)));
    final CompletableFuture<SlotReport> firstSlotReportFuture = new CompletableFuture<>();
    final CompletableFuture<TaskExecutorHeartbeatPayload> heartbeatPayloadFuture = new CompletableFuture<>();

    // each gateway callback records what the task executor sent
    resourceManagerGateway.setRegisterTaskExecutorFunction(
            registration -> {
                registeredTaskExecutorFuture.complete(registration.getResourceId());
                return registrationSuccess;
            });
    resourceManagerGateway.setSendSlotReportFunction(
            slotReportTuple -> {
                firstSlotReportFuture.complete(slotReportTuple.f2);
                return CompletableFuture.completedFuture(Acknowledge.get());
            });
    resourceManagerGateway.setTaskExecutorHeartbeatFunction(
            (ignoredResourceId, payload) -> {
                heartbeatPayloadFuture.complete(payload);
                return FutureUtils.completedVoidFuture();
            });
    rpc.registerGateway(resourceManagerAddress, resourceManagerGateway);

    // two consecutive slot reports: a free slot first, then the same slot allocated to a job
    final SlotID slotId = buildSlotID(0);
    final ResourceProfile resourceProfile = ResourceProfile.fromResources(1.0, 1);
    final SlotReport freeSlotReport = new SlotReport(new SlotStatus(slotId, resourceProfile));
    final SlotReport allocatedSlotReport =
            new SlotReport(new SlotStatus(slotId, resourceProfile, new JobID(), new AllocationID()));
    final Queue<SlotReport> pendingReports =
            new ArrayDeque<>(Arrays.asList(freeSlotReport, allocatedSlotReport));

    final TaskSlotTable<Task> taskSlotTable =
            TestingTaskSlotTable.<Task>newBuilder()
                    .createSlotReportSupplier(() -> pendingReports.poll())
                    .closeAsyncReturns(CompletableFuture.completedFuture(null))
                    .build();
    final TaskExecutorLocalStateStoresManager localStateStoresManager = createTaskExecutorLocalStateStoresManager();
    final TaskManagerServices taskManagerServices =
            new TaskManagerServicesBuilder()
                    .setUnresolvedTaskManagerLocation(unresolvedTaskManagerLocation)
                    .setTaskSlotTable(taskSlotTable)
                    .setTaskStateManager(localStateStoresManager)
                    .build();
    final TaskExecutorPartitionTracker partitionTracker =
            createPartitionTrackerWithFixedPartitionReport(taskManagerServices.getShuffleEnvironment());
    final TaskExecutor taskManager = createTaskExecutor(taskManagerServices, HEARTBEAT_SERVICES, partitionTracker);

    try {
        taskManager.start();

        // define a leader and see that a registration happens
        resourceManagerLeaderRetriever.notifyListener(resourceManagerAddress, resourceManagerLeaderId);

        // register resource manager success will trigger monitoring heartbeat target between tm
        // and rm
        assertThat(registeredTaskExecutorFuture.get(), equalTo(unresolvedTaskManagerLocation.getResourceID()));
        assertThat(firstSlotReportFuture.get(), equalTo(freeSlotReport));

        final TaskExecutorGateway taskExecutorGateway = taskManager.getSelfGateway(TaskExecutorGateway.class);

        // trigger the heartbeat asynchronously
        taskExecutorGateway.heartbeatFromResourceManager(resourceManagerResourceId);

        // wait for heartbeat response
        final TaskExecutorHeartbeatPayload receivedPayload = heartbeatPayloadFuture.get();

        // the new slot report should be reported
        final SlotReport reportedSlots = receivedPayload.getSlotReport();
        assertEquals(allocatedSlotReport, reportedSlots);

        final ClusterPartitionReport reportedPartitions = receivedPayload.getClusterPartitionReport();
        assertEquals(partitionTracker.createClusterPartitionReport(), reportedPartitions);
    } finally {
        RpcUtils.terminateRpcEndpoint(taskManager, timeout);
    }
}
Also used : Task(org.apache.flink.runtime.taskmanager.Task) InstanceID(org.apache.flink.runtime.instance.InstanceID) Matchers.containsString(org.hamcrest.Matchers.containsString) CompletableFuture(java.util.concurrent.CompletableFuture) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) UUID(java.util.UUID) RegistrationResponse(org.apache.flink.runtime.registration.RegistrationResponse) ResourceProfile(org.apache.flink.runtime.clusterframework.types.ResourceProfile) TaskSlotUtils.createTotalResourceProfile(org.apache.flink.runtime.taskexecutor.slot.TaskSlotUtils.createTotalResourceProfile) TaskExecutorPartitionTracker(org.apache.flink.runtime.io.network.partition.TaskExecutorPartitionTracker) TestingTaskExecutorPartitionTracker(org.apache.flink.runtime.io.network.partition.TestingTaskExecutorPartitionTracker) AllocatedSlotReport(org.apache.flink.runtime.jobmaster.AllocatedSlotReport) AllocationID(org.apache.flink.runtime.clusterframework.types.AllocationID) TaskExecutorLocalStateStoresManager(org.apache.flink.runtime.state.TaskExecutorLocalStateStoresManager) ClusterInformation(org.apache.flink.runtime.entrypoint.ClusterInformation) ArrayDeque(java.util.ArrayDeque) SlotID(org.apache.flink.runtime.clusterframework.types.SlotID) ClusterPartitionReport(org.apache.flink.runtime.taskexecutor.partition.ClusterPartitionReport) TestingResourceManagerGateway(org.apache.flink.runtime.resourcemanager.utils.TestingResourceManagerGateway) JobID(org.apache.flink.api.common.JobID) Test(org.junit.Test)

Aggregations

ClusterInformation (org.apache.flink.runtime.entrypoint.ClusterInformation)12 TestingResourceManagerGateway (org.apache.flink.runtime.resourcemanager.utils.TestingResourceManagerGateway)10 InstanceID (org.apache.flink.runtime.instance.InstanceID)9 CompletableFuture (java.util.concurrent.CompletableFuture)7 ResourceID (org.apache.flink.runtime.clusterframework.types.ResourceID)7 Test (org.junit.Test)7 Task (org.apache.flink.runtime.taskmanager.Task)6 ResourceProfile (org.apache.flink.runtime.clusterframework.types.ResourceProfile)5 RegistrationResponse (org.apache.flink.runtime.registration.RegistrationResponse)5 FlinkException (org.apache.flink.util.FlinkException)5 ArrayDeque (java.util.ArrayDeque)4 Collection (java.util.Collection)4 UUID (java.util.UUID)4 ArrayBlockingQueue (java.util.concurrent.ArrayBlockingQueue)4 CountDownLatch (java.util.concurrent.CountDownLatch)4 JobID (org.apache.flink.api.common.JobID)4 File (java.io.File)3 IOException (java.io.IOException)3 InetAddress (java.net.InetAddress)3 Collections (java.util.Collections)3