Search in sources:

Example 1 with DispatcherOperationCaches

Use of org.apache.flink.runtime.dispatcher.DispatcherOperationCaches in the Apache Flink project.

Class DefaultDispatcherResourceManagerComponentFactory, method create.

/**
 * Assembles a {@link DispatcherResourceManagerComponent} from the given services.
 *
 * <p>Inside the try block this method, in order: obtains the dispatcher and resource manager
 * leader retrieval services, creates and starts the REST endpoint, creates the resource manager
 * service, creates the dispatcher runner, starts the resource manager service, and finally
 * starts both leader retrieval services against their gateway retrievers.
 *
 * <p>If any of those steps throws an {@link Exception}, every component that was already
 * assigned is stopped or closed (waiting for the asynchronous closes to finish), failures
 * during that cleanup are merged into the original exception via
 * {@code ExceptionUtils.firstOrSuppressed}, and a {@link FlinkException} wrapping the result is
 * thrown.
 *
 * @return the component bundling the dispatcher runner, resource manager service, both leader
 *     retrieval services, the REST endpoint, the fatal error handler and the operation caches
 * @throws Exception a {@link FlinkException} if the component could not be created
 */
@Override
public DispatcherResourceManagerComponent create(
        Configuration configuration,
        ResourceID resourceId,
        Executor ioExecutor,
        RpcService rpcService,
        HighAvailabilityServices highAvailabilityServices,
        BlobServer blobServer,
        HeartbeatServices heartbeatServices,
        MetricRegistry metricRegistry,
        ExecutionGraphInfoStore executionGraphInfoStore,
        MetricQueryServiceRetriever metricQueryServiceRetriever,
        FatalErrorHandler fatalErrorHandler)
        throws Exception {
    // Declared outside the try block (and left null until created) so that the failure path
    // below can tell which components need to be torn down.
    LeaderRetrievalService dispatcherRetrieval = null;
    LeaderRetrievalService resourceManagerRetrieval = null;
    WebMonitorEndpoint<?> restEndpoint = null;
    ResourceManagerService rmService = null;
    DispatcherRunner runner = null;

    try {
        dispatcherRetrieval = highAvailabilityServices.getDispatcherLeaderRetriever();
        resourceManagerRetrieval = highAvailabilityServices.getResourceManagerLeaderRetriever();

        final LeaderGatewayRetriever<DispatcherGateway> dispatcherGatewayRetriever =
                new RpcGatewayRetriever<>(
                        rpcService,
                        DispatcherGateway.class,
                        DispatcherId::fromUuid,
                        new ExponentialBackoffRetryStrategy(
                                12, Duration.ofMillis(10), Duration.ofMillis(50)));
        final LeaderGatewayRetriever<ResourceManagerGateway> resourceManagerGatewayRetriever =
                new RpcGatewayRetriever<>(
                        rpcService,
                        ResourceManagerGateway.class,
                        ResourceManagerId::fromUuid,
                        new ExponentialBackoffRetryStrategy(
                                12, Duration.ofMillis(10), Duration.ofMillis(50)));

        final ScheduledExecutorService restExecutor =
                WebMonitorEndpoint.createExecutorService(
                        configuration.getInteger(RestOptions.SERVER_NUM_THREADS),
                        configuration.getInteger(RestOptions.SERVER_THREAD_PRIORITY),
                        "DispatcherRestEndpoint");

        // An update interval of exactly 0 selects the no-op fetcher instead of a real one.
        final long fetcherUpdateInterval =
                configuration.getLong(MetricOptions.METRIC_FETCHER_UPDATE_INTERVAL);
        final MetricFetcher metricFetcher;
        if (fetcherUpdateInterval == 0) {
            metricFetcher = VoidMetricFetcher.INSTANCE;
        } else {
            metricFetcher =
                    MetricFetcherImpl.fromConfiguration(
                            configuration,
                            metricQueryServiceRetriever,
                            dispatcherGatewayRetriever,
                            restExecutor);
        }

        restEndpoint =
                restEndpointFactory.createRestEndpoint(
                        configuration,
                        dispatcherGatewayRetriever,
                        resourceManagerGatewayRetriever,
                        blobServer,
                        restExecutor,
                        metricFetcher,
                        highAvailabilityServices.getClusterRestEndpointLeaderElectionService(),
                        fatalErrorHandler);

        log.debug("Starting Dispatcher REST endpoint.");
        restEndpoint.start();

        final String rpcHostname = RpcUtils.getHostname(rpcService);

        // The REST endpoint is started before this point; its base URL is handed to the
        // resource manager service here.
        rmService =
                ResourceManagerServiceImpl.create(
                        resourceManagerFactory,
                        configuration,
                        resourceId,
                        rpcService,
                        highAvailabilityServices,
                        heartbeatServices,
                        fatalErrorHandler,
                        new ClusterInformation(rpcHostname, blobServer.getPort()),
                        restEndpoint.getRestBaseUrl(),
                        metricRegistry,
                        rpcHostname,
                        ioExecutor);

        final HistoryServerArchivist archivist =
                HistoryServerArchivist.createHistoryServerArchivist(
                        configuration, restEndpoint, ioExecutor);

        final DispatcherOperationCaches operationCaches =
                new DispatcherOperationCaches(
                        configuration.get(RestOptions.ASYNC_OPERATION_STORE_DURATION));

        final PartialDispatcherServices dispatcherServices =
                new PartialDispatcherServices(
                        configuration,
                        highAvailabilityServices,
                        resourceManagerGatewayRetriever,
                        blobServer,
                        heartbeatServices,
                        () ->
                                JobManagerMetricGroup.createJobManagerMetricGroup(
                                        metricRegistry, rpcHostname),
                        executionGraphInfoStore,
                        fatalErrorHandler,
                        archivist,
                        metricRegistry.getMetricQueryServiceGatewayRpcAddress(),
                        ioExecutor,
                        operationCaches);

        log.debug("Starting Dispatcher.");
        runner =
                dispatcherRunnerFactory.createDispatcherRunner(
                        highAvailabilityServices.getDispatcherLeaderElectionService(),
                        fatalErrorHandler,
                        new HaServicesJobPersistenceComponentFactory(highAvailabilityServices),
                        ioExecutor,
                        rpcService,
                        dispatcherServices);

        log.debug("Starting ResourceManagerService.");
        rmService.start();

        // Leader retrieval is started last, once all components exist.
        resourceManagerRetrieval.start(resourceManagerGatewayRetriever);
        dispatcherLeaderRetrievalStart:
        dispatcherRetrieval.start(dispatcherGatewayRetriever);

        return new DispatcherResourceManagerComponent(
                runner,
                rmService,
                dispatcherRetrieval,
                resourceManagerRetrieval,
                restEndpoint,
                fatalErrorHandler,
                operationCaches);
    } catch (Exception creationFailure) {
        // Tear down whatever was created so far; cleanup failures are folded into the
        // original failure rather than replacing it.
        Exception failure = creationFailure;

        if (dispatcherRetrieval != null) {
            try {
                dispatcherRetrieval.stop();
            } catch (Exception stopFailure) {
                failure = ExceptionUtils.firstOrSuppressed(stopFailure, failure);
            }
        }

        if (resourceManagerRetrieval != null) {
            try {
                resourceManagerRetrieval.stop();
            } catch (Exception stopFailure) {
                failure = ExceptionUtils.firstOrSuppressed(stopFailure, failure);
            }
        }

        // The remaining components close asynchronously; collect their futures and wait for
        // all of them together.
        final Collection<CompletableFuture<Void>> pendingCloses = new ArrayList<>(3);
        if (restEndpoint != null) {
            pendingCloses.add(restEndpoint.closeAsync());
        }
        if (rmService != null) {
            pendingCloses.add(rmService.closeAsync());
        }
        if (runner != null) {
            pendingCloses.add(runner.closeAsync());
        }

        final FutureUtils.ConjunctFuture<Void> allClosed = FutureUtils.completeAll(pendingCloses);
        try {
            allClosed.get();
        } catch (Exception closeFailure) {
            failure = ExceptionUtils.firstOrSuppressed(closeFailure, failure);
        }

        throw new FlinkException(
                "Could not create the DispatcherResourceManagerComponent.", failure);
    }
}
Also used : ExponentialBackoffRetryStrategy(org.apache.flink.util.concurrent.ExponentialBackoffRetryStrategy) DispatcherGateway(org.apache.flink.runtime.dispatcher.DispatcherGateway) ResourceManagerGateway(org.apache.flink.runtime.resourcemanager.ResourceManagerGateway) RpcGatewayRetriever(org.apache.flink.runtime.webmonitor.retriever.impl.RpcGatewayRetriever) DispatcherRunner(org.apache.flink.runtime.dispatcher.runner.DispatcherRunner) DispatcherOperationCaches(org.apache.flink.runtime.dispatcher.DispatcherOperationCaches) ResourceManagerId(org.apache.flink.runtime.resourcemanager.ResourceManagerId) HistoryServerArchivist(org.apache.flink.runtime.dispatcher.HistoryServerArchivist) HaServicesJobPersistenceComponentFactory(org.apache.flink.runtime.jobmanager.HaServicesJobPersistenceComponentFactory) ScheduledExecutorService(java.util.concurrent.ScheduledExecutorService) PartialDispatcherServices(org.apache.flink.runtime.dispatcher.PartialDispatcherServices) ResourceManagerService(org.apache.flink.runtime.resourcemanager.ResourceManagerService) DispatcherId(org.apache.flink.runtime.dispatcher.DispatcherId) ClusterInformation(org.apache.flink.runtime.entrypoint.ClusterInformation) VoidMetricFetcher(org.apache.flink.runtime.rest.handler.legacy.metrics.VoidMetricFetcher) MetricFetcher(org.apache.flink.runtime.rest.handler.legacy.metrics.MetricFetcher) FlinkException(org.apache.flink.util.FlinkException) FlinkException(org.apache.flink.util.FlinkException) LeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService) Collection(java.util.Collection)

Example 2 with DispatcherOperationCaches

Use of org.apache.flink.runtime.dispatcher.DispatcherOperationCaches in the Apache Flink project.

Class ZooKeeperDefaultDispatcherRunnerTest, method testResourceCleanupUnderLeadershipChange.

/**
 * See FLINK-11665.
 *
 * <p>Scenario: grant leadership and submit a job, revoke leadership, grant it again, cancel the
 * job through the newly obtained gateway, and revoke leadership once more. The test then waits
 * until a job graph store created via {@code createZooKeeperJobGraphStore} reports no job IDs
 * and finally asserts that {@code clusterHaStorageDir} contains no files.
 */
@Test
public void testResourceCleanupUnderLeadershipChange() throws Exception {
    final TestingRpcService testingRpcService = testingRpcServiceResource.getTestingRpcService();
    final TestingLeaderElectionService leaderElection = new TestingLeaderElectionService();

    // NOTE(review): this client is not closed anywhere in this method; confirm it is released
    // elsewhere (e.g. in test teardown).
    final CuratorFramework curator =
            ZooKeeperUtils.startCuratorFramework(configuration, fatalErrorHandler)
                    .asCuratorFramework();

    try (final TestingHighAvailabilityServices haServices =
            new TestingHighAvailabilityServicesBuilder()
                    .setDispatcherLeaderElectionService(leaderElection)
                    .setJobMasterLeaderRetrieverFunction(
                            jobId -> ZooKeeperUtils.createLeaderRetrievalService(curator))
                    .build()) {

        final PartialDispatcherServices dispatcherServices =
                new PartialDispatcherServices(
                        configuration,
                        haServices,
                        CompletableFuture::new,
                        blobServer,
                        new TestingHeartbeatServices(),
                        UnregisteredMetricGroups::createUnregisteredJobManagerMetricGroup,
                        new MemoryExecutionGraphInfoStore(),
                        fatalErrorHandler,
                        VoidHistoryServerArchivist.INSTANCE,
                        null,
                        ForkJoinPool.commonPool(),
                        new DispatcherOperationCaches());

        final DefaultDispatcherRunnerFactory runnerFactory =
                DefaultDispatcherRunnerFactory.createSessionRunner(
                        SessionDispatcherFactory.INSTANCE);

        // Job graphs go through the ZooKeeper-based store helper, job results stay embedded.
        final JobPersistenceComponentFactory persistenceFactory =
                new JobPersistenceComponentFactory() {

                    @Override
                    public JobGraphStore createJobGraphStore() {
                        return createZooKeeperJobGraphStore(curator);
                    }

                    @Override
                    public JobResultStore createJobResultStore() {
                        return new EmbeddedJobResultStore();
                    }
                };

        try (final DispatcherRunner dispatcherRunner =
                createDispatcherRunner(
                        testingRpcService,
                        leaderElection,
                        persistenceFactory,
                        dispatcherServices,
                        runnerFactory)) {

            // initial run: become leader and submit a job
            final DispatcherGateway initialGateway = grantLeadership(leaderElection);

            final JobGraph jobGraph = createJobGraphWithBlobs();
            LOG.info("Initial job submission {}.", jobGraph.getJobID());
            initialGateway.submitJob(jobGraph, TESTING_TIMEOUT).get();

            leaderElection.notLeader();

            // second leadership session: the previously submitted job is expected to be
            // recovered here
            LOG.info("Re-grant leadership first time.");
            final DispatcherGateway recoveredGateway = grantLeadership(leaderElection);

            LOG.info("Cancel recovered job {}.", jobGraph.getJobID());

            // Request the result before cancelling; cancellation of the job should remove
            // everything.
            final CompletableFuture<JobResult> pendingJobResult =
                    recoveredGateway.requestJobResult(jobGraph.getJobID(), TESTING_TIMEOUT);
            recoveredGateway.cancelJob(jobGraph.getJobID(), TESTING_TIMEOUT).get();

            // a successful cancellation should eventually remove all job information
            assertThat(
                    pendingJobResult.get().getApplicationStatus(),
                    is(ApplicationStatus.CANCELED));

            leaderElection.notLeader();

            // check that the job has been removed from ZooKeeper
            final JobGraphStore verificationStore = createZooKeeperJobGraphStore(curator);
            CommonTestUtils.waitUntilCondition(
                    () -> verificationStore.getJobIds().isEmpty(),
                    Deadline.fromNow(VERIFICATION_TIMEOUT),
                    20L);
        }
    }

    // check resource clean up: nothing may be left in the HA storage directory
    assertThat(clusterHaStorageDir.listFiles(), is(emptyArray()));
}
Also used : ZooKeeperUtils(org.apache.flink.runtime.util.ZooKeeperUtils) Deadline(org.apache.flink.api.common.time.Deadline) PartialDispatcherServices(org.apache.flink.runtime.dispatcher.PartialDispatcherServices) Matchers.emptyArray(org.hamcrest.Matchers.emptyArray) JobPersistenceComponentFactory(org.apache.flink.runtime.jobmanager.JobPersistenceComponentFactory) TestingRpcService(org.apache.flink.runtime.rpc.TestingRpcService) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) LoggerFactory(org.slf4j.LoggerFactory) ExceptionUtils(org.apache.flink.util.ExceptionUtils) Assert.assertThat(org.junit.Assert.assertThat) DispatcherOperationCaches(org.apache.flink.runtime.dispatcher.DispatcherOperationCaches) TestingFatalErrorHandler(org.apache.flink.runtime.util.TestingFatalErrorHandler) After(org.junit.After) Duration(java.time.Duration) TestLogger(org.apache.flink.util.TestLogger) ClassRule(org.junit.ClassRule) HighAvailabilityServicesUtils(org.apache.flink.runtime.highavailability.HighAvailabilityServicesUtils) UUID(java.util.UUID) TestingUtils(org.apache.flink.testutils.TestingUtils) Matchers.is(org.hamcrest.Matchers.is) JobResultStore(org.apache.flink.runtime.highavailability.JobResultStore) JobGraphStore(org.apache.flink.runtime.jobmanager.JobGraphStore) Time(org.apache.flink.api.common.time.Time) DispatcherId(org.apache.flink.runtime.dispatcher.DispatcherId) BlobServer(org.apache.flink.runtime.blob.BlobServer) JobVertex(org.apache.flink.runtime.jobgraph.JobVertex) LeaderConnectionInfo(org.apache.flink.runtime.util.LeaderConnectionInfo) CuratorFramework(org.apache.flink.shaded.curator5.org.apache.curator.framework.CuratorFramework) VoidHistoryServerArchivist(org.apache.flink.runtime.dispatcher.VoidHistoryServerArchivist) EmbeddedJobResultStore(org.apache.flink.runtime.highavailability.nonha.embedded.EmbeddedJobResultStore) CompletableFuture(java.util.concurrent.CompletableFuture) DispatcherGateway(org.apache.flink.runtime.dispatcher.DispatcherGateway) 
JobResult(org.apache.flink.runtime.jobmaster.JobResult) PermanentBlobKey(org.apache.flink.runtime.blob.PermanentBlobKey) JobGraphTestUtils(org.apache.flink.runtime.jobgraph.JobGraphTestUtils) TestingHighAvailabilityServicesBuilder(org.apache.flink.runtime.highavailability.TestingHighAvailabilityServicesBuilder) Before(org.junit.Before) TestingLeaderElectionService(org.apache.flink.runtime.leaderelection.TestingLeaderElectionService) Logger(org.slf4j.Logger) ApplicationStatus(org.apache.flink.runtime.clusterframework.ApplicationStatus) Configuration(org.apache.flink.configuration.Configuration) TestingHeartbeatServices(org.apache.flink.runtime.heartbeat.TestingHeartbeatServices) Test(org.junit.Test) IOException(java.io.IOException) File(java.io.File) UnregisteredMetricGroups(org.apache.flink.runtime.metrics.groups.UnregisteredMetricGroups) ForkJoinPool(java.util.concurrent.ForkJoinPool) MemoryExecutionGraphInfoStore(org.apache.flink.runtime.dispatcher.MemoryExecutionGraphInfoStore) TestingRpcServiceResource(org.apache.flink.runtime.rpc.TestingRpcServiceResource) BlobUtils(org.apache.flink.runtime.blob.BlobUtils) SessionDispatcherFactory(org.apache.flink.runtime.dispatcher.SessionDispatcherFactory) CommonTestUtils(org.apache.flink.runtime.testutils.CommonTestUtils) TestingHighAvailabilityServices(org.apache.flink.runtime.highavailability.TestingHighAvailabilityServices) ZooKeeperResource(org.apache.flink.runtime.zookeeper.ZooKeeperResource) HighAvailabilityOptions(org.apache.flink.configuration.HighAvailabilityOptions) TemporaryFolder(org.junit.rules.TemporaryFolder) NoOpInvokable(org.apache.flink.runtime.testtasks.NoOpInvokable) TestingLeaderElectionService(org.apache.flink.runtime.leaderelection.TestingLeaderElectionService) PartialDispatcherServices(org.apache.flink.runtime.dispatcher.PartialDispatcherServices) UnregisteredMetricGroups(org.apache.flink.runtime.metrics.groups.UnregisteredMetricGroups) JobResult(org.apache.flink.runtime.jobmaster.JobResult) 
JobGraphStore(org.apache.flink.runtime.jobmanager.JobGraphStore) JobResultStore(org.apache.flink.runtime.highavailability.JobResultStore) EmbeddedJobResultStore(org.apache.flink.runtime.highavailability.nonha.embedded.EmbeddedJobResultStore) EmbeddedJobResultStore(org.apache.flink.runtime.highavailability.nonha.embedded.EmbeddedJobResultStore) DispatcherGateway(org.apache.flink.runtime.dispatcher.DispatcherGateway) TestingHighAvailabilityServicesBuilder(org.apache.flink.runtime.highavailability.TestingHighAvailabilityServicesBuilder) TestingHighAvailabilityServices(org.apache.flink.runtime.highavailability.TestingHighAvailabilityServices) JobPersistenceComponentFactory(org.apache.flink.runtime.jobmanager.JobPersistenceComponentFactory) CuratorFramework(org.apache.flink.shaded.curator5.org.apache.curator.framework.CuratorFramework) CompletableFuture(java.util.concurrent.CompletableFuture) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) TestingHeartbeatServices(org.apache.flink.runtime.heartbeat.TestingHeartbeatServices) DispatcherOperationCaches(org.apache.flink.runtime.dispatcher.DispatcherOperationCaches) MemoryExecutionGraphInfoStore(org.apache.flink.runtime.dispatcher.MemoryExecutionGraphInfoStore) TestingRpcService(org.apache.flink.runtime.rpc.TestingRpcService) Test(org.junit.Test)

Aggregations

DispatcherGateway (org.apache.flink.runtime.dispatcher.DispatcherGateway)2 DispatcherId (org.apache.flink.runtime.dispatcher.DispatcherId)2 DispatcherOperationCaches (org.apache.flink.runtime.dispatcher.DispatcherOperationCaches)2 PartialDispatcherServices (org.apache.flink.runtime.dispatcher.PartialDispatcherServices)2 File (java.io.File)1 IOException (java.io.IOException)1 Duration (java.time.Duration)1 Collection (java.util.Collection)1 UUID (java.util.UUID)1 CompletableFuture (java.util.concurrent.CompletableFuture)1 ForkJoinPool (java.util.concurrent.ForkJoinPool)1 ScheduledExecutorService (java.util.concurrent.ScheduledExecutorService)1 Deadline (org.apache.flink.api.common.time.Deadline)1 Time (org.apache.flink.api.common.time.Time)1 Configuration (org.apache.flink.configuration.Configuration)1 HighAvailabilityOptions (org.apache.flink.configuration.HighAvailabilityOptions)1 BlobServer (org.apache.flink.runtime.blob.BlobServer)1 BlobUtils (org.apache.flink.runtime.blob.BlobUtils)1 PermanentBlobKey (org.apache.flink.runtime.blob.PermanentBlobKey)1 ApplicationStatus (org.apache.flink.runtime.clusterframework.ApplicationStatus)1