Search in sources :

Example 1 with ExponentialBackoffRetryStrategy

Use of org.apache.flink.util.concurrent.ExponentialBackoffRetryStrategy in the Apache Flink project.

Class DefaultDispatcherResourceManagerComponentFactory, method create.

/**
 * Builds a {@link DispatcherResourceManagerComponent} from the given services.
 *
 * <p>The method obtains the dispatcher and resource manager leader retrieval services, creates
 * and starts the REST endpoint, creates the resource manager service and the dispatcher runner,
 * starts the resource manager service, and finally starts both leader retrieval services with
 * their gateway retrievers.
 *
 * <p>If any step throws an {@link Exception}, every component that was already obtained is
 * stopped or closed, failures during that cleanup are merged into the original exception via
 * {@code ExceptionUtils.firstOrSuppressed}, and the result is rethrown wrapped in a
 * {@link FlinkException}. If the calling thread is interrupted while waiting for the asynchronous
 * shutdown, its interrupt status is restored before the exception is thrown.
 *
 * @param configuration configuration read for REST, metric-fetcher and operation-cache options
 * @param resourceId resource id handed to the resource manager service
 * @param ioExecutor executor handed to the resource manager, archivist and dispatcher services
 * @param rpcService RPC service used by the gateway retrievers and the created components
 * @param highAvailabilityServices source of leader retrieval and leader election services
 * @param blobServer blob server handed to the REST endpoint and dispatcher services
 * @param heartbeatServices heartbeat services handed to the created components
 * @param metricRegistry metric registry handed to the created components
 * @param executionGraphInfoStore store handed to the dispatcher services
 * @param metricQueryServiceRetriever retriever handed to the metric fetcher
 * @param fatalErrorHandler handler handed to the created components
 * @return the assembled component
 * @throws Exception a {@link FlinkException} wrapping the cause if any step fails
 */
@Override
public DispatcherResourceManagerComponent create(Configuration configuration, ResourceID resourceId, Executor ioExecutor, RpcService rpcService, HighAvailabilityServices highAvailabilityServices, BlobServer blobServer, HeartbeatServices heartbeatServices, MetricRegistry metricRegistry, ExecutionGraphInfoStore executionGraphInfoStore, MetricQueryServiceRetriever metricQueryServiceRetriever, FatalErrorHandler fatalErrorHandler) throws Exception {
    // Declared outside the try block so the catch block can clean up whatever was created.
    LeaderRetrievalService dispatcherLeaderRetrievalService = null;
    LeaderRetrievalService resourceManagerRetrievalService = null;
    WebMonitorEndpoint<?> webMonitorEndpoint = null;
    ResourceManagerService resourceManagerService = null;
    DispatcherRunner dispatcherRunner = null;
    try {
        dispatcherLeaderRetrievalService = highAvailabilityServices.getDispatcherLeaderRetriever();
        resourceManagerRetrievalService = highAvailabilityServices.getResourceManagerLeaderRetriever();
        // NOTE(review): the retry-strategy arguments are presumably (attempts, initial delay,
        // max delay) - confirm against ExponentialBackoffRetryStrategy.
        final LeaderGatewayRetriever<DispatcherGateway> dispatcherGatewayRetriever = new RpcGatewayRetriever<>(rpcService, DispatcherGateway.class, DispatcherId::fromUuid, new ExponentialBackoffRetryStrategy(12, Duration.ofMillis(10), Duration.ofMillis(50)));
        final LeaderGatewayRetriever<ResourceManagerGateway> resourceManagerGatewayRetriever = new RpcGatewayRetriever<>(rpcService, ResourceManagerGateway.class, ResourceManagerId::fromUuid, new ExponentialBackoffRetryStrategy(12, Duration.ofMillis(10), Duration.ofMillis(50)));
        // The same executor is handed to both the metric fetcher and the REST endpoint.
        final ScheduledExecutorService executor = WebMonitorEndpoint.createExecutorService(configuration.getInteger(RestOptions.SERVER_NUM_THREADS), configuration.getInteger(RestOptions.SERVER_THREAD_PRIORITY), "DispatcherRestEndpoint");
        final long updateInterval = configuration.getLong(MetricOptions.METRIC_FETCHER_UPDATE_INTERVAL);
        // An update interval of 0 selects the void metric fetcher instead of a real one.
        final MetricFetcher metricFetcher = updateInterval == 0 ? VoidMetricFetcher.INSTANCE : MetricFetcherImpl.fromConfiguration(configuration, metricQueryServiceRetriever, dispatcherGatewayRetriever, executor);
        webMonitorEndpoint = restEndpointFactory.createRestEndpoint(configuration, dispatcherGatewayRetriever, resourceManagerGatewayRetriever, blobServer, executor, metricFetcher, highAvailabilityServices.getClusterRestEndpointLeaderElectionService(), fatalErrorHandler);
        log.debug("Starting Dispatcher REST endpoint.");
        webMonitorEndpoint.start();
        final String hostname = RpcUtils.getHostname(rpcService);
        // The resource manager service is given the REST base URL, so the endpoint is
        // created and started first.
        resourceManagerService = ResourceManagerServiceImpl.create(resourceManagerFactory, configuration, resourceId, rpcService, highAvailabilityServices, heartbeatServices, fatalErrorHandler, new ClusterInformation(hostname, blobServer.getPort()), webMonitorEndpoint.getRestBaseUrl(), metricRegistry, hostname, ioExecutor);
        final HistoryServerArchivist historyServerArchivist = HistoryServerArchivist.createHistoryServerArchivist(configuration, webMonitorEndpoint, ioExecutor);
        final DispatcherOperationCaches dispatcherOperationCaches = new DispatcherOperationCaches(configuration.get(RestOptions.ASYNC_OPERATION_STORE_DURATION));
        final PartialDispatcherServices partialDispatcherServices = new PartialDispatcherServices(configuration, highAvailabilityServices, resourceManagerGatewayRetriever, blobServer, heartbeatServices, () -> JobManagerMetricGroup.createJobManagerMetricGroup(metricRegistry, hostname), executionGraphInfoStore, fatalErrorHandler, historyServerArchivist, metricRegistry.getMetricQueryServiceGatewayRpcAddress(), ioExecutor, dispatcherOperationCaches);
        log.debug("Starting Dispatcher.");
        dispatcherRunner = dispatcherRunnerFactory.createDispatcherRunner(highAvailabilityServices.getDispatcherLeaderElectionService(), fatalErrorHandler, new HaServicesJobPersistenceComponentFactory(highAvailabilityServices), ioExecutor, rpcService, partialDispatcherServices);
        log.debug("Starting ResourceManagerService.");
        resourceManagerService.start();
        // Leader retrieval is started last, once all components exist.
        resourceManagerRetrievalService.start(resourceManagerGatewayRetriever);
        dispatcherLeaderRetrievalService.start(dispatcherGatewayRetriever);
        return new DispatcherResourceManagerComponent(dispatcherRunner, resourceManagerService, dispatcherLeaderRetrievalService, resourceManagerRetrievalService, webMonitorEndpoint, fatalErrorHandler, dispatcherOperationCaches);
    } catch (Exception exception) {
        // clean up all started components
        if (dispatcherLeaderRetrievalService != null) {
            try {
                dispatcherLeaderRetrievalService.stop();
            } catch (Exception e) {
                exception = ExceptionUtils.firstOrSuppressed(e, exception);
            }
        }
        if (resourceManagerRetrievalService != null) {
            try {
                resourceManagerRetrievalService.stop();
            } catch (Exception e) {
                exception = ExceptionUtils.firstOrSuppressed(e, exception);
            }
        }
        // Components with an asynchronous close are shut down together and awaited once.
        final Collection<CompletableFuture<Void>> terminationFutures = new ArrayList<>(3);
        if (webMonitorEndpoint != null) {
            terminationFutures.add(webMonitorEndpoint.closeAsync());
        }
        if (resourceManagerService != null) {
            terminationFutures.add(resourceManagerService.closeAsync());
        }
        if (dispatcherRunner != null) {
            terminationFutures.add(dispatcherRunner.closeAsync());
        }
        final FutureUtils.ConjunctFuture<Void> terminationFuture = FutureUtils.completeAll(terminationFutures);
        try {
            terminationFuture.get();
        } catch (InterruptedException e) {
            // Restore the interrupt status so callers further up the stack can still observe
            // that this thread was interrupted while waiting for the shutdown to finish.
            Thread.currentThread().interrupt();
            exception = ExceptionUtils.firstOrSuppressed(e, exception);
        } catch (Exception e) {
            exception = ExceptionUtils.firstOrSuppressed(e, exception);
        }
        throw new FlinkException("Could not create the DispatcherResourceManagerComponent.", exception);
    }
}
Also used : ExponentialBackoffRetryStrategy(org.apache.flink.util.concurrent.ExponentialBackoffRetryStrategy) DispatcherGateway(org.apache.flink.runtime.dispatcher.DispatcherGateway) ResourceManagerGateway(org.apache.flink.runtime.resourcemanager.ResourceManagerGateway) RpcGatewayRetriever(org.apache.flink.runtime.webmonitor.retriever.impl.RpcGatewayRetriever) DispatcherRunner(org.apache.flink.runtime.dispatcher.runner.DispatcherRunner) DispatcherOperationCaches(org.apache.flink.runtime.dispatcher.DispatcherOperationCaches) ResourceManagerId(org.apache.flink.runtime.resourcemanager.ResourceManagerId) HistoryServerArchivist(org.apache.flink.runtime.dispatcher.HistoryServerArchivist) HaServicesJobPersistenceComponentFactory(org.apache.flink.runtime.jobmanager.HaServicesJobPersistenceComponentFactory) ScheduledExecutorService(java.util.concurrent.ScheduledExecutorService) PartialDispatcherServices(org.apache.flink.runtime.dispatcher.PartialDispatcherServices) ResourceManagerService(org.apache.flink.runtime.resourcemanager.ResourceManagerService) DispatcherId(org.apache.flink.runtime.dispatcher.DispatcherId) ClusterInformation(org.apache.flink.runtime.entrypoint.ClusterInformation) VoidMetricFetcher(org.apache.flink.runtime.rest.handler.legacy.metrics.VoidMetricFetcher) MetricFetcher(org.apache.flink.runtime.rest.handler.legacy.metrics.MetricFetcher) FlinkException(org.apache.flink.util.FlinkException) FlinkException(org.apache.flink.util.FlinkException) LeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService) Collection(java.util.Collection)

Example 2 with ExponentialBackoffRetryStrategy

Use of org.apache.flink.util.concurrent.ExponentialBackoffRetryStrategy in the Apache Flink project.

Class MiniCluster, method start.

/**
 * Starts the mini cluster, based on the configured properties.
 *
 * <p>The whole startup runs while holding {@code lock}. Components are brought up in this order:
 * working directory, RPC system, metric registry, RPC services, metric query service, process
 * metric group, I/O executor, high-availability services, blob server and blob cache, task
 * managers, dispatcher/resource-manager components, and finally the leader retrievers.
 *
 * <p>If any step throws an {@link Exception}, {@code close()} is invoked to clean up; an
 * exception thrown by {@code close()} is attached to the original one as suppressed, and the
 * original exception is rethrown. The cluster is only marked as running after the whole startup
 * sequence has completed.
 *
 * @throws Exception This method passes on any exception that occurs during the startup of the
 *     mini cluster.
 */
public void start() throws Exception {
    synchronized (lock) {
        // Guard against starting a cluster that is already flagged as running.
        checkState(!running, "MiniCluster is already running");
        LOG.info("Starting Flink Mini Cluster");
        LOG.debug("Using configuration {}", miniClusterConfiguration);
        final Configuration configuration = miniClusterConfiguration.getConfiguration();
        // True when the configured sharing mode is SHARED; selects the branch below.
        final boolean useSingleRpcService = miniClusterConfiguration.getRpcServiceSharing() == RpcServiceSharing.SHARED;
        try {
            // The working directory name gets a freshly generated ResourceID as suffix.
            workingDirectory = WorkingDirectory.create(ClusterEntrypointUtils.generateWorkingDirectoryFile(configuration, Optional.empty(), "minicluster_" + ResourceID.generate()));
            initializeIOFormatClasses(configuration);
            rpcSystem = rpcSystemSupplier.get();
            LOG.info("Starting Metrics Registry");
            metricRegistry = createMetricRegistry(configuration, rpcSystem.deref().getMaximumMessageSizeInBytes(configuration));
            // bring up all the RPC services
            LOG.info("Starting RPC Service(s)");
            final RpcServiceFactory dispatcherResourceManagerComponentRpcServiceFactory;
            final RpcService metricQueryServiceRpcService;
            if (useSingleRpcService) {
                // we always need the 'commonRpcService' for auxiliary calls
                commonRpcService = createLocalRpcService(configuration, rpcSystem.deref());
                // One factory instance, backed by the common RPC service, is used for both the
                // task managers and the dispatcher/resource-manager components.
                final CommonRpcServiceFactory commonRpcServiceFactory = new CommonRpcServiceFactory(commonRpcService);
                taskManagerRpcServiceFactory = commonRpcServiceFactory;
                dispatcherResourceManagerComponentRpcServiceFactory = commonRpcServiceFactory;
                metricQueryServiceRpcService = MetricUtils.startLocalMetricsRpcService(configuration, rpcSystem.deref());
            } else {
                // start a new service per component, possibly with custom bind addresses
                final String jobManagerExternalAddress = miniClusterConfiguration.getJobManagerExternalAddress();
                final String taskManagerExternalAddress = miniClusterConfiguration.getTaskManagerExternalAddress();
                final String jobManagerExternalPortRange = miniClusterConfiguration.getJobManagerExternalPortRange();
                final String taskManagerExternalPortRange = miniClusterConfiguration.getTaskManagerExternalPortRange();
                final String jobManagerBindAddress = miniClusterConfiguration.getJobManagerBindAddress();
                final String taskManagerBindAddress = miniClusterConfiguration.getTaskManagerBindAddress();
                dispatcherResourceManagerComponentRpcServiceFactory = new DedicatedRpcServiceFactory(configuration, jobManagerExternalAddress, jobManagerExternalPortRange, jobManagerBindAddress, rpcSystem.deref());
                taskManagerRpcServiceFactory = new DedicatedRpcServiceFactory(configuration, taskManagerExternalAddress, taskManagerExternalPortRange, taskManagerBindAddress, rpcSystem.deref());
                // we always need the 'commonRpcService' for auxiliary calls
                // bind to the JobManager address with port 0
                commonRpcService = createRemoteRpcService(configuration, jobManagerBindAddress, 0, rpcSystem.deref());
                // The remote metrics RPC service is started with the common RPC service's address.
                metricQueryServiceRpcService = MetricUtils.startRemoteMetricsRpcService(configuration, commonRpcService.getAddress(), null, rpcSystem.deref());
            }
            metricRegistry.startQueryService(metricQueryServiceRpcService, null);
            processMetricGroup = MetricUtils.instantiateProcessMetricGroup(metricRegistry, RpcUtils.getHostname(commonRpcService), ConfigurationUtils.getSystemResourceMetricsProbingInterval(configuration));
            ioExecutor = Executors.newFixedThreadPool(ClusterEntrypointUtils.getPoolSize(configuration), new ExecutorThreadFactory("mini-cluster-io"));
            haServices = createHighAvailabilityServices(configuration, ioExecutor);
            // Blob server and blob cache both use the working directory's blob storage directory.
            blobServer = BlobUtils.createBlobServer(configuration, Reference.borrowed(workingDirectory.getBlobStorageDirectory()), haServices.createBlobStore());
            blobServer.start();
            heartbeatServices = HeartbeatServices.fromConfiguration(configuration);
            // The blob cache is pointed at the local host and the port of the blob server started above.
            blobCacheService = BlobUtils.createBlobCacheService(configuration, Reference.borrowed(workingDirectory.getBlobStorageDirectory()), haServices.createBlobStore(), new InetSocketAddress(InetAddress.getLocalHost(), blobServer.getPort()));
            // Task managers are started before the dispatcher/resource-manager components are set up.
            startTaskManagers();
            MetricQueryServiceRetriever metricQueryServiceRetriever = new RpcMetricQueryServiceRetriever(metricRegistry.getMetricQueryServiceRpcService());
            setupDispatcherResourceManagerComponents(configuration, dispatcherResourceManagerComponentRpcServiceFactory, metricQueryServiceRetriever);
            resourceManagerLeaderRetriever = haServices.getResourceManagerLeaderRetriever();
            dispatcherLeaderRetriever = haServices.getDispatcherLeaderRetriever();
            clusterRestEndpointLeaderRetrievalService = haServices.getClusterRestEndpointLeaderRetriever();
            // NOTE(review): the retry-strategy arguments are presumably (attempts, initial delay,
            // max delay) - confirm against ExponentialBackoffRetryStrategy.
            dispatcherGatewayRetriever = new RpcGatewayRetriever<>(commonRpcService, DispatcherGateway.class, DispatcherId::fromUuid, new ExponentialBackoffRetryStrategy(21, Duration.ofMillis(5L), Duration.ofMillis(20L)));
            resourceManagerGatewayRetriever = new RpcGatewayRetriever<>(commonRpcService, ResourceManagerGateway.class, ResourceManagerId::fromUuid, new ExponentialBackoffRetryStrategy(21, Duration.ofMillis(5L), Duration.ofMillis(20L)));
            webMonitorLeaderRetriever = new LeaderRetriever();
            // Each leader retrieval service is started with its matching retriever.
            resourceManagerLeaderRetriever.start(resourceManagerGatewayRetriever);
            dispatcherLeaderRetriever.start(dispatcherGatewayRetriever);
            clusterRestEndpointLeaderRetrievalService.start(webMonitorLeaderRetriever);
        } catch (Exception e) {
            // cleanup everything
            try {
                close();
            } catch (Exception ee) {
                // Keep the startup failure as the primary exception; attach the cleanup failure.
                e.addSuppressed(ee);
            }
            throw e;
        }
        // create a new termination future
        terminationFuture = new CompletableFuture<>();
        // now officially mark this as running
        running = true;
        LOG.info("Flink Mini Cluster started successfully");
    }
}
Also used : RpcMetricQueryServiceRetriever(org.apache.flink.runtime.webmonitor.retriever.impl.RpcMetricQueryServiceRetriever) MetricQueryServiceRetriever(org.apache.flink.runtime.webmonitor.retriever.MetricQueryServiceRetriever) ExponentialBackoffRetryStrategy(org.apache.flink.util.concurrent.ExponentialBackoffRetryStrategy) MetricRegistryConfiguration(org.apache.flink.runtime.metrics.MetricRegistryConfiguration) Configuration(org.apache.flink.configuration.Configuration) InetSocketAddress(java.net.InetSocketAddress) RpcMetricQueryServiceRetriever(org.apache.flink.runtime.webmonitor.retriever.impl.RpcMetricQueryServiceRetriever) DispatcherGateway(org.apache.flink.runtime.dispatcher.DispatcherGateway) FlinkException(org.apache.flink.util.FlinkException) JobExecutionException(org.apache.flink.runtime.client.JobExecutionException) IOException(java.io.IOException) ExecutionException(java.util.concurrent.ExecutionException) CompletionException(java.util.concurrent.CompletionException) IllegalConfigurationException(org.apache.flink.configuration.IllegalConfigurationException) ResourceManagerGateway(org.apache.flink.runtime.resourcemanager.ResourceManagerGateway) ExecutorThreadFactory(org.apache.flink.util.concurrent.ExecutorThreadFactory) RpcService(org.apache.flink.runtime.rpc.RpcService) LeaderRetriever(org.apache.flink.runtime.webmonitor.retriever.LeaderRetriever)

Aggregations

DispatcherGateway (org.apache.flink.runtime.dispatcher.DispatcherGateway)2 ResourceManagerGateway (org.apache.flink.runtime.resourcemanager.ResourceManagerGateway)2 FlinkException (org.apache.flink.util.FlinkException)2 ExponentialBackoffRetryStrategy (org.apache.flink.util.concurrent.ExponentialBackoffRetryStrategy)2 IOException (java.io.IOException)1 InetSocketAddress (java.net.InetSocketAddress)1 Collection (java.util.Collection)1 CompletionException (java.util.concurrent.CompletionException)1 ExecutionException (java.util.concurrent.ExecutionException)1 ScheduledExecutorService (java.util.concurrent.ScheduledExecutorService)1 Configuration (org.apache.flink.configuration.Configuration)1 IllegalConfigurationException (org.apache.flink.configuration.IllegalConfigurationException)1 JobExecutionException (org.apache.flink.runtime.client.JobExecutionException)1 DispatcherId (org.apache.flink.runtime.dispatcher.DispatcherId)1 DispatcherOperationCaches (org.apache.flink.runtime.dispatcher.DispatcherOperationCaches)1 HistoryServerArchivist (org.apache.flink.runtime.dispatcher.HistoryServerArchivist)1 PartialDispatcherServices (org.apache.flink.runtime.dispatcher.PartialDispatcherServices)1 DispatcherRunner (org.apache.flink.runtime.dispatcher.runner.DispatcherRunner)1 ClusterInformation (org.apache.flink.runtime.entrypoint.ClusterInformation)1 HaServicesJobPersistenceComponentFactory (org.apache.flink.runtime.jobmanager.HaServicesJobPersistenceComponentFactory)1