Search in sources :

Example 21 with DispatcherGateway

use of org.apache.flink.runtime.dispatcher.DispatcherGateway in project flink by apache.

the class ZooKeeperDefaultDispatcherRunnerTest method testResourceCleanupUnderLeadershipChange.

/**
 * Checks that a job's persisted data is cleaned up when the dispatcher loses and regains
 * leadership between job submission and job cancellation. See FLINK-11665.
 *
 * <p>Scenario: leadership is granted and a job is submitted; leadership is revoked and granted
 * again; the job is cancelled through the newly obtained gateway and its result is expected to
 * be {@code CANCELED}; leadership is revoked once more. The test then waits until a
 * ZooKeeper-backed job graph store reports no job ids and finally asserts that the cluster HA
 * storage directory contains no files.
 *
 * @throws Exception if any step of the scenario fails
 */
@Test
public void testResourceCleanupUnderLeadershipChange() throws Exception {
    final TestingRpcService rpcService = testingRpcServiceResource.getTestingRpcService();
    final TestingLeaderElectionService dispatcherLeaderElectionService = new TestingLeaderElectionService();
    // The Curator client is managed by try-with-resources so that it is closed even when a
    // step below throws or an assertion fails; previously it was never closed.
    try (final CuratorFramework client = ZooKeeperUtils.startCuratorFramework(configuration, fatalErrorHandler).asCuratorFramework()) {
        try (final TestingHighAvailabilityServices highAvailabilityServices = new TestingHighAvailabilityServicesBuilder().setDispatcherLeaderElectionService(dispatcherLeaderElectionService).setJobMasterLeaderRetrieverFunction(jobId -> ZooKeeperUtils.createLeaderRetrievalService(client)).build()) {
            final PartialDispatcherServices partialDispatcherServices = new PartialDispatcherServices(configuration, highAvailabilityServices, CompletableFuture::new, blobServer, new TestingHeartbeatServices(), UnregisteredMetricGroups::createUnregisteredJobManagerMetricGroup, new MemoryExecutionGraphInfoStore(), fatalErrorHandler, VoidHistoryServerArchivist.INSTANCE, null, ForkJoinPool.commonPool(), new DispatcherOperationCaches());
            final DefaultDispatcherRunnerFactory defaultDispatcherRunnerFactory = DefaultDispatcherRunnerFactory.createSessionRunner(SessionDispatcherFactory.INSTANCE);
            try (final DispatcherRunner dispatcherRunner = createDispatcherRunner(rpcService, dispatcherLeaderElectionService, new JobPersistenceComponentFactory() {

                @Override
                public JobGraphStore createJobGraphStore() {
                    return createZooKeeperJobGraphStore(client);
                }

                @Override
                public JobResultStore createJobResultStore() {
                    return new EmbeddedJobResultStore();
                }
            }, partialDispatcherServices, defaultDispatcherRunnerFactory)) {
                // initial run
                DispatcherGateway dispatcherGateway = grantLeadership(dispatcherLeaderElectionService);
                final JobGraph jobGraph = createJobGraphWithBlobs();
                LOG.info("Initial job submission {}.", jobGraph.getJobID());
                dispatcherGateway.submitJob(jobGraph, TESTING_TIMEOUT).get();
                dispatcherLeaderElectionService.notLeader();
                // recovering submitted jobs
                LOG.info("Re-grant leadership first time.");
                dispatcherGateway = grantLeadership(dispatcherLeaderElectionService);
                LOG.info("Cancel recovered job {}.", jobGraph.getJobID());
                // cancellation of the job should remove everything; the result future is
                // requested before cancelling so that the terminal result is observed
                final CompletableFuture<JobResult> jobResultFuture = dispatcherGateway.requestJobResult(jobGraph.getJobID(), TESTING_TIMEOUT);
                dispatcherGateway.cancelJob(jobGraph.getJobID(), TESTING_TIMEOUT).get();
                // a successful cancellation should eventually remove all job information
                final JobResult jobResult = jobResultFuture.get();
                assertThat(jobResult.getApplicationStatus(), is(ApplicationStatus.CANCELED));
                dispatcherLeaderElectionService.notLeader();
                // check that the job has been removed from ZooKeeper
                final JobGraphStore submittedJobGraphStore = createZooKeeperJobGraphStore(client);
                CommonTestUtils.waitUntilCondition(() -> submittedJobGraphStore.getJobIds().isEmpty(), Deadline.fromNow(VERIFICATION_TIMEOUT), 20L);
            }
        }
        // check resource clean up (done before the Curator client itself is closed, as before)
        assertThat(clusterHaStorageDir.listFiles(), is(emptyArray()));
    }
}
Also used : ZooKeeperUtils(org.apache.flink.runtime.util.ZooKeeperUtils) Deadline(org.apache.flink.api.common.time.Deadline) PartialDispatcherServices(org.apache.flink.runtime.dispatcher.PartialDispatcherServices) Matchers.emptyArray(org.hamcrest.Matchers.emptyArray) JobPersistenceComponentFactory(org.apache.flink.runtime.jobmanager.JobPersistenceComponentFactory) TestingRpcService(org.apache.flink.runtime.rpc.TestingRpcService) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) LoggerFactory(org.slf4j.LoggerFactory) ExceptionUtils(org.apache.flink.util.ExceptionUtils) Assert.assertThat(org.junit.Assert.assertThat) DispatcherOperationCaches(org.apache.flink.runtime.dispatcher.DispatcherOperationCaches) TestingFatalErrorHandler(org.apache.flink.runtime.util.TestingFatalErrorHandler) After(org.junit.After) Duration(java.time.Duration) TestLogger(org.apache.flink.util.TestLogger) ClassRule(org.junit.ClassRule) HighAvailabilityServicesUtils(org.apache.flink.runtime.highavailability.HighAvailabilityServicesUtils) UUID(java.util.UUID) TestingUtils(org.apache.flink.testutils.TestingUtils) Matchers.is(org.hamcrest.Matchers.is) JobResultStore(org.apache.flink.runtime.highavailability.JobResultStore) JobGraphStore(org.apache.flink.runtime.jobmanager.JobGraphStore) Time(org.apache.flink.api.common.time.Time) DispatcherId(org.apache.flink.runtime.dispatcher.DispatcherId) BlobServer(org.apache.flink.runtime.blob.BlobServer) JobVertex(org.apache.flink.runtime.jobgraph.JobVertex) LeaderConnectionInfo(org.apache.flink.runtime.util.LeaderConnectionInfo) CuratorFramework(org.apache.flink.shaded.curator5.org.apache.curator.framework.CuratorFramework) VoidHistoryServerArchivist(org.apache.flink.runtime.dispatcher.VoidHistoryServerArchivist) EmbeddedJobResultStore(org.apache.flink.runtime.highavailability.nonha.embedded.EmbeddedJobResultStore) CompletableFuture(java.util.concurrent.CompletableFuture) DispatcherGateway(org.apache.flink.runtime.dispatcher.DispatcherGateway) 
JobResult(org.apache.flink.runtime.jobmaster.JobResult) PermanentBlobKey(org.apache.flink.runtime.blob.PermanentBlobKey) JobGraphTestUtils(org.apache.flink.runtime.jobgraph.JobGraphTestUtils) TestingHighAvailabilityServicesBuilder(org.apache.flink.runtime.highavailability.TestingHighAvailabilityServicesBuilder) Before(org.junit.Before) TestingLeaderElectionService(org.apache.flink.runtime.leaderelection.TestingLeaderElectionService) Logger(org.slf4j.Logger) ApplicationStatus(org.apache.flink.runtime.clusterframework.ApplicationStatus) Configuration(org.apache.flink.configuration.Configuration) TestingHeartbeatServices(org.apache.flink.runtime.heartbeat.TestingHeartbeatServices) Test(org.junit.Test) IOException(java.io.IOException) File(java.io.File) UnregisteredMetricGroups(org.apache.flink.runtime.metrics.groups.UnregisteredMetricGroups) ForkJoinPool(java.util.concurrent.ForkJoinPool) MemoryExecutionGraphInfoStore(org.apache.flink.runtime.dispatcher.MemoryExecutionGraphInfoStore) TestingRpcServiceResource(org.apache.flink.runtime.rpc.TestingRpcServiceResource) BlobUtils(org.apache.flink.runtime.blob.BlobUtils) SessionDispatcherFactory(org.apache.flink.runtime.dispatcher.SessionDispatcherFactory) CommonTestUtils(org.apache.flink.runtime.testutils.CommonTestUtils) TestingHighAvailabilityServices(org.apache.flink.runtime.highavailability.TestingHighAvailabilityServices) ZooKeeperResource(org.apache.flink.runtime.zookeeper.ZooKeeperResource) HighAvailabilityOptions(org.apache.flink.configuration.HighAvailabilityOptions) TemporaryFolder(org.junit.rules.TemporaryFolder) NoOpInvokable(org.apache.flink.runtime.testtasks.NoOpInvokable) TestingLeaderElectionService(org.apache.flink.runtime.leaderelection.TestingLeaderElectionService) PartialDispatcherServices(org.apache.flink.runtime.dispatcher.PartialDispatcherServices) UnregisteredMetricGroups(org.apache.flink.runtime.metrics.groups.UnregisteredMetricGroups) JobResult(org.apache.flink.runtime.jobmaster.JobResult) 
JobGraphStore(org.apache.flink.runtime.jobmanager.JobGraphStore) JobResultStore(org.apache.flink.runtime.highavailability.JobResultStore) EmbeddedJobResultStore(org.apache.flink.runtime.highavailability.nonha.embedded.EmbeddedJobResultStore) EmbeddedJobResultStore(org.apache.flink.runtime.highavailability.nonha.embedded.EmbeddedJobResultStore) DispatcherGateway(org.apache.flink.runtime.dispatcher.DispatcherGateway) TestingHighAvailabilityServicesBuilder(org.apache.flink.runtime.highavailability.TestingHighAvailabilityServicesBuilder) TestingHighAvailabilityServices(org.apache.flink.runtime.highavailability.TestingHighAvailabilityServices) JobPersistenceComponentFactory(org.apache.flink.runtime.jobmanager.JobPersistenceComponentFactory) CuratorFramework(org.apache.flink.shaded.curator5.org.apache.curator.framework.CuratorFramework) CompletableFuture(java.util.concurrent.CompletableFuture) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) TestingHeartbeatServices(org.apache.flink.runtime.heartbeat.TestingHeartbeatServices) DispatcherOperationCaches(org.apache.flink.runtime.dispatcher.DispatcherOperationCaches) MemoryExecutionGraphInfoStore(org.apache.flink.runtime.dispatcher.MemoryExecutionGraphInfoStore) TestingRpcService(org.apache.flink.runtime.rpc.TestingRpcService) Test(org.junit.Test)

Example 22 with DispatcherGateway

use of org.apache.flink.runtime.dispatcher.DispatcherGateway in project flink by apache.

the class ZooKeeperDefaultDispatcherRunnerTest method grantLeadership.

/**
 * Grants leadership to the dispatcher under a freshly generated session id and returns a
 * gateway connected to the address that the leader confirmed.
 *
 * @param dispatcherLeaderElectionService testing election service used to grant leadership
 * @return gateway obtained by connecting to the confirmed leader address, fenced with a
 *     {@link DispatcherId} derived from the new session id
 * @throws InterruptedException if waiting on the confirmation or connection future is interrupted
 * @throws java.util.concurrent.ExecutionException if either future completes exceptionally
 */
private DispatcherGateway grantLeadership(TestingLeaderElectionService dispatcherLeaderElectionService) throws InterruptedException, java.util.concurrent.ExecutionException {
    final UUID sessionId = UUID.randomUUID();
    dispatcherLeaderElectionService.isLeader(sessionId);

    // Block until the newly elected leader has confirmed its leadership.
    final LeaderConnectionInfo confirmedLeader = dispatcherLeaderElectionService.getConfirmationFuture().get();

    final TestingRpcService rpcService = testingRpcServiceResource.getTestingRpcService();
    final CompletableFuture<DispatcherGateway> gatewayFuture = rpcService.connect(
            confirmedLeader.getAddress(),
            DispatcherId.fromUuid(sessionId),
            DispatcherGateway.class);
    return gatewayFuture.get();
}
Also used : LeaderConnectionInfo(org.apache.flink.runtime.util.LeaderConnectionInfo) UUID(java.util.UUID) DispatcherGateway(org.apache.flink.runtime.dispatcher.DispatcherGateway)

Example 23 with DispatcherGateway

use of org.apache.flink.runtime.dispatcher.DispatcherGateway in project flink by apache.

the class ZooKeeperLeaderElectionITCase method testJobExecutionOnClusterWithLeaderChange.

/**
 * Tests that a job can be executed after a new leader has been elected. For all except for the
 * last leader, the job is blocking. The JobManager will be terminated while executing the
 * blocking job. Once only one JobManager is left, it is checked that a non-blocking job can be
 * successfully executed.
 *
 * <p>The test starts a mini cluster with several dispatcher/resource-manager components,
 * submits one job, and then repeatedly waits for the next leading dispatcher, waits for the
 * job to be running and shuts that leader down. On the last remaining leader the blocking
 * operator is unblocked and the job result is expected to be successful.
 *
 * @throws Exception if any step of the scenario fails
 */
@Test
@Ignore("FLINK-25235")
public void testJobExecutionOnClusterWithLeaderChange() throws Exception {
    final int numDispatchers = 3;
    final int numTMs = 2;
    final int numSlotsPerTM = 2;
    final Configuration configuration = ZooKeeperTestUtils.createZooKeeperHAConfig(zkServer.getConnectString(), tempFolder.newFolder().getAbsolutePath());
    // speed up refused registration retries
    configuration.setLong(ClusterOptions.REFUSED_REGISTRATION_DELAY, 50L);
    final TestingMiniClusterConfiguration miniClusterConfiguration = TestingMiniClusterConfiguration.newBuilder().setConfiguration(configuration).setNumberDispatcherResourceManagerComponents(numDispatchers).setNumTaskManagers(numTMs).setNumSlotsPerTaskManager(numSlotsPerTM).build();
    final Deadline timeout = Deadline.fromNow(TEST_TIMEOUT);
    try (TestingMiniCluster miniCluster = TestingMiniCluster.newBuilder(miniClusterConfiguration).build();
        final CuratorFrameworkWithUnhandledErrorListener curatorFramework = ZooKeeperUtils.startCuratorFramework(configuration, exception -> fail("Fatal error in curator framework."))) {
        // We need to watch for resource manager leader changes to avoid race conditions.
        final DefaultLeaderRetrievalService resourceManagerLeaderRetrieval = ZooKeeperUtils.createLeaderRetrievalService(curatorFramework.asCuratorFramework(), ZooKeeperUtils.getLeaderPathForResourceManager(), configuration);
        // One future per expected resource manager leader; they are handed to the listener below.
        @SuppressWarnings("unchecked") final CompletableFuture<String>[] resourceManagerLeaderFutures = (CompletableFuture<String>[]) new CompletableFuture[numDispatchers];
        for (int i = 0; i < numDispatchers; i++) {
            resourceManagerLeaderFutures[i] = new CompletableFuture<>();
        }
        resourceManagerLeaderRetrieval.start(new TestLeaderRetrievalListener(resourceManagerLeaderFutures));
        try {
            miniCluster.start();
            final int parallelism = numTMs * numSlotsPerTM;
            JobGraph jobGraph = createJobGraph(parallelism);
            miniCluster.submitJob(jobGraph).get();
            String previousLeaderAddress = null;
            // Cycle through all leaders but the last one, shutting each down while the job runs.
            for (int i = 0; i < numDispatchers - 1; i++) {
                final DispatcherGateway leaderDispatcherGateway = getNextLeadingDispatcherGateway(miniCluster, previousLeaderAddress, timeout);
                // Make sure resource manager has also changed leadership.
                resourceManagerLeaderFutures[i].get();
                previousLeaderAddress = leaderDispatcherGateway.getAddress();
                awaitRunningStatus(leaderDispatcherGateway, jobGraph, timeout);
                leaderDispatcherGateway.shutDownCluster();
            }
            final DispatcherGateway leaderDispatcherGateway = getNextLeadingDispatcherGateway(miniCluster, previousLeaderAddress, timeout);
            // Make sure resource manager has also changed leadership.
            resourceManagerLeaderFutures[numDispatchers - 1].get();
            awaitRunningStatus(leaderDispatcherGateway, jobGraph, timeout);
            CompletableFuture<JobResult> jobResultFuture = leaderDispatcherGateway.requestJobResult(jobGraph.getJobID(), RPC_TIMEOUT);
            BlockingOperator.unblock();
            assertThat(jobResultFuture.get().isSuccess(), is(true));
        } finally {
            // Stop the retrieval service on every exit path (previously only on success), and
            // still before the mini cluster and the Curator framework are closed.
            resourceManagerLeaderRetrieval.stop();
        }
    }
}
Also used : TestingMiniCluster(org.apache.flink.runtime.minicluster.TestingMiniCluster) TestingMiniClusterConfiguration(org.apache.flink.runtime.minicluster.TestingMiniClusterConfiguration) Configuration(org.apache.flink.configuration.Configuration) TestingMiniClusterConfiguration(org.apache.flink.runtime.minicluster.TestingMiniClusterConfiguration) JobResult(org.apache.flink.runtime.jobmaster.JobResult) Deadline(org.apache.flink.api.common.time.Deadline) CuratorFrameworkWithUnhandledErrorListener(org.apache.flink.runtime.highavailability.zookeeper.CuratorFrameworkWithUnhandledErrorListener) DefaultLeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.DefaultLeaderRetrievalService) DispatcherGateway(org.apache.flink.runtime.dispatcher.DispatcherGateway) CompletableFuture(java.util.concurrent.CompletableFuture) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) Ignore(org.junit.Ignore) Test(org.junit.Test)

Aggregations

DispatcherGateway (org.apache.flink.runtime.dispatcher.DispatcherGateway)23 Configuration (org.apache.flink.configuration.Configuration)15 CompletableFuture (java.util.concurrent.CompletableFuture)14 Acknowledge (org.apache.flink.runtime.messages.Acknowledge)11 ExceptionUtils (org.apache.flink.util.ExceptionUtils)11 Collections (java.util.Collections)10 TestingDispatcherGateway (org.apache.flink.runtime.webmonitor.TestingDispatcherGateway)10 FutureUtils (org.apache.flink.util.concurrent.FutureUtils)10 Test (org.junit.Test)10 JobGraph (org.apache.flink.runtime.jobgraph.JobGraph)9 JobID (org.apache.flink.api.common.JobID)7 IOException (java.io.IOException)6 Path (java.nio.file.Path)6 Duration (java.time.Duration)6 Optional (java.util.Optional)6 ScheduledExecutorService (java.util.concurrent.ScheduledExecutorService)6 Before (org.junit.Before)6 ObjectOutputStream (java.io.ObjectOutputStream)5 Files (java.nio.file.Files)5 ArrayList (java.util.ArrayList)5