Search in sources :

Example 16 with TestingLeaderElectionService

use of org.apache.flink.runtime.leaderelection.TestingLeaderElectionService in project flink by apache.

the class ResourceManagerHATest method testGrantAndRevokeLeadership.

/**
 * Walks the resource manager service through one grant/revoke leadership cycle.
 *
 * <p>After leadership is granted, the confirmed leader session id and the resource manager's
 * fencing token are both expected to equal the granted id. After leadership is revoked, the
 * test waits on the termination future that was obtained while the service was still leader
 * and finally rethrows any fatal error recorded by the service.
 */
@Test
public void testGrantAndRevokeLeadership() throws Exception {
    final TestingLeaderElectionService electionService = new TestingLeaderElectionService();
    final TestingResourceManagerService service =
            TestingResourceManagerService.newBuilder()
                    .setRmLeaderElectionService(electionService)
                    .build();
    try {
        service.start();

        // Grant leadership under a freshly generated session id.
        final UUID grantedSessionId = UUID.randomUUID();
        service.isLeader(grantedSessionId);

        // The confirmation and the fencing token must both carry the granted id.
        assertEquals(
                grantedSessionId,
                electionService.getConfirmationFuture().get().getLeaderSessionId());
        assertTrue(service.getResourceManagerFencingToken().isPresent());
        assertEquals(
                grantedSessionId,
                service.getResourceManagerFencingToken().get().toUUID());

        // Capture the termination future before revoking, then block until it is done.
        final Optional<CompletableFuture<Void>> terminationFuture =
                service.getResourceManagerTerminationFuture();
        assertTrue(terminationFuture.isPresent());
        service.notLeader();
        terminationFuture.get().get();

        service.rethrowFatalErrorIfAny();
    } finally {
        // Always release the service, even when an assertion above failed.
        service.cleanUp();
    }
}
Also used : TestingLeaderElectionService(org.apache.flink.runtime.leaderelection.TestingLeaderElectionService) CompletableFuture(java.util.concurrent.CompletableFuture) UUID(java.util.UUID) Test(org.junit.Test)

Example 17 with TestingLeaderElectionService

use of org.apache.flink.runtime.leaderelection.TestingLeaderElectionService in project flink by apache.

the class ResourceManagerTaskExecutorTest method createAndStartResourceManager.

/**
 * Builds the resource manager service on {@code rpcService} with a testing leader election
 * service, starts it and grants it leadership under a random session id.
 *
 * <p>Once the election service reports the leadership confirmation, the gateway exposed by
 * the service is stored in {@code rmGateway}. The method blocks for at most {@code TIMEOUT}
 * waiting for that to happen.
 *
 * @throws Exception if starting the service fails, or waiting for the confirmation fails or
 *     times out
 */
private void createAndStartResourceManager() throws Exception {
    final TestingLeaderElectionService electionService = new TestingLeaderElectionService();

    rmService =
            TestingResourceManagerService.newBuilder()
                    .setRpcService(rpcService)
                    .setRmLeaderElectionService(electionService)
                    .build();
    rmService.start();
    rmService.isLeader(UUID.randomUUID());

    // Resolve the gateway only after leadership has been confirmed.
    electionService
            .getConfirmationFuture()
            .thenRun(
                    () ->
                            rmGateway =
                                    rmService
                                            .getResourceManagerGateway()
                                            .orElseThrow(
                                                    () ->
                                                            new AssertionError(
                                                                    "RM not available after confirming leadership.")))
            .get(TIMEOUT.getSize(), TIMEOUT.getUnit());
}
Also used : TestingLeaderElectionService(org.apache.flink.runtime.leaderelection.TestingLeaderElectionService)

Example 18 with TestingLeaderElectionService

use of org.apache.flink.runtime.leaderelection.TestingLeaderElectionService in project flink by apache.

the class ResourceManagerTest method setup.

/**
 * Prepares per-test fixtures: testing HA services wired with a testing resource manager
 * leader election service, a testing fatal error handler, and a newly generated resource id.
 */
@Before
public void setup() throws Exception {
    highAvailabilityServices = new TestingHighAvailabilityServices();

    final TestingLeaderElectionService rmLeaderElectionService =
            new TestingLeaderElectionService();
    highAvailabilityServices.setResourceManagerLeaderElectionService(rmLeaderElectionService);

    testingFatalErrorHandler = new TestingFatalErrorHandler();
    resourceManagerResourceId = ResourceID.generate();
}
Also used : TestingFatalErrorHandler(org.apache.flink.runtime.util.TestingFatalErrorHandler) TestingHighAvailabilityServices(org.apache.flink.runtime.highavailability.TestingHighAvailabilityServices) TestingLeaderElectionService(org.apache.flink.runtime.leaderelection.TestingLeaderElectionService) Before(org.junit.Before)

Example 19 with TestingLeaderElectionService

use of org.apache.flink.runtime.leaderelection.TestingLeaderElectionService in project flink by apache.

the class DispatcherCleanupITCase method testCleanupThroughRetries.

/**
 * Runs a job through a dispatcher whose resource cleaner is configured with as many retries
 * as the job graph store is set up to produce cleanup failures.
 *
 * <p>The test expects the global cleanup to be called {@code numberOfErrors + 1} times in
 * total, the job graph store to end up empty, and the job result store to eventually contain
 * an entry for the job.
 */
@Test
public void testCleanupThroughRetries() throws Exception {
    final JobGraph jobGraph = createJobGraph();
    final JobID jobId = jobGraph.getJobID();

    // Job graph store configured with a number of cleanup failures.
    final AtomicInteger globalCleanupCalls = new AtomicInteger();
    final OneShotLatch cleanupSucceeded = new OneShotLatch();
    final int failuresBeforeSuccess = 5;
    final RuntimeException transientFailure =
            new RuntimeException("Expected RuntimeException: Unable to remove job graph.");
    final JobGraphStore failingJobGraphStore =
            createAndStartJobGraphStoreWithCleanupFailures(
                    failuresBeforeSuccess,
                    transientFailure,
                    globalCleanupCalls,
                    cleanupSucceeded);
    haServices.setJobGraphStore(failingJobGraphStore);

    // Leader election service for the job master of this job.
    final TestingLeaderElectionService jobMasterElection = new TestingLeaderElectionService();
    haServices.setJobMasterLeaderElectionService(jobId, jobMasterElection);

    // Dispatcher whose cleanup retries match the configured number of failures.
    final JobManagerRunnerRegistry runnerRegistry = new DefaultJobManagerRunnerRegistry(2);
    final Dispatcher dispatcher =
            createTestingDispatcherBuilder()
                    .setResourceCleanerFactory(
                            new DispatcherResourceCleanerFactory(
                                    ForkJoinPool.commonPool(),
                                    TestingRetryStrategies.createWithNumberOfRetries(
                                            failuresBeforeSuccess),
                                    runnerRegistry,
                                    haServices.getJobGraphStore(),
                                    blobServer,
                                    haServices,
                                    UnregisteredMetricGroups
                                            .createUnregisteredJobManagerMetricGroup()))
                    .build();
    dispatcher.start();
    toTerminate.add(dispatcher);

    jobMasterElection.isLeader(UUID.randomUUID());

    // Submit the job and wait until it has finished.
    final DispatcherGateway gateway = dispatcher.getSelfGateway(DispatcherGateway.class);
    gateway.submitJob(jobGraph, TIMEOUT).get();
    waitForJobToFinish(jobMasterElection, gateway, jobId);

    // Block until the cleanup eventually succeeds, then verify the end state.
    cleanupSucceeded.await();
    assertThat(globalCleanupCalls.get(), equalTo(failuresBeforeSuccess + 1));
    assertThat(
            "The JobGraph should be removed from JobGraphStore.",
            haServices.getJobGraphStore().getJobIds(),
            IsEmptyCollection.empty());
    CommonTestUtils.waitUntilCondition(
            () -> haServices.getJobResultStore().hasJobResultEntry(jobId),
            Deadline.fromNow(Duration.ofMinutes(5)),
            "The JobResultStore should have this job marked as clean.");
}
Also used : TestingLeaderElectionService(org.apache.flink.runtime.leaderelection.TestingLeaderElectionService) JobGraphStore(org.apache.flink.runtime.jobmanager.JobGraphStore) TestingJobGraphStore(org.apache.flink.runtime.testutils.TestingJobGraphStore) RpcEndpoint(org.apache.flink.runtime.rpc.RpcEndpoint) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) OneShotLatch(org.apache.flink.core.testutils.OneShotLatch) DispatcherResourceCleanerFactory(org.apache.flink.runtime.dispatcher.cleanup.DispatcherResourceCleanerFactory) JobID(org.apache.flink.api.common.JobID) Test(org.junit.Test)

Example 20 with TestingLeaderElectionService

use of org.apache.flink.runtime.leaderelection.TestingLeaderElectionService in project flink by apache.

the class DispatcherCleanupITCase method testCleanupAfterLeadershipChange.

/**
 * Checks that a job whose cleanup failed under a first dispatcher is cleaned up by a second
 * dispatcher that is started with the dirty job results after a leadership change.
 *
 * <p>Sequence exercised by the test:
 *
 * <ol>
 *   <li>A first dispatcher (no cleanup retries configured here) runs the job; the test waits
 *       until the fatal error handler has seen {@code temporaryError}.
 *   <li>Job master leadership is revoked and the election service is stopped. At this point
 *       the cleanup latch must not be triggered, the job graph must still be stored, and the
 *       job result store must list the job as dirty.
 *   <li>A second dispatcher is started with the dirty results and leadership is granted again.
 *       The test then waits for the dirty results to drain and asserts that the job graph is
 *       gone, the job has a result entry, and the global cleanup was called exactly twice.
 * </ol>
 */
@Test
public void testCleanupAfterLeadershipChange() throws Exception {
    final JobGraph jobGraph = createJobGraph();
    final JobID jobId = jobGraph.getJobID();
    // Construct job graph store.
    // NOTE(review): judging by the helper's name and the final call-count assertion, the store
    // presumably fails the first cleanup with temporaryError and then succeeds -- confirm
    // against createAndStartJobGraphStoreWithCleanupFailures.
    final AtomicInteger actualGlobalCleanupCallCount = new AtomicInteger();
    final OneShotLatch successfulCleanupLatch = new OneShotLatch();
    final RuntimeException temporaryError = new RuntimeException("Unable to remove job graph.");
    final JobGraphStore jobGraphStore = createAndStartJobGraphStoreWithCleanupFailures(1, temporaryError, actualGlobalCleanupCallCount, successfulCleanupLatch);
    haServices.setJobGraphStore(jobGraphStore);
    // Construct leader election service.
    final TestingLeaderElectionService leaderElectionService = new TestingLeaderElectionService();
    haServices.setJobMasterLeaderElectionService(jobId, leaderElectionService);
    // start the dispatcher with no retries on cleanup
    final CountDownLatch jobGraphRemovalErrorReceived = new CountDownLatch(1);
    // Custom fatal error handler: a reported throwable in which temporaryError is found only
    // releases the latch; anything else is forwarded to the shared testing fatal error handler.
    final Dispatcher dispatcher = createTestingDispatcherBuilder().setFatalErrorHandler(throwable -> {
        final Optional<Throwable> maybeError = ExceptionUtils.findThrowable(throwable, temporaryError::equals);
        if (maybeError.isPresent()) {
            jobGraphRemovalErrorReceived.countDown();
        } else {
            testingFatalErrorHandlerResource.getFatalErrorHandler().onFatalError(throwable);
        }
    }).build();
    dispatcher.start();
    // Registered so the dispatcher gets terminated by the test's teardown (presumably; the
    // consumer of toTerminate is outside this method).
    toTerminate.add(dispatcher);
    leaderElectionService.isLeader(UUID.randomUUID());
    final DispatcherGateway dispatcherGateway = dispatcher.getSelfGateway(DispatcherGateway.class);
    dispatcherGateway.submitJob(jobGraph, TIMEOUT).get();
    waitForJobToFinish(leaderElectionService, dispatcherGateway, jobId);
    // Block until the fatal error handler above has observed the expected cleanup failure.
    jobGraphRemovalErrorReceived.await();
    // Remove job master leadership.
    leaderElectionService.notLeader();
    // This will clear internal state of election service, so a new contender can register.
    leaderElectionService.stop();
    // Intermediate state after the failed cleanup: no successful cleanup yet, job graph still
    // stored, and the job result still marked dirty.
    assertThat(successfulCleanupLatch.isTriggered(), CoreMatchers.is(false));
    assertThat("The JobGraph is still stored in the JobGraphStore.", haServices.getJobGraphStore().getJobIds(), CoreMatchers.is(Collections.singleton(jobId)));
    assertThat("The JobResultStore has this job marked as dirty.", haServices.getJobResultStore().getDirtyResults().stream().map(JobResult::getJobId).collect(Collectors.toSet()), CoreMatchers.is(Collections.singleton(jobId)));
    // Run a second dispatcher, that restores our finished job.
    final Dispatcher secondDispatcher = createTestingDispatcherBuilder().setRecoveredDirtyJobs(haServices.getJobResultStore().getDirtyResults()).build();
    secondDispatcher.start();
    toTerminate.add(secondDispatcher);
    // Grant leadership again, with a new random session id, on the same election service.
    leaderElectionService.isLeader(UUID.randomUUID());
    // Wait (bounded by TIMEOUT) until no dirty job results remain.
    CommonTestUtils.waitUntilCondition(() -> haServices.getJobResultStore().getDirtyResults().isEmpty(), Deadline.fromNow(TimeUtils.toDuration(TIMEOUT)));
    assertThat("The JobGraph is not stored in the JobGraphStore.", haServices.getJobGraphStore().getJobIds(), IsEmptyCollection.empty());
    assertTrue("The JobResultStore has the job listed as clean.", haServices.getJobResultStore().hasJobResultEntry(jobId));
    // wait for the successful cleanup to be triggered
    successfulCleanupLatch.await();
    // One failed attempt under the first dispatcher plus one successful attempt under the second.
    assertThat(actualGlobalCleanupCallCount.get(), equalTo(2));
}
Also used : CoreMatchers(org.hamcrest.CoreMatchers) Deadline(org.apache.flink.api.common.time.Deadline) RpcEndpoint(org.apache.flink.runtime.rpc.RpcEndpoint) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) IsEqual.equalTo(org.hamcrest.core.IsEqual.equalTo) CheckpointCoordinatorConfiguration(org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration) ExceptionUtils(org.apache.flink.util.ExceptionUtils) IsEmptyCollection(org.hamcrest.collection.IsEmptyCollection) PerJobCheckpointRecoveryFactory(org.apache.flink.runtime.checkpoint.PerJobCheckpointRecoveryFactory) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) After(org.junit.After) Duration(java.time.Duration) JobCheckpointingSettings(org.apache.flink.runtime.jobgraph.tasks.JobCheckpointingSettings) DispatcherResourceCleanerFactory(org.apache.flink.runtime.dispatcher.cleanup.DispatcherResourceCleanerFactory) BlockingQueue(java.util.concurrent.BlockingQueue) UUID(java.util.UUID) LinkedBlockingQueue(java.util.concurrent.LinkedBlockingQueue) Collectors(java.util.stream.Collectors) CountDownLatch(java.util.concurrent.CountDownLatch) List(java.util.List) TimeUtils(org.apache.flink.util.TimeUtils) TestingJobResultStore(org.apache.flink.runtime.testutils.TestingJobResultStore) Optional(java.util.Optional) JobResultStore(org.apache.flink.runtime.highavailability.JobResultStore) JobGraphStore(org.apache.flink.runtime.jobmanager.JobGraphStore) OneShotLatch(org.apache.flink.core.testutils.OneShotLatch) JobVertex(org.apache.flink.runtime.jobgraph.JobVertex) FlinkMatchers(org.apache.flink.core.testutils.FlinkMatchers) TestingJobGraphStore(org.apache.flink.runtime.testutils.TestingJobGraphStore) EmbeddedJobResultStore(org.apache.flink.runtime.highavailability.nonha.embedded.EmbeddedJobResultStore) CompletableFuture(java.util.concurrent.CompletableFuture) JobStatus(org.apache.flink.api.common.JobStatus) AtomicReference(java.util.concurrent.atomic.AtomicReference) 
TaskDeploymentDescriptor(org.apache.flink.runtime.deployment.TaskDeploymentDescriptor) JobMasterGateway(org.apache.flink.runtime.jobmaster.JobMasterGateway) JobResult(org.apache.flink.runtime.jobmaster.JobResult) FutureUtils(org.apache.flink.util.concurrent.FutureUtils) MatcherAssert.assertThat(org.hamcrest.MatcherAssert.assertThat) Before(org.junit.Before) JobGraphBuilder(org.apache.flink.runtime.jobgraph.JobGraphBuilder) TestingLeaderElectionService(org.apache.flink.runtime.leaderelection.TestingLeaderElectionService) ExecutionState(org.apache.flink.runtime.execution.ExecutionState) JobMasterId(org.apache.flink.runtime.jobmaster.JobMasterId) Assert.assertTrue(org.junit.Assert.assertTrue) Test(org.junit.Test) RpcUtils(org.apache.flink.runtime.rpc.RpcUtils) ExecutionException(java.util.concurrent.ExecutionException) JobResultEntry(org.apache.flink.runtime.highavailability.JobResultEntry) JobID(org.apache.flink.api.common.JobID) UnregisteredMetricGroups(org.apache.flink.runtime.metrics.groups.UnregisteredMetricGroups) TestingRetryStrategies(org.apache.flink.runtime.dispatcher.cleanup.TestingRetryStrategies) ForkJoinPool(java.util.concurrent.ForkJoinPool) EmbeddedCompletedCheckpointStore(org.apache.flink.runtime.checkpoint.EmbeddedCompletedCheckpointStore) JobManagerRunner(org.apache.flink.runtime.jobmaster.JobManagerRunner) CommonTestUtils(org.apache.flink.runtime.testutils.CommonTestUtils) Assert(org.junit.Assert) Collections(java.util.Collections) NoOpInvokable(org.apache.flink.runtime.testtasks.NoOpInvokable) TestingLeaderElectionService(org.apache.flink.runtime.leaderelection.TestingLeaderElectionService) Optional(java.util.Optional) JobResult(org.apache.flink.runtime.jobmaster.JobResult) JobGraphStore(org.apache.flink.runtime.jobmanager.JobGraphStore) TestingJobGraphStore(org.apache.flink.runtime.testutils.TestingJobGraphStore) CountDownLatch(java.util.concurrent.CountDownLatch) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) 
AtomicInteger(java.util.concurrent.atomic.AtomicInteger) OneShotLatch(org.apache.flink.core.testutils.OneShotLatch) JobID(org.apache.flink.api.common.JobID) Test(org.junit.Test)

Aggregations

TestingLeaderElectionService (org.apache.flink.runtime.leaderelection.TestingLeaderElectionService)25 Test (org.junit.Test)13 UUID (java.util.UUID)10 JobID (org.apache.flink.api.common.JobID)9 TestingFatalErrorHandler (org.apache.flink.runtime.util.TestingFatalErrorHandler)8 Before (org.junit.Before)8 Configuration (org.apache.flink.configuration.Configuration)6 CompletableFuture (java.util.concurrent.CompletableFuture)5 Time (org.apache.flink.api.common.time.Time)5 TestingHighAvailabilityServices (org.apache.flink.runtime.highavailability.TestingHighAvailabilityServices)5 JobGraph (org.apache.flink.runtime.jobgraph.JobGraph)5 RegistrationResponse (org.apache.flink.runtime.registration.RegistrationResponse)5 ResourceID (org.apache.flink.runtime.clusterframework.types.ResourceID)4 EmbeddedJobResultStore (org.apache.flink.runtime.highavailability.nonha.embedded.EmbeddedJobResultStore)4 JobMasterGateway (org.apache.flink.runtime.jobmaster.JobMasterGateway)4 OneShotLatch (org.apache.flink.core.testutils.OneShotLatch)3 AllocationID (org.apache.flink.runtime.clusterframework.types.AllocationID)3 ResourceProfile (org.apache.flink.runtime.clusterframework.types.ResourceProfile)3 JobVertex (org.apache.flink.runtime.jobgraph.JobVertex)3 JobGraphStore (org.apache.flink.runtime.jobmanager.JobGraphStore)3