use of org.apache.flink.runtime.highavailability.TestingHighAvailabilityServicesBuilder in project flink by apache.
the class ZooKeeperDefaultDispatcherRunnerTest method testResourceCleanupUnderLeadershipChange.
/**
 * Submits a job with blobs, revokes and re-grants the dispatcher leadership, cancels the
 * recovered job and finally checks that no job ids remain in the ZooKeeper job graph store and
 * that the cluster HA storage directory is empty.
 *
 * <p>See FLINK-11665.
 */
@Test
public void testResourceCleanupUnderLeadershipChange() throws Exception {
    final TestingRpcService rpcService = testingRpcServiceResource.getTestingRpcService();
    final TestingLeaderElectionService dispatcherLeaderElectionService =
            new TestingLeaderElectionService();

    // The Curator client is declared as the first resource so that it is closed last, i.e.
    // only after the dispatcher runner and the HA services using it have been shut down.
    // Previously the client was never closed and leaked past the end of the test.
    try (final CuratorFramework client =
                    ZooKeeperUtils.startCuratorFramework(configuration, fatalErrorHandler)
                            .asCuratorFramework();
            final TestingHighAvailabilityServices highAvailabilityServices =
                    new TestingHighAvailabilityServicesBuilder()
                            .setDispatcherLeaderElectionService(dispatcherLeaderElectionService)
                            .setJobMasterLeaderRetrieverFunction(
                                    jobId -> ZooKeeperUtils.createLeaderRetrievalService(client))
                            .build()) {
        final PartialDispatcherServices partialDispatcherServices =
                new PartialDispatcherServices(
                        configuration,
                        highAvailabilityServices,
                        CompletableFuture::new,
                        blobServer,
                        new TestingHeartbeatServices(),
                        UnregisteredMetricGroups::createUnregisteredJobManagerMetricGroup,
                        new MemoryExecutionGraphInfoStore(),
                        fatalErrorHandler,
                        VoidHistoryServerArchivist.INSTANCE,
                        null,
                        ForkJoinPool.commonPool(),
                        new DispatcherOperationCaches());
        final DefaultDispatcherRunnerFactory defaultDispatcherRunnerFactory =
                DefaultDispatcherRunnerFactory.createSessionRunner(
                        SessionDispatcherFactory.INSTANCE);

        try (final DispatcherRunner dispatcherRunner =
                createDispatcherRunner(
                        rpcService,
                        dispatcherLeaderElectionService,
                        new JobPersistenceComponentFactory() {
                            @Override
                            public JobGraphStore createJobGraphStore() {
                                return createZooKeeperJobGraphStore(client);
                            }

                            @Override
                            public JobResultStore createJobResultStore() {
                                return new EmbeddedJobResultStore();
                            }
                        },
                        partialDispatcherServices,
                        defaultDispatcherRunnerFactory)) {
            // initial run
            DispatcherGateway dispatcherGateway = grantLeadership(dispatcherLeaderElectionService);

            final JobGraph jobGraph = createJobGraphWithBlobs();
            LOG.info("Initial job submission {}.", jobGraph.getJobID());
            dispatcherGateway.submitJob(jobGraph, TESTING_TIMEOUT).get();

            dispatcherLeaderElectionService.notLeader();

            // recovering submitted jobs
            LOG.info("Re-grant leadership first time.");
            dispatcherGateway = grantLeadership(dispatcherLeaderElectionService);

            LOG.info("Cancel recovered job {}.", jobGraph.getJobID());
            // cancellation of the job should remove everything
            final CompletableFuture<JobResult> jobResultFuture =
                    dispatcherGateway.requestJobResult(jobGraph.getJobID(), TESTING_TIMEOUT);
            dispatcherGateway.cancelJob(jobGraph.getJobID(), TESTING_TIMEOUT).get();

            // a successful cancellation should eventually remove all job information
            final JobResult jobResult = jobResultFuture.get();
            assertThat(jobResult.getApplicationStatus(), is(ApplicationStatus.CANCELED));

            dispatcherLeaderElectionService.notLeader();

            // check that the job has been removed from ZooKeeper
            final JobGraphStore submittedJobGraphStore = createZooKeeperJobGraphStore(client);
            CommonTestUtils.waitUntilCondition(
                    () -> submittedJobGraphStore.getJobIds().isEmpty(),
                    Deadline.fromNow(VERIFICATION_TIMEOUT),
                    20L);
        }
    }

    // check resource clean up
    assertThat(clusterHaStorageDir.listFiles(), is(emptyArray()));
}
use of org.apache.flink.runtime.highavailability.TestingHighAvailabilityServicesBuilder in project flink by apache.
the class ClusterEntrypointTest method testClusterFinishedNormallyShouldDeregisterAppAndCleanupHAData.
/**
 * Starts a testing cluster entrypoint whose dispatcher shut-down future is completed with
 * {@code SUCCEEDED} by the resource manager's initialize callback, and checks that the
 * entrypoint reports {@code SUCCEEDED} and that both the deregistration future and the
 * close-and-cleanup-all-data future are done by then.
 */
@Test
public void testClusterFinishedNormallyShouldDeregisterAppAndCleanupHAData() throws Exception {
    final CompletableFuture<Void> deregistered = new CompletableFuture<>();
    final CompletableFuture<Void> haDataCleanedUp = new CompletableFuture<>();
    final CompletableFuture<ApplicationStatus> dispatcherShutDown = new CompletableFuture<>();

    final HighAvailabilityServices haServices =
            new TestingHighAvailabilityServicesBuilder()
                    .setCloseAndCleanupAllDataFuture(haDataCleanedUp)
                    .build();

    // Initializing the resource manager completes the dispatcher's shut-down future, which
    // drives the cluster towards termination.
    final TestingResourceManagerFactory resourceManagerFactory =
            new TestingResourceManagerFactory.Builder()
                    .setInternalDeregisterApplicationConsumer(
                            (unusedStatus, unusedDiagnostics, unusedResourceManager) ->
                                    deregistered.complete(null))
                    .setInitializeConsumer(
                            (unused) -> dispatcherShutDown.complete(ApplicationStatus.SUCCEEDED))
                    .build();

    final TestingDispatcherRunnerFactory dispatcherRunnerFactory =
            new TestingDispatcherRunnerFactory.Builder()
                    .setShutDownFuture(dispatcherShutDown)
                    .build();

    final TestingEntryPoint entryPoint =
            new TestingEntryPoint.Builder()
                    .setConfiguration(flinkConfig)
                    .setResourceManagerFactory(resourceManagerFactory)
                    .setDispatcherRunnerFactory(dispatcherRunnerFactory)
                    .setHighAvailabilityServices(haServices)
                    .build();

    final CompletableFuture<ApplicationStatus> terminationStatus =
            startClusterEntrypoint(entryPoint);
    final ApplicationStatus finalStatus =
            terminationStatus.get(TIMEOUT_MS, TimeUnit.MILLISECONDS);

    assertThat(finalStatus, is(ApplicationStatus.SUCCEEDED));
    assertThat(deregistered.isDone(), is(true));
    assertThat(haDataCleanedUp.isDone(), is(true));
}
use of org.apache.flink.runtime.highavailability.TestingHighAvailabilityServicesBuilder in project flink by apache.
the class DefaultJobLeaderServiceTest method handlesConcurrentJobAdditionsAndLeaderChanges.
/**
 * Tests that we can concurrently modify the JobLeaderService and complete the leader retrieval
 * operation. See FLINK-16373.
 *
 * <p>A background thread repeatedly adds and removes jobs while the test thread takes every
 * leader retrieval service that gets instantiated and notifies its listener about a new leader.
 */
@Test
public void handlesConcurrentJobAdditionsAndLeaderChanges() throws Exception {
    final JobLeaderService jobLeaderService =
            new DefaultJobLeaderService(
                    new LocalUnresolvedTaskManagerLocation(),
                    RetryingRegistrationConfiguration.defaultConfiguration());

    final TestingJobLeaderListener jobLeaderListener = new TestingJobLeaderListener();
    final int numberOperations = 20;
    // Sized to hold one retrieval service per add operation.
    final BlockingQueue<SettableLeaderRetrievalService> instantiatedLeaderRetrievalServices =
            new ArrayBlockingQueue<>(numberOperations);

    final HighAvailabilityServices haServices =
            new TestingHighAvailabilityServicesBuilder()
                    .setJobMasterLeaderRetrieverFunction(
                            leaderForJobId -> {
                                final SettableLeaderRetrievalService leaderRetrievalService =
                                        new SettableLeaderRetrievalService();
                                instantiatedLeaderRetrievalServices.offer(leaderRetrievalService);
                                return leaderRetrievalService;
                            })
                    .build();

    jobLeaderService.start(
            "foobar", rpcServiceResource.getTestingRpcService(), haServices, jobLeaderListener);

    // Stop the service even if the test fails, matching the other tests in this class.
    try {
        final CheckedThread addJobAction =
                new CheckedThread() {
                    @Override
                    public void go() throws Exception {
                        for (int i = 0; i < numberOperations; i++) {
                            final JobID jobId = JobID.generate();
                            jobLeaderService.addJob(jobId, "foobar");
                            // give the notifying thread a chance to interleave
                            Thread.yield();
                            jobLeaderService.removeJob(jobId);
                        }
                    }
                };
        addJobAction.start();

        for (int i = 0; i < numberOperations; i++) {
            final SettableLeaderRetrievalService leaderRetrievalService =
                    instantiatedLeaderRetrievalServices.take();
            leaderRetrievalService.notifyListener("foobar", UUID.randomUUID());
        }

        // surfaces any exception thrown by the background thread
        addJobAction.sync();
    } finally {
        jobLeaderService.stop();
    }
}
use of org.apache.flink.runtime.highavailability.TestingHighAvailabilityServicesBuilder in project flink by apache.
the class DefaultJobLeaderServiceTest method doesNotReconnectAfterTargetLostLeadership.
/**
 * Tests that the JobLeaderService won't try to reconnect to JobMaster after it has lost the
 * leadership. See FLINK-16836.
 *
 * <p>The test announces a leader, waits for the listener to observe the gained leadership,
 * revokes the leadership again and only then calls {@code reconnect} for the job.
 */
@Test
public void doesNotReconnectAfterTargetLostLeadership() throws Exception {
    final JobID jid = new JobID();

    final SettableLeaderRetrievalService retrievalService = new SettableLeaderRetrievalService();
    final TestingHighAvailabilityServices highAvailabilityServices =
            new TestingHighAvailabilityServicesBuilder()
                    .setJobMasterLeaderRetrieverFunction(unused -> retrievalService)
                    .build();

    final TestingJobMasterGateway jobMaster = registerJobMaster();

    final OneShotLatch leadershipGained = new OneShotLatch();
    final TestingJobLeaderListener listener =
            new TestingJobLeaderListener(unused -> leadershipGained.trigger());

    final JobLeaderService service =
            createAndStartJobLeaderService(highAvailabilityServices, listener);

    try {
        service.addJob(jid, jobMaster.getAddress());

        // announce the job master as leader and wait until the listener has seen it
        retrievalService.notifyListener(jobMaster.getAddress(), UUID.randomUUID());
        leadershipGained.await();

        // revoke the leadership
        retrievalService.notifyListener(null, null);
        listener.waitUntilJobManagerLostLeadership();

        service.reconnect(jid);
    } finally {
        service.stop();
    }
}
use of org.apache.flink.runtime.highavailability.TestingHighAvailabilityServicesBuilder in project flink by apache.
the class DefaultJobLeaderServiceTest method canReconnectToOldLeaderWithSameLeaderAddress.
/**
 * Tests that the JobLeaderService can reconnect to an old leader which seemed to have lost the
 * leadership in between. See FLINK-14316.
 *
 * <p>The same address and leader session id are announced twice with a revocation in between;
 * the listener is expected to report the job id after each announcement.
 */
@Test
public void canReconnectToOldLeaderWithSameLeaderAddress() throws Exception {
    final JobID jid = new JobID();

    final SettableLeaderRetrievalService retrievalService = new SettableLeaderRetrievalService();
    final TestingHighAvailabilityServices highAvailabilityServices =
            new TestingHighAvailabilityServicesBuilder()
                    .setJobMasterLeaderRetrieverFunction(unused -> retrievalService)
                    .build();

    final TestingJobMasterGateway jobMaster = registerJobMaster();

    // receives the job id every time the listener is told that a job manager gained leadership
    final BlockingQueue<JobID> gainedLeaderships = new ArrayBlockingQueue<>(1);
    final TestingJobLeaderListener listener =
            new TestingJobLeaderListener(gainedJobId -> gainedLeaderships.offer(gainedJobId));

    final JobLeaderService service =
            createAndStartJobLeaderService(highAvailabilityServices, listener);

    try {
        service.addJob(jid, jobMaster.getAddress());

        final UUID sessionId = UUID.randomUUID();
        retrievalService.notifyListener(jobMaster.getAddress(), sessionId);

        // wait for the first leadership
        assertThat(gainedLeaderships.take(), is(jid));

        // revoke the leadership
        retrievalService.notifyListener(null, null);
        listener.waitUntilJobManagerLostLeadership();

        // announce the very same leader (address and session id) again
        retrievalService.notifyListener(jobMaster.getAddress(), sessionId);

        // check that we obtain the leadership a second time
        assertThat(gainedLeaderships.take(), is(jid));
    } finally {
        service.stop();
    }
}
Aggregations