use of org.apache.flink.runtime.dispatcher.DispatcherGateway in project flink by apache.
the class ZooKeeperDefaultDispatcherRunnerTest method testResourceCleanupUnderLeadershipChange.
/**
 * See FLINK-11665.
 *
 * <p>Submits a job, revokes and re-grants dispatcher leadership, cancels the recovered job and
 * then verifies that the ZooKeeper job graph store reports no job ids and that the cluster HA
 * storage directory contains no files.
 */
@Test
public void testResourceCleanupUnderLeadershipChange() throws Exception {
    final TestingRpcService rpcService = testingRpcServiceResource.getTestingRpcService();
    final TestingLeaderElectionService dispatcherLeaderElectionService = new TestingLeaderElectionService();
    // The Curator client is declared as the first resource so that it is closed last, i.e. only
    // after the HA services and the dispatcher runner that use it have been shut down.
    // Previously the client was a plain local and was never closed.
    try (final CuratorFramework client = ZooKeeperUtils.startCuratorFramework(configuration, fatalErrorHandler).asCuratorFramework();
            final TestingHighAvailabilityServices highAvailabilityServices = new TestingHighAvailabilityServicesBuilder().setDispatcherLeaderElectionService(dispatcherLeaderElectionService).setJobMasterLeaderRetrieverFunction(jobId -> ZooKeeperUtils.createLeaderRetrievalService(client)).build()) {
        final PartialDispatcherServices partialDispatcherServices = new PartialDispatcherServices(configuration, highAvailabilityServices, CompletableFuture::new, blobServer, new TestingHeartbeatServices(), UnregisteredMetricGroups::createUnregisteredJobManagerMetricGroup, new MemoryExecutionGraphInfoStore(), fatalErrorHandler, VoidHistoryServerArchivist.INSTANCE, null, ForkJoinPool.commonPool(), new DispatcherOperationCaches());
        final DefaultDispatcherRunnerFactory defaultDispatcherRunnerFactory = DefaultDispatcherRunnerFactory.createSessionRunner(SessionDispatcherFactory.INSTANCE);
        try (final DispatcherRunner dispatcherRunner = createDispatcherRunner(rpcService, dispatcherLeaderElectionService, new JobPersistenceComponentFactory() {
            @Override
            public JobGraphStore createJobGraphStore() {
                return createZooKeeperJobGraphStore(client);
            }

            @Override
            public JobResultStore createJobResultStore() {
                return new EmbeddedJobResultStore();
            }
        }, partialDispatcherServices, defaultDispatcherRunnerFactory)) {
            // initial run
            DispatcherGateway dispatcherGateway = grantLeadership(dispatcherLeaderElectionService);
            final JobGraph jobGraph = createJobGraphWithBlobs();
            LOG.info("Initial job submission {}.", jobGraph.getJobID());
            dispatcherGateway.submitJob(jobGraph, TESTING_TIMEOUT).get();
            dispatcherLeaderElectionService.notLeader();
            // recovering submitted jobs
            LOG.info("Re-grant leadership first time.");
            dispatcherGateway = grantLeadership(dispatcherLeaderElectionService);
            LOG.info("Cancel recovered job {}.", jobGraph.getJobID());
            // cancellation of the job should remove everything
            // (the result future is requested before cancelling so the outcome can be awaited below)
            final CompletableFuture<JobResult> jobResultFuture = dispatcherGateway.requestJobResult(jobGraph.getJobID(), TESTING_TIMEOUT);
            dispatcherGateway.cancelJob(jobGraph.getJobID(), TESTING_TIMEOUT).get();
            // a successful cancellation should eventually remove all job information
            final JobResult jobResult = jobResultFuture.get();
            assertThat(jobResult.getApplicationStatus(), is(ApplicationStatus.CANCELED));
            dispatcherLeaderElectionService.notLeader();
            // check that the job has been removed from ZooKeeper
            final JobGraphStore submittedJobGraphStore = createZooKeeperJobGraphStore(client);
            CommonTestUtils.waitUntilCondition(() -> submittedJobGraphStore.getJobIds().isEmpty(), Deadline.fromNow(VERIFICATION_TIMEOUT), 20L);
        }
    }
    // check resource clean up
    assertThat(clusterHaStorageDir.listFiles(), is(emptyArray()));
}
use of org.apache.flink.runtime.dispatcher.DispatcherGateway in project flink by apache.
the class ZooKeeperDefaultDispatcherRunnerTest method grantLeadership.
/**
 * Grants leadership to the dispatcher under a freshly generated session id, waits until that
 * leadership has been confirmed and then connects to the confirmed leader address.
 *
 * @param dispatcherLeaderElectionService election service through which leadership is granted
 * @return gateway obtained by connecting to the confirmed leader address with the new session id
 * @throws InterruptedException if the thread is interrupted while waiting on either future
 * @throws java.util.concurrent.ExecutionException if the confirmation or the connect future
 *     completes exceptionally
 */
private DispatcherGateway grantLeadership(TestingLeaderElectionService dispatcherLeaderElectionService) throws InterruptedException, java.util.concurrent.ExecutionException {
    final UUID sessionId = UUID.randomUUID();
    dispatcherLeaderElectionService.isLeader(sessionId);

    // Block until the leadership for this session has been confirmed.
    final LeaderConnectionInfo confirmedLeader = dispatcherLeaderElectionService.getConfirmationFuture().get();

    final CompletableFuture<DispatcherGateway> gatewayFuture =
            testingRpcServiceResource
                    .getTestingRpcService()
                    .connect(confirmedLeader.getAddress(), DispatcherId.fromUuid(sessionId), DispatcherGateway.class);
    return gatewayFuture.get();
}
use of org.apache.flink.runtime.dispatcher.DispatcherGateway in project flink by apache.
the class ZooKeeperLeaderElectionITCase method testJobExecutionOnClusterWithLeaderChange.
/**
 * Tests that a job can be executed after a new leader has been elected. For all except for the
 * last leader, the job is blocking. The JobManager will be terminated while executing the
 * blocking job. Once only one JobManager is left, it is checked that a non-blocking job can be
 * successfully executed.
 */
@Test
@Ignore("FLINK-25235")
public void testJobExecutionOnClusterWithLeaderChange() throws Exception {
    final int numDispatchers = 3;
    final int numTMs = 2;
    final int numSlotsPerTM = 2;
    final Configuration configuration = ZooKeeperTestUtils.createZooKeeperHAConfig(zkServer.getConnectString(), tempFolder.newFolder().getAbsolutePath());
    // speed up refused registration retries
    configuration.setLong(ClusterOptions.REFUSED_REGISTRATION_DELAY, 50L);
    final TestingMiniClusterConfiguration miniClusterConfiguration = TestingMiniClusterConfiguration.newBuilder().setConfiguration(configuration).setNumberDispatcherResourceManagerComponents(numDispatchers).setNumTaskManagers(numTMs).setNumSlotsPerTaskManager(numSlotsPerTM).build();
    final Deadline timeout = Deadline.fromNow(TEST_TIMEOUT);
    try (TestingMiniCluster miniCluster = TestingMiniCluster.newBuilder(miniClusterConfiguration).build();
            final CuratorFrameworkWithUnhandledErrorListener curatorFramework = ZooKeeperUtils.startCuratorFramework(configuration, exception -> fail("Fatal error in curator framework."))) {
        // We need to watch for resource manager leader changes to avoid race conditions.
        final DefaultLeaderRetrievalService resourceManagerLeaderRetrieval = ZooKeeperUtils.createLeaderRetrievalService(curatorFramework.asCuratorFramework(), ZooKeeperUtils.getLeaderPathForResourceManager(), configuration);
        // One future per expected resource manager leader; they are handed to the listener below.
        @SuppressWarnings("unchecked") final CompletableFuture<String>[] resourceManagerLeaderFutures = (CompletableFuture<String>[]) new CompletableFuture[numDispatchers];
        for (int i = 0; i < numDispatchers; i++) {
            resourceManagerLeaderFutures[i] = new CompletableFuture<>();
        }
        resourceManagerLeaderRetrieval.start(new TestLeaderRetrievalListener(resourceManagerLeaderFutures));
        // Register stop() as a resource so the started retrieval service is stopped even when the
        // test body fails. Should stop() itself fail after a test failure, its exception is
        // attached as suppressed instead of replacing the original one.
        try (AutoCloseable ignored = resourceManagerLeaderRetrieval::stop) {
            miniCluster.start();
            final int parallelism = numTMs * numSlotsPerTM;
            JobGraph jobGraph = createJobGraph(parallelism);
            miniCluster.submitJob(jobGraph).get();
            String previousLeaderAddress = null;
            // Shut down every leader except the last one while the job is still running.
            for (int i = 0; i < numDispatchers - 1; i++) {
                final DispatcherGateway leaderDispatcherGateway = getNextLeadingDispatcherGateway(miniCluster, previousLeaderAddress, timeout);
                // Make sure resource manager has also changed leadership.
                resourceManagerLeaderFutures[i].get();
                previousLeaderAddress = leaderDispatcherGateway.getAddress();
                awaitRunningStatus(leaderDispatcherGateway, jobGraph, timeout);
                leaderDispatcherGateway.shutDownCluster();
            }
            final DispatcherGateway leaderDispatcherGateway = getNextLeadingDispatcherGateway(miniCluster, previousLeaderAddress, timeout);
            // Make sure resource manager has also changed leadership.
            resourceManagerLeaderFutures[numDispatchers - 1].get();
            awaitRunningStatus(leaderDispatcherGateway, jobGraph, timeout);
            CompletableFuture<JobResult> jobResultFuture = leaderDispatcherGateway.requestJobResult(jobGraph.getJobID(), RPC_TIMEOUT);
            // Release the blocking operator, then expect the job result to be successful.
            BlockingOperator.unblock();
            assertThat(jobResultFuture.get().isSuccess(), is(true));
        }
    }
}
Aggregations