use of org.apache.flink.runtime.highavailability.JobResultStore in project flink by apache.
the class DispatcherCleanupITCase method testCleanupAfterLeadershipChange.
/**
 * Checks that a finished job whose global cleanup failed under a first dispatcher is cleaned up
 * by a second dispatcher that is started with the dirty job results.
 *
 * <p>The test expects exactly two global cleanup calls in total and, at the end, an empty
 * JobGraphStore plus a (clean) entry for the job in the JobResultStore.
 */
@Test
public void testCleanupAfterLeadershipChange() throws Exception {
    final JobGraph submittedJob = createJobGraph();
    final JobID submittedJobId = submittedJob.getJobID();

    // Job graph store set up with one cleanup failure.
    // NOTE(review): presumably only the first global cleanup call fails with cleanupFailure and
    // the latch fires on the first successful one - confirm against the helper.
    final AtomicInteger globalCleanupCalls = new AtomicInteger();
    final OneShotLatch cleanupSucceeded = new OneShotLatch();
    final RuntimeException cleanupFailure = new RuntimeException("Unable to remove job graph.");
    final JobGraphStore failingOnceStore =
            createAndStartJobGraphStoreWithCleanupFailures(
                    1, cleanupFailure, globalCleanupCalls, cleanupSucceeded);
    haServices.setJobGraphStore(failingOnceStore);

    // Leader election service for the job master of the submitted job.
    final TestingLeaderElectionService jobMasterElection = new TestingLeaderElectionService();
    haServices.setJobMasterLeaderElectionService(submittedJobId, jobMasterElection);

    // First dispatcher: its fatal error handler swallows the injected cleanup failure (and only
    // counts it) while every other error is forwarded to the shared test handler.
    final CountDownLatch cleanupFailureObserved = new CountDownLatch(1);
    final Dispatcher firstDispatcher =
            createTestingDispatcherBuilder()
                    .setFatalErrorHandler(
                            error -> {
                                if (!ExceptionUtils.findThrowable(error, cleanupFailure::equals)
                                        .isPresent()) {
                                    testingFatalErrorHandlerResource
                                            .getFatalErrorHandler()
                                            .onFatalError(error);
                                    return;
                                }
                                cleanupFailureObserved.countDown();
                            })
                    .build();
    firstDispatcher.start();
    toTerminate.add(firstDispatcher);

    jobMasterElection.isLeader(UUID.randomUUID());
    final DispatcherGateway firstGateway = firstDispatcher.getSelfGateway(DispatcherGateway.class);
    firstGateway.submitJob(submittedJob, TIMEOUT).get();
    waitForJobToFinish(jobMasterElection, firstGateway, submittedJobId);
    cleanupFailureObserved.await();

    // Revoke the job master leadership.
    jobMasterElection.notLeader();
    // Stopping resets the election service's internal state so that a new contender can register.
    jobMasterElection.stop();

    // The failed cleanup must have left the job graph in place and the job result dirty.
    assertThat(cleanupSucceeded.isTriggered(), CoreMatchers.is(false));
    assertThat(
            "The JobGraph is still stored in the JobGraphStore.",
            haServices.getJobGraphStore().getJobIds(),
            CoreMatchers.is(Collections.singleton(submittedJobId)));
    assertThat(
            "The JobResultStore has this job marked as dirty.",
            haServices.getJobResultStore().getDirtyResults().stream()
                    .map(JobResult::getJobId)
                    .collect(Collectors.toSet()),
            CoreMatchers.is(Collections.singleton(submittedJobId)));

    // Second dispatcher: recovers the finished-but-dirty job and retries the cleanup.
    final Dispatcher recoveringDispatcher =
            createTestingDispatcherBuilder()
                    .setRecoveredDirtyJobs(haServices.getJobResultStore().getDirtyResults())
                    .build();
    recoveringDispatcher.start();
    toTerminate.add(recoveringDispatcher);
    jobMasterElection.isLeader(UUID.randomUUID());

    final Deadline cleanupDeadline = Deadline.fromNow(TimeUtils.toDuration(TIMEOUT));
    CommonTestUtils.waitUntilCondition(
            () -> haServices.getJobResultStore().getDirtyResults().isEmpty(), cleanupDeadline);

    assertThat(
            "The JobGraph is not stored in the JobGraphStore.",
            haServices.getJobGraphStore().getJobIds(),
            IsEmptyCollection.empty());
    assertTrue(
            "The JobResultStore has the job listed as clean.",
            haServices.getJobResultStore().hasJobResultEntry(submittedJobId));

    // Block until the successful cleanup has been signalled, then verify the overall call count.
    cleanupSucceeded.await();
    assertThat(globalCleanupCalls.get(), equalTo(2));
}
use of org.apache.flink.runtime.highavailability.JobResultStore in project flink by apache.
the class DispatcherResourceCleanupTest method testErrorHandlingIfJobCannotBeMarkedAsCleanInJobResultStore.
/**
 * Checks that an {@code IOException} thrown while marking a job result as clean is not reported
 * to the fatal error handler, while the job is still recorded as dirty beforehand.
 */
@Test
public void testErrorHandlingIfJobCannotBeMarkedAsCleanInJobResultStore() throws Exception {
    final CompletableFuture<JobResultEntry> dirtyEntryFuture = new CompletableFuture<>();
    // Store that records the dirty entry but always fails when asked to mark a result as clean.
    final JobResultStore failingOnCleanStore =
            TestingJobResultStore.builder()
                    .withCreateDirtyResultConsumer(dirtyEntryFuture::complete)
                    .withMarkResultAsCleanConsumer(
                            ignoredJobId -> {
                                throw new IOException("Expected IOException.");
                            })
                    .build();

    final TestingJobManagerRunnerFactory runnerFactory =
            startDispatcherAndSubmitJob(
                    createTestingDispatcherBuilder().setJobResultStore(failingOnCleanStore), 0);

    // Finish the job by completing the runner's result future with a FINISHED execution graph.
    final TestingJobManagerRunner runner = runnerFactory.takeCreatedJobManagerRunner();
    runner.completeResultFuture(
            new ExecutionGraphInfo(
                    new ArchivedExecutionGraphBuilder()
                            .setJobID(jobId)
                            .setState(JobStatus.FINISHED)
                            .build()));

    final CompletableFuture<? extends Throwable> fatalErrorFuture =
            testingFatalErrorHandlerResource.getFatalErrorHandler().getErrorFuture();
    try {
        final Throwable unexpectedError = fatalErrorFuture.get(100, TimeUnit.MILLISECONDS);
        fail("No error should have been reported but an " + unexpectedError.getClass() + " was handled.");
    } catch (TimeoutException expected) {
        // expected: no fatal error was reported within the waiting period
    }

    assertThat(dirtyEntryFuture.get().getJobId(), is(jobId));
}
use of org.apache.flink.runtime.highavailability.JobResultStore in project flink by apache.
the class DispatcherResourceCleanupTest method testFatalErrorIfJobCannotBeMarkedDirtyInJobResultStore.
/**
 * Checks that an {@code IOException} thrown while creating the dirty job result entry is
 * reported to the fatal error handler as a {@code FlinkException}.
 */
@Test
public void testFatalErrorIfJobCannotBeMarkedDirtyInJobResultStore() throws Exception {
    // Store that always fails when a dirty result is about to be created.
    final JobResultStore failingOnDirtyStore =
            TestingJobResultStore.builder()
                    .withCreateDirtyResultConsumer(
                            ignoredJobResult -> {
                                throw new IOException("Expected IOException.");
                            })
                    .build();

    final TestingJobManagerRunnerFactory runnerFactory =
            startDispatcherAndSubmitJob(
                    createTestingDispatcherBuilder().setJobResultStore(failingOnDirtyStore), 0);

    // Finish the job by completing the runner's result future with a FINISHED execution graph.
    final ArchivedExecutionGraph finishedGraph =
            new ArchivedExecutionGraphBuilder()
                    .setJobID(jobId)
                    .setState(JobStatus.FINISHED)
                    .build();
    final TestingJobManagerRunner runner = runnerFactory.takeCreatedJobManagerRunner();
    runner.completeResultFuture(new ExecutionGraphInfo(finishedGraph));

    final CompletableFuture<? extends Throwable> fatalErrorFuture =
            testingFatalErrorHandlerResource.getFatalErrorHandler().getErrorFuture();
    final Throwable reportedError = fatalErrorFuture.get(100, TimeUnit.MILLISECONDS);
    assertThat(reportedError, IsInstanceOf.instanceOf(FlinkException.class));

    // The error was expected, so reset the handler before the test finishes.
    testingFatalErrorHandlerResource.getFatalErrorHandler().clearError();
}
use of org.apache.flink.runtime.highavailability.JobResultStore in project flink by apache.
the class DispatcherResourceCleanupTest method testJobBeingMarkedAsCleanAfterCleanup.
/**
 * Checks that a job is marked as clean in the JobResultStore only after both the local and the
 * global cleanup have completed.
 *
 * <p>Both cleanups block on a latch. The test releases them one after the other and checks that
 * the mark-as-clean callback has not fired until the second latch is released.
 */
@Test
public void testJobBeingMarkedAsCleanAfterCleanup() throws Exception {
    final CompletableFuture<JobID> markAsCleanFuture = new CompletableFuture<>();
    final JobResultStore jobResultStore =
            TestingJobResultStore.builder()
                    .withMarkResultAsCleanConsumer(markAsCleanFuture::complete)
                    .build();

    final OneShotLatch localCleanupLatch = new OneShotLatch();
    final OneShotLatch globalCleanupLatch = new OneShotLatch();
    final TestingResourceCleanerFactory resourceCleanerFactory =
            TestingResourceCleanerFactory.builder()
                    .withLocallyCleanableResource(
                            (ignoredJobId, ignoredExecutor) -> {
                                try {
                                    localCleanupLatch.await();
                                } catch (InterruptedException e) {
                                    // Restore the interrupt flag so the executing thread's owner
                                    // can still observe the interruption after we rethrow.
                                    Thread.currentThread().interrupt();
                                    throw new RuntimeException(e);
                                }
                                return FutureUtils.completedVoidFuture();
                            })
                    .withGloballyCleanableResource(
                            (ignoredJobId, ignoredExecutor) -> {
                                try {
                                    globalCleanupLatch.await();
                                } catch (InterruptedException e) {
                                    // Restore the interrupt flag so the executing thread's owner
                                    // can still observe the interruption after we rethrow.
                                    Thread.currentThread().interrupt();
                                    throw new RuntimeException(e);
                                }
                                return FutureUtils.completedVoidFuture();
                            })
                    .build();

    final TestingDispatcher.Builder dispatcherBuilder =
            createTestingDispatcherBuilder()
                    .setJobResultStore(jobResultStore)
                    .setResourceCleanerFactory(resourceCleanerFactory);
    final TestingJobManagerRunnerFactory jobManagerRunnerFactory =
            startDispatcherAndSubmitJob(dispatcherBuilder, 0);
    finishJob(jobManagerRunnerFactory.takeCreatedJobManagerRunner());

    // Neither cleanup has been released yet.
    assertThat(markAsCleanFuture.isDone(), is(false));

    // Releasing only the local cleanup must not be enough to mark the job as clean.
    localCleanupLatch.trigger();
    assertThat(markAsCleanFuture.isDone(), is(false));

    // With the global cleanup released as well, the job is expected to be marked as clean.
    globalCleanupLatch.trigger();
    assertThat(markAsCleanFuture.get(), is(jobId));
}
use of org.apache.flink.runtime.highavailability.JobResultStore in project flink by apache.
the class ZooKeeperDefaultDispatcherRunnerTest method testResourceCleanupUnderLeadershipChange.
/**
 * See FLINK-11665.
 *
 * <p>Submits a job, revokes and re-grants dispatcher leadership, cancels the recovered job and
 * then verifies that the job graph is removed from ZooKeeper and that the HA storage directory
 * is empty.
 */
@Test
public void testResourceCleanupUnderLeadershipChange() throws Exception {
    final TestingRpcService rpcService = testingRpcServiceResource.getTestingRpcService();
    final TestingLeaderElectionService dispatcherLeaderElectionService = new TestingLeaderElectionService();
    // The Curator client is a Closeable; manage it with try-with-resources so it is released even
    // when the test fails. It is declared outermost so it outlives every component that uses it
    // and is still open while the final assertion runs.
    try (final CuratorFramework client = ZooKeeperUtils.startCuratorFramework(configuration, fatalErrorHandler).asCuratorFramework()) {
        try (final TestingHighAvailabilityServices highAvailabilityServices = new TestingHighAvailabilityServicesBuilder().setDispatcherLeaderElectionService(dispatcherLeaderElectionService).setJobMasterLeaderRetrieverFunction(jobId -> ZooKeeperUtils.createLeaderRetrievalService(client)).build()) {
            final PartialDispatcherServices partialDispatcherServices = new PartialDispatcherServices(configuration, highAvailabilityServices, CompletableFuture::new, blobServer, new TestingHeartbeatServices(), UnregisteredMetricGroups::createUnregisteredJobManagerMetricGroup, new MemoryExecutionGraphInfoStore(), fatalErrorHandler, VoidHistoryServerArchivist.INSTANCE, null, ForkJoinPool.commonPool(), new DispatcherOperationCaches());
            final DefaultDispatcherRunnerFactory defaultDispatcherRunnerFactory = DefaultDispatcherRunnerFactory.createSessionRunner(SessionDispatcherFactory.INSTANCE);
            try (final DispatcherRunner dispatcherRunner = createDispatcherRunner(rpcService, dispatcherLeaderElectionService, new JobPersistenceComponentFactory() {
                @Override
                public JobGraphStore createJobGraphStore() {
                    return createZooKeeperJobGraphStore(client);
                }

                @Override
                public JobResultStore createJobResultStore() {
                    return new EmbeddedJobResultStore();
                }
            }, partialDispatcherServices, defaultDispatcherRunnerFactory)) {
                // initial run
                DispatcherGateway dispatcherGateway = grantLeadership(dispatcherLeaderElectionService);
                final JobGraph jobGraph = createJobGraphWithBlobs();
                LOG.info("Initial job submission {}.", jobGraph.getJobID());
                dispatcherGateway.submitJob(jobGraph, TESTING_TIMEOUT).get();
                dispatcherLeaderElectionService.notLeader();
                // recovering submitted jobs
                LOG.info("Re-grant leadership first time.");
                dispatcherGateway = grantLeadership(dispatcherLeaderElectionService);
                LOG.info("Cancel recovered job {}.", jobGraph.getJobID());
                // cancellation of the job should remove everything; request the result first so the
                // future is already registered when the cancellation completes
                final CompletableFuture<JobResult> jobResultFuture = dispatcherGateway.requestJobResult(jobGraph.getJobID(), TESTING_TIMEOUT);
                dispatcherGateway.cancelJob(jobGraph.getJobID(), TESTING_TIMEOUT).get();
                // a successful cancellation should eventually remove all job information
                final JobResult jobResult = jobResultFuture.get();
                assertThat(jobResult.getApplicationStatus(), is(ApplicationStatus.CANCELED));
                dispatcherLeaderElectionService.notLeader();
                // check that the job has been removed from ZooKeeper, using a fresh store on the same client
                final JobGraphStore submittedJobGraphStore = createZooKeeperJobGraphStore(client);
                CommonTestUtils.waitUntilCondition(() -> submittedJobGraphStore.getJobIds().isEmpty(), Deadline.fromNow(VERIFICATION_TIMEOUT), 20L);
            }
        }
        // check resource clean up
        assertThat(clusterHaStorageDir.listFiles(), is(emptyArray()));
    }
}
Aggregations