Use of org.apache.flink.runtime.jobmanager.JobGraphStore in the Apache Flink project.
Class SessionDispatcherLeaderProcessTest, method recoverJobs_withRecoveryFailure_failsFatally.
/**
 * Configures a job graph store that is seeded with a single job graph but throws on every
 * recovery attempt, then delegates to {@code runJobRecoveryFailureTest} with that same
 * exception instance.
 */
@Test
public void recoverJobs_withRecoveryFailure_failsFatally() throws Exception {
    final FlinkException recoveryFailure = new FlinkException("Test exception");

    // One job graph is registered up front; the recovery callback always throws.
    jobGraphStore =
            TestingJobGraphStore.newBuilder()
                    .setInitialJobGraphs(Collections.singleton(JOB_GRAPH))
                    .setRecoverJobGraphFunction(
                            (unusedFirst, unusedSecond) -> {
                                throw recoveryFailure;
                            })
                    .build();

    runJobRecoveryFailureTest(recoveryFailure);
}
Use of org.apache.flink.runtime.jobmanager.JobGraphStore in the Apache Flink project.
Class JobDispatcherLeaderProcessFactoryFactoryTest, method createDispatcherLeaderProcessFactoryFromTestInstance.
/**
 * Creates a {@link JobDispatcherLeaderProcessFactory} through a fresh
 * {@link JobDispatcherLeaderProcessFactoryFactory} wired with testing components.
 *
 * @param jobGraph job graph returned by the retriever handed to the factory-factory; may be
 *     {@code null}
 * @param dirtyJobResult job result exposed as the dirty result of the testing job result
 *     store; may be {@code null}
 * @param storageDir directory whose {@code File} form is passed when building the partial
 *     dispatcher services
 * @return the factory produced by {@code createFactory}
 * @throws IOException declared by the original signature
 */
private static JobDispatcherLeaderProcessFactory createDispatcherLeaderProcessFactoryFromTestInstance(
        @Nullable JobGraph jobGraph, @Nullable JobResult dirtyJobResult, Path storageDir)
        throws IOException {
    // The same two stores back both the persistence components and the HA services.
    final JobGraphStore graphStore = new StandaloneJobGraphStore();
    final TestingJobResultStore resultStore =
            TestingJobResultStore.builder()
                    .withGetDirtyResultsSupplier(() -> CollectionUtil.ofNullable(dirtyJobResult))
                    .build();

    final JobDispatcherLeaderProcessFactoryFactory factoryUnderTest =
            new JobDispatcherLeaderProcessFactoryFactory(ignoredConfig -> jobGraph);

    final TestingJobPersistenceComponentFactory persistenceComponents =
            new TestingJobPersistenceComponentFactory(graphStore, resultStore);

    final TestingHighAvailabilityServicesBuilder haServicesBuilder =
            new TestingHighAvailabilityServicesBuilder()
                    .setJobGraphStore(graphStore)
                    .setJobResultStore(resultStore);

    return factoryUnderTest.createFactory(
            persistenceComponents,
            Executors.directExecutor(),
            new TestingRpcService(),
            TestingPartialDispatcherServices.builder()
                    .withHighAvailabilityServices(haServicesBuilder.build())
                    .build(storageDir.toFile(), new Configuration()),
            NoOpFatalErrorHandler.INSTANCE);
}
Use of org.apache.flink.runtime.jobmanager.JobGraphStore in the Apache Flink project.
Class DispatcherCleanupITCase, method createAndStartJobGraphStoreWithCleanupFailures.
/**
 * Builds and starts a testing {@link JobGraphStore} whose global cleanup callback fails a
 * fixed number of times before it succeeds.
 *
 * @param numberOfCleanupFailures how many cleanup invocations complete exceptionally before
 *     the callback starts succeeding
 * @param throwable the error used to fail the returned future
 * @param actualCleanupCallCount counter incremented on every cleanup invocation
 * @param successfulCleanupLatch latch triggered on each non-failing cleanup invocation
 * @return the started store
 * @throws Exception if building or starting the store fails
 */
private JobGraphStore createAndStartJobGraphStoreWithCleanupFailures(
        int numberOfCleanupFailures,
        Throwable throwable,
        AtomicInteger actualCleanupCallCount,
        OneShotLatch successfulCleanupLatch)
        throws Exception {
    final AtomicInteger remainingFailures = new AtomicInteger(numberOfCleanupFailures);

    final JobGraphStore store =
            TestingJobGraphStore.newBuilder()
                    .setGlobalCleanupFunction(
                            (unusedJobId, unusedExecutor) -> {
                                actualCleanupCallCount.incrementAndGet();

                                // The budget is decremented on every call, failing or not.
                                final boolean failThisCall =
                                        remainingFailures.getAndDecrement() > 0;
                                if (!failThisCall) {
                                    successfulCleanupLatch.trigger();
                                    return FutureUtils.completedVoidFuture();
                                }
                                return FutureUtils.completedExceptionally(throwable);
                            })
                    .build();

    store.start(null);
    return store;
}
Use of org.apache.flink.runtime.jobmanager.JobGraphStore in the Apache Flink project.
Class DispatcherCleanupITCase, method testCleanupThroughRetries.
/**
 * Submits a job to a dispatcher whose job graph store fails global cleanup a fixed number of
 * times, with a cleanup retry strategy created for that same number of retries. Checks that
 * cleanup ran once more than the number of failures, that the store lists no job ids, and
 * that the job result store eventually has an entry for the job.
 */
@Test
public void testCleanupThroughRetries() throws Exception {
    final JobGraph jobGraph = createJobGraph();
    final JobID jobId = jobGraph.getJobID();

    // Job graph store whose global cleanup fails `failingAttempts` times before succeeding.
    final AtomicInteger cleanupInvocations = new AtomicInteger();
    final OneShotLatch cleanupSucceeded = new OneShotLatch();
    final int failingAttempts = 5;
    final RuntimeException transientFailure =
            new RuntimeException("Expected RuntimeException: Unable to remove job graph.");
    final JobGraphStore failingStore =
            createAndStartJobGraphStoreWithCleanupFailures(
                    failingAttempts, transientFailure, cleanupInvocations, cleanupSucceeded);
    haServices.setJobGraphStore(failingStore);

    // Leader election service for the job master of this job.
    final TestingLeaderElectionService jobMasterElection = new TestingLeaderElectionService();
    haServices.setJobMasterLeaderElectionService(jobId, jobMasterElection);

    // Cleaner factory using a retry strategy created with the number of injected failures.
    final JobManagerRunnerRegistry runnerRegistry = new DefaultJobManagerRunnerRegistry(2);
    final DispatcherResourceCleanerFactory cleanerFactory =
            new DispatcherResourceCleanerFactory(
                    ForkJoinPool.commonPool(),
                    TestingRetryStrategies.createWithNumberOfRetries(failingAttempts),
                    runnerRegistry,
                    haServices.getJobGraphStore(),
                    blobServer,
                    haServices,
                    UnregisteredMetricGroups.createUnregisteredJobManagerMetricGroup());
    final Dispatcher dispatcher =
            createTestingDispatcherBuilder().setResourceCleanerFactory(cleanerFactory).build();
    dispatcher.start();
    toTerminate.add(dispatcher);

    jobMasterElection.isLeader(UUID.randomUUID());

    final DispatcherGateway gateway = dispatcher.getSelfGateway(DispatcherGateway.class);
    gateway.submitJob(jobGraph, TIMEOUT).get();
    waitForJobToFinish(jobMasterElection, gateway, jobId);

    // Blocks until a cleanup invocation has gone through without the injected failure.
    cleanupSucceeded.await();

    // All failing attempts plus the one successful attempt.
    assertThat(cleanupInvocations.get(), equalTo(failingAttempts + 1));
    assertThat(
            "The JobGraph should be removed from JobGraphStore.",
            haServices.getJobGraphStore().getJobIds(),
            IsEmptyCollection.empty());
    CommonTestUtils.waitUntilCondition(
            () -> haServices.getJobResultStore().hasJobResultEntry(jobId),
            Deadline.fromNow(Duration.ofMinutes(5)),
            "The JobResultStore should have this job marked as clean.");
}
Use of org.apache.flink.runtime.jobmanager.JobGraphStore in the Apache Flink project.
Class DispatcherCleanupITCase, method testCleanupAfterLeadershipChange.
/**
 * Runs a job on a first dispatcher whose job graph store fails the first global cleanup,
 * waits for that error to reach the fatal error handler, then starts a second dispatcher
 * with the dirty job results and checks that the cleanup completes there.
 */
@Test
public void testCleanupAfterLeadershipChange() throws Exception {
    final JobGraph jobGraph = createJobGraph();
    final JobID jobId = jobGraph.getJobID();

    // Job graph store that fails exactly one global cleanup call before succeeding.
    final AtomicInteger cleanupInvocations = new AtomicInteger();
    final OneShotLatch cleanupSucceeded = new OneShotLatch();
    final RuntimeException transientFailure =
            new RuntimeException("Unable to remove job graph.");
    final JobGraphStore failingStore =
            createAndStartJobGraphStoreWithCleanupFailures(
                    1, transientFailure, cleanupInvocations, cleanupSucceeded);
    haServices.setJobGraphStore(failingStore);

    // Leader election service for the job master of this job.
    final TestingLeaderElectionService jobMasterElection = new TestingLeaderElectionService();
    haServices.setJobMasterLeaderElectionService(jobId, jobMasterElection);

    // First dispatcher: no resource cleaner factory is set here (original note: "no retries
    // on cleanup"). The injected failure is intercepted by a custom fatal error handler.
    final CountDownLatch cleanupErrorObserved = new CountDownLatch(1);
    final Dispatcher firstDispatcher =
            createTestingDispatcherBuilder()
                    .setFatalErrorHandler(
                            error -> {
                                final boolean isInjectedFailure =
                                        ExceptionUtils.findThrowable(
                                                        error, transientFailure::equals)
                                                .isPresent();
                                if (!isInjectedFailure) {
                                    // Anything else goes to the regular fatal error handler.
                                    testingFatalErrorHandlerResource
                                            .getFatalErrorHandler()
                                            .onFatalError(error);
                                    return;
                                }
                                cleanupErrorObserved.countDown();
                            })
                    .build();
    firstDispatcher.start();
    toTerminate.add(firstDispatcher);

    jobMasterElection.isLeader(UUID.randomUUID());

    final DispatcherGateway gateway = firstDispatcher.getSelfGateway(DispatcherGateway.class);
    gateway.submitJob(jobGraph, TIMEOUT).get();
    waitForJobToFinish(jobMasterElection, gateway, jobId);

    cleanupErrorObserved.await();

    // Remove job master leadership.
    jobMasterElection.notLeader();
    // This will clear internal state of election service, so a new contender can register.
    jobMasterElection.stop();

    // After the failed cleanup, the job is still in both stores and no cleanup has succeeded.
    assertThat(cleanupSucceeded.isTriggered(), CoreMatchers.is(false));
    assertThat(
            "The JobGraph is still stored in the JobGraphStore.",
            haServices.getJobGraphStore().getJobIds(),
            CoreMatchers.is(Collections.singleton(jobId)));
    assertThat(
            "The JobResultStore has this job marked as dirty.",
            haServices.getJobResultStore().getDirtyResults().stream()
                    .map(JobResult::getJobId)
                    .collect(Collectors.toSet()),
            CoreMatchers.is(Collections.singleton(jobId)));

    // Second dispatcher, handed the dirty results so it restores the finished job.
    final Dispatcher secondDispatcher =
            createTestingDispatcherBuilder()
                    .setRecoveredDirtyJobs(haServices.getJobResultStore().getDirtyResults())
                    .build();
    secondDispatcher.start();
    toTerminate.add(secondDispatcher);

    jobMasterElection.isLeader(UUID.randomUUID());

    CommonTestUtils.waitUntilCondition(
            () -> haServices.getJobResultStore().getDirtyResults().isEmpty(),
            Deadline.fromNow(TimeUtils.toDuration(TIMEOUT)));

    assertThat(
            "The JobGraph is not stored in the JobGraphStore.",
            haServices.getJobGraphStore().getJobIds(),
            IsEmptyCollection.empty());
    assertTrue(
            "The JobResultStore has the job listed as clean.",
            haServices.getJobResultStore().hasJobResultEntry(jobId));

    // Wait for the successful cleanup: one failed call plus one successful call in total.
    cleanupSucceeded.await();
    assertThat(cleanupInvocations.get(), equalTo(2));
}
Aggregations