Use of org.apache.flink.runtime.highavailability.JobResultEntry in project flink by apache.

In class DispatcherCleanupITCase, method testCleanupNotCancellable:
@Test
public void testCleanupNotCancellable() throws Exception {
    final JobGraph jobGraph = createJobGraph();
    final JobID jobId = jobGraph.getJobID();

    final JobResultStore jobResultStore = new EmbeddedJobResultStore();
    jobResultStore.createDirtyResult(
            new JobResultEntry(TestingJobResultStore.createSuccessfulJobResult(jobId)));
    haServices.setJobResultStore(jobResultStore);

    // Instantiates JobManagerRunner
    final CompletableFuture<Void> jobManagerRunnerCleanupFuture = new CompletableFuture<>();
    final AtomicReference<JobManagerRunner> jobManagerRunnerEntry = new AtomicReference<>();
    final JobManagerRunnerRegistry jobManagerRunnerRegistry =
            TestingJobManagerRunnerRegistry.newSingleJobBuilder(jobManagerRunnerEntry)
                    .withLocalCleanupAsyncFunction((actualJobId, executor) -> jobManagerRunnerCleanupFuture)
                    .build();

    final Dispatcher dispatcher =
            createTestingDispatcherBuilder()
                    .setJobManagerRunnerRegistry(jobManagerRunnerRegistry)
                    .build();
    dispatcher.start();
    toTerminate.add(dispatcher);

    CommonTestUtils.waitUntilCondition(
            () -> jobManagerRunnerEntry.get() != null,
            Deadline.fromNow(Duration.ofSeconds(10)),
            "JobManagerRunner wasn't loaded in time.");

    assertThat(
            "The JobResultStore should have this job still marked as dirty.",
            haServices.getJobResultStore().hasDirtyJobResultEntry(jobId),
            CoreMatchers.is(true));

    final DispatcherGateway dispatcherGateway = dispatcher.getSelfGateway(DispatcherGateway.class);
    try {
        dispatcherGateway.cancelJob(jobId, TIMEOUT).get();
        Assert.fail("Should fail because cancelling the cleanup is not allowed.");
    } catch (ExecutionException e) {
        assertThat(e, FlinkMatchers.containsCause(JobCancellationFailedException.class));
    }

    jobManagerRunnerCleanupFuture.complete(null);

    CommonTestUtils.waitUntilCondition(
            () -> haServices.getJobResultStore().hasCleanJobResultEntry(jobId),
            Deadline.fromNow(Duration.ofSeconds(60)),
            "The JobResultStore should have this job marked as clean now.");
}
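For reference, the two-phase lifecycle the test exercises through the Dispatcher can be reproduced directly against a JobResultStore: a terminal result is first persisted as a dirty entry and only marked clean once cleanup has finished. The following is a minimal sketch, assuming the synchronous JobResultStore interface used in these snippets and the test-scope helpers EmbeddedJobResultStore and TestingJobResultStore; the helper import paths and the markResultAsClean call are assumptions, not taken from the snippets above.

import org.apache.flink.api.common.JobID;
import org.apache.flink.runtime.highavailability.JobResultEntry;
import org.apache.flink.runtime.highavailability.JobResultStore;
// Note: the following two import paths for test-scope helpers are assumptions.
import org.apache.flink.runtime.highavailability.nonha.embedded.EmbeddedJobResultStore;
import org.apache.flink.runtime.testutils.TestingJobResultStore;

public class JobResultStoreLifecycleSketch {

    public static void main(String[] args) throws Exception {
        final JobID jobId = new JobID();
        final JobResultStore jobResultStore = new EmbeddedJobResultStore();

        // Phase 1: the terminal result is known, but resource cleanup has not finished yet,
        // so the entry is persisted as "dirty".
        jobResultStore.createDirtyResult(
                new JobResultEntry(TestingJobResultStore.createSuccessfulJobResult(jobId)));
        System.out.println("dirty: " + jobResultStore.hasDirtyJobResultEntry(jobId)); // true

        // Phase 2: once all cleanup steps have completed, the entry is marked as clean
        // (markResultAsClean is assumed to be part of this JobResultStore version).
        jobResultStore.markResultAsClean(jobId);
        System.out.println("clean: " + jobResultStore.hasCleanJobResultEntry(jobId)); // true
    }
}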
In class Dispatcher, method jobReachedTerminalState:
protected CleanupJobState jobReachedTerminalState(ExecutionGraphInfo executionGraphInfo) {
    final ArchivedExecutionGraph archivedExecutionGraph = executionGraphInfo.getArchivedExecutionGraph();
    final JobStatus terminalJobStatus = archivedExecutionGraph.getState();
    Preconditions.checkArgument(
            terminalJobStatus.isTerminalState(),
            "Job %s is in state %s which is not terminal.",
            archivedExecutionGraph.getJobID(),
            terminalJobStatus);

    // the failureInfo contains the reason why the job failed or was suspended, but for
    // finished/canceled jobs it may contain the last cause of a restart (if there were any);
    // for finished/canceled jobs we don't want to print it because it is misleading
    final boolean isFailureInfoRelatedToJobTermination =
            terminalJobStatus == JobStatus.SUSPENDED || terminalJobStatus == JobStatus.FAILED;

    if (archivedExecutionGraph.getFailureInfo() != null && isFailureInfoRelatedToJobTermination) {
        log.info(
                "Job {} reached terminal state {}.\n{}",
                archivedExecutionGraph.getJobID(),
                terminalJobStatus,
                archivedExecutionGraph.getFailureInfo().getExceptionAsString().trim());
    } else {
        log.info("Job {} reached terminal state {}.", archivedExecutionGraph.getJobID(), terminalJobStatus);
    }

    archiveExecutionGraph(executionGraphInfo);

    if (terminalJobStatus.isGloballyTerminalState()) {
        final JobID jobId = executionGraphInfo.getJobId();
        try {
            if (jobResultStore.hasCleanJobResultEntry(jobId)) {
                log.warn("Job {} is already marked as clean but clean up was triggered again.", jobId);
            } else if (!jobResultStore.hasDirtyJobResultEntry(jobId)) {
                jobResultStore.createDirtyResult(
                        new JobResultEntry(JobResult.createFrom(executionGraphInfo.getArchivedExecutionGraph())));
                log.info(
                        "Job {} has been registered for cleanup in the JobResultStore after reaching a terminal state.",
                        jobId);
            }
        } catch (IOException e) {
            fatalErrorHandler.onFatalError(
                    new FlinkException(
                            String.format(
                                    "The job %s couldn't be marked as pre-cleanup finished in JobResultStore.",
                                    jobId),
                            e));
        }
    }

    return terminalJobStatus.isGloballyTerminalState() ? CleanupJobState.GLOBAL : CleanupJobState.LOCAL;
}
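The return value hinges on JobStatus.isGloballyTerminalState(): FINISHED, CANCELED and FAILED are globally terminal and lead to GLOBAL cleanup plus registration in the JobResultStore, while SUSPENDED is terminal only for this JobManager (for example after losing leadership), so only LOCAL cleanup runs and the HA data stays available for recovery. A small standalone illustration of that distinction, not part of the Dispatcher itself:

import org.apache.flink.api.common.JobStatus;

public class TerminalStateSketch {

    public static void main(String[] args) {
        // FINISHED, CANCELED and FAILED are globally terminal: the result is final across
        // all JobManagers, so the Dispatcher persists it in the JobResultStore.
        System.out.println(JobStatus.FINISHED.isGloballyTerminalState()); // true
        System.out.println(JobStatus.CANCELED.isGloballyTerminalState()); // true
        System.out.println(JobStatus.FAILED.isGloballyTerminalState());   // true

        // SUSPENDED is terminal only locally (e.g. after a loss of leadership); the job may
        // still be recovered elsewhere, hence CleanupJobState.LOCAL and no JobResultStore entry.
        System.out.println(JobStatus.SUSPENDED.isTerminalState());         // true
        System.out.println(JobStatus.SUSPENDED.isGloballyTerminalState()); // false
    }
}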
In class ApplicationDispatcherBootstrapITCase, method testDirtyJobResultRecoveryInApplicationMode:
@Test
public void testDirtyJobResultRecoveryInApplicationMode() throws Exception {
    final Deadline deadline = Deadline.fromNow(TIMEOUT);
    final Configuration configuration = new Configuration();
    configuration.set(HighAvailabilityOptions.HA_MODE, HighAvailabilityMode.ZOOKEEPER.name());
    configuration.set(DeploymentOptions.TARGET, EmbeddedExecutor.NAME);
    configuration.set(ClientOptions.CLIENT_RETRY_PERIOD, Duration.ofMillis(100));
    final TestingMiniClusterConfiguration clusterConfiguration =
            TestingMiniClusterConfiguration.newBuilder().setConfiguration(configuration).build();

    // having a dirty entry in the JobResultStore should make the
    // ApplicationDispatcherBootstrap implementation fail to submit the job
    final JobResultStore jobResultStore = new EmbeddedJobResultStore();
    jobResultStore.createDirtyResult(
            new JobResultEntry(
                    TestingJobResultStore.createSuccessfulJobResult(
                            ApplicationDispatcherBootstrap.ZERO_JOB_ID)));
    final EmbeddedHaServicesWithLeadershipControl haServices =
            new EmbeddedHaServicesWithLeadershipControl(TestingUtils.defaultExecutor()) {

                @Override
                public JobResultStore getJobResultStore() {
                    return jobResultStore;
                }
            };

    final TestingMiniCluster.Builder clusterBuilder =
            TestingMiniCluster.newBuilder(clusterConfiguration)
                    .setHighAvailabilityServicesSupplier(() -> haServices)
                    .setDispatcherResourceManagerComponentFactorySupplier(
                            createApplicationModeDispatcherResourceManagerComponentFactorySupplier(
                                    clusterConfiguration.getConfiguration(),
                                    ErrorHandlingSubmissionJob.createPackagedProgram()));
    try (final MiniCluster cluster = clusterBuilder.build()) {
        // start mini cluster and submit the job
        cluster.start();
        // the cluster should shut down automatically once the application completes
        awaitClusterStopped(cluster, deadline);
    }

    FlinkAssertions.assertThatChainOfCauses(ErrorHandlingSubmissionJob.getSubmissionException())
            .as("The job's main method shouldn't have succeeded due to a DuplicateJobSubmissionException.")
            .hasAtLeastOneElementOfType(DuplicateJobSubmissionException.class);
    assertThat(jobResultStore.hasDirtyJobResultEntry(ApplicationDispatcherBootstrap.ZERO_JOB_ID)).isFalse();
    assertThat(jobResultStore.hasCleanJobResultEntry(ApplicationDispatcherBootstrap.ZERO_JOB_ID)).isTrue();
}
In class JobMasterServiceLeadershipRunnerTest, method testJobAlreadyDone:
@Test
public void testJobAlreadyDone() throws Exception {
    final JobID jobId = new JobID();
    final JobResult jobResult = TestingJobResultStore.createJobResult(jobId, ApplicationStatus.UNKNOWN);
    jobResultStore.createDirtyResult(new JobResultEntry(jobResult));

    try (JobManagerRunner jobManagerRunner =
            newJobMasterServiceLeadershipRunnerBuilder()
                    .setJobMasterServiceProcessFactory(
                            TestingJobMasterServiceProcessFactory.newBuilder().setJobId(jobId).build())
                    .build()) {
        jobManagerRunner.start();
        leaderElectionService.isLeader(UUID.randomUUID());

        final CompletableFuture<JobManagerRunnerResult> resultFuture = jobManagerRunner.getResultFuture();
        JobManagerRunnerResult result = resultFuture.get();
        assertEquals(JobStatus.FAILED, result.getExecutionGraphInfo().getArchivedExecutionGraph().getState());
    }
}
In class DispatcherTest, method testDuplicateJobSubmissionWithGloballyTerminatedJobId:
@Test
public void testDuplicateJobSubmissionWithGloballyTerminatedJobId() throws Exception {
    final JobResult jobResult =
            TestingJobResultStore.createJobResult(jobGraph.getJobID(), ApplicationStatus.SUCCEEDED);
    haServices.getJobResultStore().createDirtyResult(new JobResultEntry(jobResult));
    dispatcher =
            createAndStartDispatcher(
                    heartbeatServices,
                    haServices,
                    new ExpectedJobIdJobManagerRunnerFactory(jobId, createdJobManagerRunnerLatch));
    final DispatcherGateway dispatcherGateway = dispatcher.getSelfGateway(DispatcherGateway.class);

    final CompletableFuture<Acknowledge> submitFuture = dispatcherGateway.submitJob(jobGraph, TIMEOUT);
    final ExecutionException executionException = assertThrows(ExecutionException.class, submitFuture::get);
    assertTrue(executionException.getCause() instanceof DuplicateJobSubmissionException);
    final DuplicateJobSubmissionException duplicateException =
            (DuplicateJobSubmissionException) executionException.getCause();
    assertTrue(duplicateException.isGloballyTerminated());
}
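The assertions above reduce to one rule: a job id that already has either a dirty or a clean entry in the JobResultStore counts as globally terminated and must be rejected as a duplicate submission. A minimal sketch of that check, assuming the synchronous JobResultStore interface used in these snippets; the helper class and method names below are made up for illustration and are not the Dispatcher's actual code path.

import java.io.IOException;

import org.apache.flink.api.common.JobID;
import org.apache.flink.runtime.highavailability.JobResultStore;

final class DuplicateSubmissionCheckSketch {

    private DuplicateSubmissionCheckSketch() {}

    /**
     * Returns true if the JobResultStore already holds a result for the given job id,
     * i.e. the job reached a globally terminal state before (cleanup pending or done).
     */
    static boolean isGloballyTerminated(JobResultStore jobResultStore, JobID jobId) throws IOException {
        return jobResultStore.hasDirtyJobResultEntry(jobId)
                || jobResultStore.hasCleanJobResultEntry(jobId);
    }
}

A caller would translate a true result into a DuplicateJobSubmissionException whose isGloballyTerminated() flag is set, which is exactly what the test asserts.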