Search in sources :

Example 1 with JobResultEntry

use of org.apache.flink.runtime.highavailability.JobResultEntry in project flink by apache.

the class DispatcherCleanupITCase method testCleanupNotCancellable.

/**
 * Starts a dispatcher against a result store that already holds a dirty entry for the job and
 * checks that {@code cancelJob} fails with a {@link JobCancellationFailedException} while the
 * runner's cleanup future is still incomplete. After the future is completed, the test waits
 * until the result store reports a clean entry for the job.
 */
@Test
public void testCleanupNotCancellable() throws Exception {
    final JobGraph graph = createJobGraph();
    final JobID jobId = graph.getJobID();

    // Pre-populate the result store with a dirty entry before the dispatcher is started.
    final JobResultStore preSeededStore = new EmbeddedJobResultStore();
    final JobResultEntry dirtyEntry =
            new JobResultEntry(TestingJobResultStore.createSuccessfulJobResult(jobId));
    preSeededStore.createDirtyResult(dirtyEntry);
    haServices.setJobResultStore(preSeededStore);

    // The registry hands back this future from its local cleanup function, so the test
    // decides when the cleanup is allowed to finish.
    final CompletableFuture<Void> pendingCleanup = new CompletableFuture<>();
    final AtomicReference<JobManagerRunner> runnerHolder = new AtomicReference<>();
    final JobManagerRunnerRegistry runnerRegistry =
            TestingJobManagerRunnerRegistry.newSingleJobBuilder(runnerHolder)
                    .withLocalCleanupAsyncFunction((actualJobId, executor) -> pendingCleanup)
                    .build();

    final Dispatcher dispatcher =
            createTestingDispatcherBuilder()
                    .setJobManagerRunnerRegistry(runnerRegistry)
                    .build();
    dispatcher.start();
    toTerminate.add(dispatcher);

    // Wait until a runner has been placed into the holder.
    CommonTestUtils.waitUntilCondition(
            () -> runnerHolder.get() != null,
            Deadline.fromNow(Duration.ofSeconds(10)),
            "JobManagerRunner wasn't loaded in time.");
    assertThat(
            "The JobResultStore should have this job still marked as dirty.",
            haServices.getJobResultStore().hasDirtyJobResultEntry(jobId),
            CoreMatchers.is(true));

    final DispatcherGateway gateway = dispatcher.getSelfGateway(DispatcherGateway.class);
    try {
        gateway.cancelJob(jobId, TIMEOUT).get();
        Assert.fail("Should fail because cancelling the cleanup is not allowed.");
    } catch (ExecutionException expected) {
        assertThat(
                expected,
                FlinkMatchers.containsCause(JobCancellationFailedException.class));
    }

    // Let the cleanup finish and wait for the entry to be reported as clean.
    pendingCleanup.complete(null);
    CommonTestUtils.waitUntilCondition(
            () -> haServices.getJobResultStore().hasCleanJobResultEntry(jobId),
            Deadline.fromNow(Duration.ofSeconds(60)),
            "The JobResultStore should have this job marked as clean now.");
}
Also used : CoreMatchers(org.hamcrest.CoreMatchers) Deadline(org.apache.flink.api.common.time.Deadline) RpcEndpoint(org.apache.flink.runtime.rpc.RpcEndpoint) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) IsEqual.equalTo(org.hamcrest.core.IsEqual.equalTo) CheckpointCoordinatorConfiguration(org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration) ExceptionUtils(org.apache.flink.util.ExceptionUtils) IsEmptyCollection(org.hamcrest.collection.IsEmptyCollection) PerJobCheckpointRecoveryFactory(org.apache.flink.runtime.checkpoint.PerJobCheckpointRecoveryFactory) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) After(org.junit.After) Duration(java.time.Duration) JobCheckpointingSettings(org.apache.flink.runtime.jobgraph.tasks.JobCheckpointingSettings) DispatcherResourceCleanerFactory(org.apache.flink.runtime.dispatcher.cleanup.DispatcherResourceCleanerFactory) BlockingQueue(java.util.concurrent.BlockingQueue) UUID(java.util.UUID) LinkedBlockingQueue(java.util.concurrent.LinkedBlockingQueue) Collectors(java.util.stream.Collectors) CountDownLatch(java.util.concurrent.CountDownLatch) List(java.util.List) TimeUtils(org.apache.flink.util.TimeUtils) TestingJobResultStore(org.apache.flink.runtime.testutils.TestingJobResultStore) Optional(java.util.Optional) JobResultStore(org.apache.flink.runtime.highavailability.JobResultStore) JobGraphStore(org.apache.flink.runtime.jobmanager.JobGraphStore) OneShotLatch(org.apache.flink.core.testutils.OneShotLatch) JobVertex(org.apache.flink.runtime.jobgraph.JobVertex) FlinkMatchers(org.apache.flink.core.testutils.FlinkMatchers) TestingJobGraphStore(org.apache.flink.runtime.testutils.TestingJobGraphStore) EmbeddedJobResultStore(org.apache.flink.runtime.highavailability.nonha.embedded.EmbeddedJobResultStore) CompletableFuture(java.util.concurrent.CompletableFuture) JobStatus(org.apache.flink.api.common.JobStatus) AtomicReference(java.util.concurrent.atomic.AtomicReference) 
TaskDeploymentDescriptor(org.apache.flink.runtime.deployment.TaskDeploymentDescriptor) JobMasterGateway(org.apache.flink.runtime.jobmaster.JobMasterGateway) JobResult(org.apache.flink.runtime.jobmaster.JobResult) FutureUtils(org.apache.flink.util.concurrent.FutureUtils) MatcherAssert.assertThat(org.hamcrest.MatcherAssert.assertThat) Before(org.junit.Before) JobGraphBuilder(org.apache.flink.runtime.jobgraph.JobGraphBuilder) TestingLeaderElectionService(org.apache.flink.runtime.leaderelection.TestingLeaderElectionService) ExecutionState(org.apache.flink.runtime.execution.ExecutionState) JobMasterId(org.apache.flink.runtime.jobmaster.JobMasterId) Assert.assertTrue(org.junit.Assert.assertTrue) Test(org.junit.Test) RpcUtils(org.apache.flink.runtime.rpc.RpcUtils) ExecutionException(java.util.concurrent.ExecutionException) JobResultEntry(org.apache.flink.runtime.highavailability.JobResultEntry) JobID(org.apache.flink.api.common.JobID) UnregisteredMetricGroups(org.apache.flink.runtime.metrics.groups.UnregisteredMetricGroups) TestingRetryStrategies(org.apache.flink.runtime.dispatcher.cleanup.TestingRetryStrategies) ForkJoinPool(java.util.concurrent.ForkJoinPool) EmbeddedCompletedCheckpointStore(org.apache.flink.runtime.checkpoint.EmbeddedCompletedCheckpointStore) JobManagerRunner(org.apache.flink.runtime.jobmaster.JobManagerRunner) CommonTestUtils(org.apache.flink.runtime.testutils.CommonTestUtils) Assert(org.junit.Assert) Collections(java.util.Collections) NoOpInvokable(org.apache.flink.runtime.testtasks.NoOpInvokable) AtomicReference(java.util.concurrent.atomic.AtomicReference) TestingJobResultStore(org.apache.flink.runtime.testutils.TestingJobResultStore) JobResultStore(org.apache.flink.runtime.highavailability.JobResultStore) EmbeddedJobResultStore(org.apache.flink.runtime.highavailability.nonha.embedded.EmbeddedJobResultStore) EmbeddedJobResultStore(org.apache.flink.runtime.highavailability.nonha.embedded.EmbeddedJobResultStore) 
JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) CompletableFuture(java.util.concurrent.CompletableFuture) JobResultEntry(org.apache.flink.runtime.highavailability.JobResultEntry) ExecutionException(java.util.concurrent.ExecutionException) JobManagerRunner(org.apache.flink.runtime.jobmaster.JobManagerRunner) JobID(org.apache.flink.api.common.JobID) Test(org.junit.Test)

Example 2 with JobResultEntry

use of org.apache.flink.runtime.highavailability.JobResultEntry in project flink by apache.

the class Dispatcher method jobReachedTerminalState.

/**
 * Processes a job whose execution graph has reached a terminal state: logs the state, archives
 * the execution graph and, for globally terminal states, makes sure the job has a dirty entry
 * in the job result store unless an entry already exists.
 *
 * @param executionGraphInfo information about the execution graph of the terminated job; its
 *     archived graph must be in a terminal state
 * @return {@code CleanupJobState.GLOBAL} if the terminal state is globally terminal, otherwise
 *     {@code CleanupJobState.LOCAL}
 */
protected CleanupJobState jobReachedTerminalState(ExecutionGraphInfo executionGraphInfo) {
    final ArchivedExecutionGraph archivedGraph = executionGraphInfo.getArchivedExecutionGraph();
    final JobStatus finalStatus = archivedGraph.getState();
    Preconditions.checkArgument(
            finalStatus.isTerminalState(),
            "Job %s is in state %s which is not terminal.",
            archivedGraph.getJobID(),
            finalStatus);

    // The failure info explains the termination only for failed/suspended jobs. For
    // finished/canceled jobs it may merely hold the cause of an earlier restart, which would
    // be misleading to print.
    final boolean failureExplainsTermination =
            finalStatus == JobStatus.SUSPENDED || finalStatus == JobStatus.FAILED;
    if (archivedGraph.getFailureInfo() == null || !failureExplainsTermination) {
        log.info("Job {} reached terminal state {}.", archivedGraph.getJobID(), finalStatus);
    } else {
        log.info(
                "Job {} reached terminal state {}.\n{}",
                archivedGraph.getJobID(),
                finalStatus,
                archivedGraph.getFailureInfo().getExceptionAsString().trim());
    }

    archiveExecutionGraph(executionGraphInfo);

    // Only globally terminal jobs are registered in the job result store.
    if (!finalStatus.isGloballyTerminalState()) {
        return CleanupJobState.LOCAL;
    }

    final JobID jobId = executionGraphInfo.getJobId();
    try {
        if (jobResultStore.hasCleanJobResultEntry(jobId)) {
            log.warn(
                    "Job {} is already marked as clean but clean up was triggered again.",
                    jobId);
        } else if (!jobResultStore.hasDirtyJobResultEntry(jobId)) {
            jobResultStore.createDirtyResult(
                    new JobResultEntry(
                            JobResult.createFrom(
                                    executionGraphInfo.getArchivedExecutionGraph())));
            log.info(
                    "Job {} has been registered for cleanup in the JobResultStore after reaching a terminal state.",
                    jobId);
        }
    } catch (IOException e) {
        // Failing to access the result store is reported to the fatal error handler.
        fatalErrorHandler.onFatalError(
                new FlinkException(
                        String.format(
                                "The job %s couldn't be marked as pre-cleanup finished in JobResultStore.",
                                jobId),
                        e));
    }
    return CleanupJobState.GLOBAL;
}
Also used : JobStatus(org.apache.flink.api.common.JobStatus) ArchivedExecutionGraph(org.apache.flink.runtime.executiongraph.ArchivedExecutionGraph) JobResultEntry(org.apache.flink.runtime.highavailability.JobResultEntry) IOException(java.io.IOException) JobID(org.apache.flink.api.common.JobID) FlinkException(org.apache.flink.util.FlinkException)

Example 3 with JobResultEntry

use of org.apache.flink.runtime.highavailability.JobResultEntry in project flink by apache.

the class ApplicationDispatcherBootstrapITCase method testDirtyJobResultRecoveryInApplicationMode.

/**
 * Runs a mini cluster in application mode against a job result store that already contains a
 * dirty entry for {@code ApplicationDispatcherBootstrap.ZERO_JOB_ID}. Asserts that the
 * submission exception chain contains a {@link DuplicateJobSubmissionException} and that the
 * entry is no longer dirty but clean once the cluster has stopped.
 */
@Test
public void testDirtyJobResultRecoveryInApplicationMode() throws Exception {
    final Deadline testDeadline = Deadline.fromNow(TIMEOUT);

    final Configuration config = new Configuration();
    config.set(HighAvailabilityOptions.HA_MODE, HighAvailabilityMode.ZOOKEEPER.name());
    config.set(DeploymentOptions.TARGET, EmbeddedExecutor.NAME);
    config.set(ClientOptions.CLIENT_RETRY_PERIOD, Duration.ofMillis(100));
    final TestingMiniClusterConfiguration miniClusterConfig =
            TestingMiniClusterConfiguration.newBuilder().setConfiguration(config).build();

    // A dirty entry in the JobResultStore should make the ApplicationDispatcherBootstrap
    // implementation fail to submit the job.
    final JobResultStore preSeededStore = new EmbeddedJobResultStore();
    final JobResultEntry dirtyEntry =
            new JobResultEntry(
                    TestingJobResultStore.createSuccessfulJobResult(
                            ApplicationDispatcherBootstrap.ZERO_JOB_ID));
    preSeededStore.createDirtyResult(dirtyEntry);

    // HA services that always hand out the pre-seeded store.
    final EmbeddedHaServicesWithLeadershipControl haServicesWithStore =
            new EmbeddedHaServicesWithLeadershipControl(TestingUtils.defaultExecutor()) {

                @Override
                public JobResultStore getJobResultStore() {
                    return preSeededStore;
                }
            };

    final TestingMiniCluster.Builder miniClusterBuilder =
            TestingMiniCluster.newBuilder(miniClusterConfig)
                    .setHighAvailabilityServicesSupplier(() -> haServicesWithStore)
                    .setDispatcherResourceManagerComponentFactorySupplier(
                            createApplicationModeDispatcherResourceManagerComponentFactorySupplier(
                                    miniClusterConfig.getConfiguration(),
                                    ErrorHandlingSubmissionJob.createPackagedProgram()));

    try (final MiniCluster miniCluster = miniClusterBuilder.build()) {
        // start mini cluster and submit the job
        miniCluster.start();
        // the cluster should shut down automatically once the application completes
        awaitClusterStopped(miniCluster, testDeadline);
    }

    FlinkAssertions.assertThatChainOfCauses(ErrorHandlingSubmissionJob.getSubmissionException())
            .as("The job's main method shouldn't have been succeeded due to a DuplicateJobSubmissionException.")
            .hasAtLeastOneElementOfType(DuplicateJobSubmissionException.class);
    assertThat(preSeededStore.hasDirtyJobResultEntry(ApplicationDispatcherBootstrap.ZERO_JOB_ID))
            .isFalse();
    assertThat(preSeededStore.hasCleanJobResultEntry(ApplicationDispatcherBootstrap.ZERO_JOB_ID))
            .isTrue();
}
Also used : TestingMiniCluster(org.apache.flink.runtime.minicluster.TestingMiniCluster) TestingMiniClusterConfiguration(org.apache.flink.runtime.minicluster.TestingMiniClusterConfiguration) Configuration(org.apache.flink.configuration.Configuration) TestingMiniClusterConfiguration(org.apache.flink.runtime.minicluster.TestingMiniClusterConfiguration) Deadline(org.apache.flink.api.common.time.Deadline) JobResultEntry(org.apache.flink.runtime.highavailability.JobResultEntry) EmbeddedHaServicesWithLeadershipControl(org.apache.flink.runtime.highavailability.nonha.embedded.EmbeddedHaServicesWithLeadershipControl) MiniCluster(org.apache.flink.runtime.minicluster.MiniCluster) TestingMiniCluster(org.apache.flink.runtime.minicluster.TestingMiniCluster) EmbeddedJobResultStore(org.apache.flink.runtime.highavailability.nonha.embedded.EmbeddedJobResultStore) TestingJobResultStore(org.apache.flink.runtime.testutils.TestingJobResultStore) JobResultStore(org.apache.flink.runtime.highavailability.JobResultStore) EmbeddedJobResultStore(org.apache.flink.runtime.highavailability.nonha.embedded.EmbeddedJobResultStore) Test(org.junit.jupiter.api.Test)

Example 4 with JobResultEntry

use of org.apache.flink.runtime.highavailability.JobResultEntry in project flink by apache.

the class JobMasterServiceLeadershipRunnerTest method testJobAlreadyDone.

/**
 * Registers a dirty job result with {@code ApplicationStatus.UNKNOWN} for a fresh job id, then
 * starts a runner for that job, grants it leadership and asserts that the archived execution
 * graph of the runner's result is in state {@code JobStatus.FAILED}.
 */
@Test
public void testJobAlreadyDone() throws Exception {
    final JobID jobId = new JobID();
    final JobResult existingResult =
            TestingJobResultStore.createJobResult(jobId, ApplicationStatus.UNKNOWN);
    jobResultStore.createDirtyResult(new JobResultEntry(existingResult));

    try (JobManagerRunner runner =
            newJobMasterServiceLeadershipRunnerBuilder()
                    .setJobMasterServiceProcessFactory(
                            TestingJobMasterServiceProcessFactory.newBuilder()
                                    .setJobId(jobId)
                                    .build())
                    .build()) {
        runner.start();
        leaderElectionService.isLeader(UUID.randomUUID());

        // Block until the runner has produced its result.
        final JobManagerRunnerResult runnerResult = runner.getResultFuture().get();
        final JobStatus reportedState =
                runnerResult.getExecutionGraphInfo().getArchivedExecutionGraph().getState();
        assertEquals(JobStatus.FAILED, reportedState);
    }
}
Also used : JobResultEntry(org.apache.flink.runtime.highavailability.JobResultEntry) JobID(org.apache.flink.api.common.JobID) Test(org.junit.Test)

Example 5 with JobResultEntry

use of org.apache.flink.runtime.highavailability.JobResultEntry in project flink by apache.

the class DispatcherTest method testDuplicateJobSubmissionWithGloballyTerminatedJobId.

/**
 * Stores a dirty {@code ApplicationStatus.SUCCEEDED} result for the test job graph's id, then
 * submits that job graph and asserts that the submission fails with a {@link
 * DuplicateJobSubmissionException} whose {@code isGloballyTerminated()} flag is set.
 */
@Test
public void testDuplicateJobSubmissionWithGloballyTerminatedJobId() throws Exception {
    final JobResult finishedResult =
            TestingJobResultStore.createJobResult(
                    jobGraph.getJobID(), ApplicationStatus.SUCCEEDED);
    haServices.getJobResultStore().createDirtyResult(new JobResultEntry(finishedResult));

    dispatcher =
            createAndStartDispatcher(
                    heartbeatServices,
                    haServices,
                    new ExpectedJobIdJobManagerRunnerFactory(
                            jobId, createdJobManagerRunnerLatch));
    final DispatcherGateway gateway = dispatcher.getSelfGateway(DispatcherGateway.class);

    final CompletableFuture<Acknowledge> submission = gateway.submitJob(jobGraph, TIMEOUT);
    final ExecutionException submissionFailure =
            assertThrows(ExecutionException.class, submission::get);

    // The wrapped cause must be the duplicate-submission error flagged as globally terminated.
    final Throwable cause = submissionFailure.getCause();
    assertTrue(cause instanceof DuplicateJobSubmissionException);
    assertTrue(((DuplicateJobSubmissionException) cause).isGloballyTerminated());
}
Also used : JobResult(org.apache.flink.runtime.jobmaster.JobResult) Acknowledge(org.apache.flink.runtime.messages.Acknowledge) JobResultEntry(org.apache.flink.runtime.highavailability.JobResultEntry) ExecutionException(java.util.concurrent.ExecutionException) DuplicateJobSubmissionException(org.apache.flink.runtime.client.DuplicateJobSubmissionException) Test(org.junit.Test)

Aggregations

JobResultEntry (org.apache.flink.runtime.highavailability.JobResultEntry) 6, JobID (org.apache.flink.api.common.JobID) 4, ExecutionException (java.util.concurrent.ExecutionException) 3, JobStatus (org.apache.flink.api.common.JobStatus) 3, JobResultStore (org.apache.flink.runtime.highavailability.JobResultStore) 3, Test (org.junit.Test) 3, IOException (java.io.IOException) 2, Optional (java.util.Optional) 2, CompletableFuture (java.util.concurrent.CompletableFuture) 2, Deadline (org.apache.flink.api.common.time.Deadline) 2, Configuration (org.apache.flink.configuration.Configuration) 2, OneShotLatch (org.apache.flink.core.testutils.OneShotLatch) 2, DuplicateJobSubmissionException (org.apache.flink.runtime.client.DuplicateJobSubmissionException) 2, ArchivedExecutionGraph (org.apache.flink.runtime.executiongraph.ArchivedExecutionGraph) 2, EmbeddedJobResultStore (org.apache.flink.runtime.highavailability.nonha.embedded.EmbeddedJobResultStore) 2, JobGraph (org.apache.flink.runtime.jobgraph.JobGraph) 2, JobManagerRunner (org.apache.flink.runtime.jobmaster.JobManagerRunner) 2, JobResult (org.apache.flink.runtime.jobmaster.JobResult) 2, Acknowledge (org.apache.flink.runtime.messages.Acknowledge) 2, TestingJobResultStore (org.apache.flink.runtime.testutils.TestingJobResultStore) 2