Search in sources :

Example 1 with JobResultStore

use of org.apache.flink.runtime.highavailability.JobResultStore in project flink by apache.

the class DispatcherCleanupITCase method testCleanupNotCancellable.

/**
 * Checks that cancelling a job is rejected while its cleanup is still pending, and that the
 * job's {@link JobResultStore} entry becomes clean once the pending cleanup completes.
 */
@Test
public void testCleanupNotCancellable() throws Exception {
    final JobGraph jobGraph = createJobGraph();
    final JobID jobId = jobGraph.getJobID();

    // Register a dirty result for the job before the dispatcher is started.
    final JobResultStore store = new EmbeddedJobResultStore();
    final JobResultEntry dirtyEntry = new JobResultEntry(TestingJobResultStore.createSuccessfulJobResult(jobId));
    store.createDirtyResult(dirtyEntry);
    haServices.setJobResultStore(store);

    // Instantiates JobManagerRunner; the registry's local cleanup hands back a future that
    // only this test completes.
    final CompletableFuture<Void> pendingCleanup = new CompletableFuture<>();
    final AtomicReference<JobManagerRunner> runnerHolder = new AtomicReference<>();
    final JobManagerRunnerRegistry registry =
            TestingJobManagerRunnerRegistry.newSingleJobBuilder(runnerHolder)
                    .withLocalCleanupAsyncFunction((actualJobId, executor) -> pendingCleanup)
                    .build();

    final Dispatcher dispatcher =
            createTestingDispatcherBuilder().setJobManagerRunnerRegistry(registry).build();
    dispatcher.start();
    toTerminate.add(dispatcher);

    CommonTestUtils.waitUntilCondition(
            () -> runnerHolder.get() != null,
            Deadline.fromNow(Duration.ofSeconds(10)),
            "JobManagerRunner wasn't loaded in time.");

    final boolean stillDirty = haServices.getJobResultStore().hasDirtyJobResultEntry(jobId);
    assertThat("The JobResultStore should have this job still marked as dirty.", stillDirty, CoreMatchers.is(true));

    // Cancelling must fail for as long as the cleanup future has not been completed.
    final DispatcherGateway gateway = dispatcher.getSelfGateway(DispatcherGateway.class);
    ExecutionException cancellationFailure = null;
    try {
        gateway.cancelJob(jobId, TIMEOUT).get();
    } catch (ExecutionException e) {
        cancellationFailure = e;
    }
    if (cancellationFailure == null) {
        Assert.fail("Should fail because cancelling the cleanup is not allowed.");
    }
    assertThat(cancellationFailure, FlinkMatchers.containsCause(JobCancellationFailedException.class));

    // Let the cleanup finish and wait for the entry to be marked clean.
    pendingCleanup.complete(null);
    CommonTestUtils.waitUntilCondition(
            () -> haServices.getJobResultStore().hasCleanJobResultEntry(jobId),
            Deadline.fromNow(Duration.ofSeconds(60)),
            "The JobResultStore should have this job marked as clean now.");
}
Also used : CoreMatchers(org.hamcrest.CoreMatchers) Deadline(org.apache.flink.api.common.time.Deadline) RpcEndpoint(org.apache.flink.runtime.rpc.RpcEndpoint) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) IsEqual.equalTo(org.hamcrest.core.IsEqual.equalTo) CheckpointCoordinatorConfiguration(org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration) ExceptionUtils(org.apache.flink.util.ExceptionUtils) IsEmptyCollection(org.hamcrest.collection.IsEmptyCollection) PerJobCheckpointRecoveryFactory(org.apache.flink.runtime.checkpoint.PerJobCheckpointRecoveryFactory) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) After(org.junit.After) Duration(java.time.Duration) JobCheckpointingSettings(org.apache.flink.runtime.jobgraph.tasks.JobCheckpointingSettings) DispatcherResourceCleanerFactory(org.apache.flink.runtime.dispatcher.cleanup.DispatcherResourceCleanerFactory) BlockingQueue(java.util.concurrent.BlockingQueue) UUID(java.util.UUID) LinkedBlockingQueue(java.util.concurrent.LinkedBlockingQueue) Collectors(java.util.stream.Collectors) CountDownLatch(java.util.concurrent.CountDownLatch) List(java.util.List) TimeUtils(org.apache.flink.util.TimeUtils) TestingJobResultStore(org.apache.flink.runtime.testutils.TestingJobResultStore) Optional(java.util.Optional) JobResultStore(org.apache.flink.runtime.highavailability.JobResultStore) JobGraphStore(org.apache.flink.runtime.jobmanager.JobGraphStore) OneShotLatch(org.apache.flink.core.testutils.OneShotLatch) JobVertex(org.apache.flink.runtime.jobgraph.JobVertex) FlinkMatchers(org.apache.flink.core.testutils.FlinkMatchers) TestingJobGraphStore(org.apache.flink.runtime.testutils.TestingJobGraphStore) EmbeddedJobResultStore(org.apache.flink.runtime.highavailability.nonha.embedded.EmbeddedJobResultStore) CompletableFuture(java.util.concurrent.CompletableFuture) JobStatus(org.apache.flink.api.common.JobStatus) AtomicReference(java.util.concurrent.atomic.AtomicReference) 
TaskDeploymentDescriptor(org.apache.flink.runtime.deployment.TaskDeploymentDescriptor) JobMasterGateway(org.apache.flink.runtime.jobmaster.JobMasterGateway) JobResult(org.apache.flink.runtime.jobmaster.JobResult) FutureUtils(org.apache.flink.util.concurrent.FutureUtils) MatcherAssert.assertThat(org.hamcrest.MatcherAssert.assertThat) Before(org.junit.Before) JobGraphBuilder(org.apache.flink.runtime.jobgraph.JobGraphBuilder) TestingLeaderElectionService(org.apache.flink.runtime.leaderelection.TestingLeaderElectionService) ExecutionState(org.apache.flink.runtime.execution.ExecutionState) JobMasterId(org.apache.flink.runtime.jobmaster.JobMasterId) Assert.assertTrue(org.junit.Assert.assertTrue) Test(org.junit.Test) RpcUtils(org.apache.flink.runtime.rpc.RpcUtils) ExecutionException(java.util.concurrent.ExecutionException) JobResultEntry(org.apache.flink.runtime.highavailability.JobResultEntry) JobID(org.apache.flink.api.common.JobID) UnregisteredMetricGroups(org.apache.flink.runtime.metrics.groups.UnregisteredMetricGroups) TestingRetryStrategies(org.apache.flink.runtime.dispatcher.cleanup.TestingRetryStrategies) ForkJoinPool(java.util.concurrent.ForkJoinPool) EmbeddedCompletedCheckpointStore(org.apache.flink.runtime.checkpoint.EmbeddedCompletedCheckpointStore) JobManagerRunner(org.apache.flink.runtime.jobmaster.JobManagerRunner) CommonTestUtils(org.apache.flink.runtime.testutils.CommonTestUtils) Assert(org.junit.Assert) Collections(java.util.Collections) NoOpInvokable(org.apache.flink.runtime.testtasks.NoOpInvokable) AtomicReference(java.util.concurrent.atomic.AtomicReference) TestingJobResultStore(org.apache.flink.runtime.testutils.TestingJobResultStore) JobResultStore(org.apache.flink.runtime.highavailability.JobResultStore) EmbeddedJobResultStore(org.apache.flink.runtime.highavailability.nonha.embedded.EmbeddedJobResultStore) EmbeddedJobResultStore(org.apache.flink.runtime.highavailability.nonha.embedded.EmbeddedJobResultStore) 
JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) CompletableFuture(java.util.concurrent.CompletableFuture) JobResultEntry(org.apache.flink.runtime.highavailability.JobResultEntry) ExecutionException(java.util.concurrent.ExecutionException) JobManagerRunner(org.apache.flink.runtime.jobmaster.JobManagerRunner) JobID(org.apache.flink.api.common.JobID) Test(org.junit.Test)

Example 2 with JobResultStore

use of org.apache.flink.runtime.highavailability.JobResultStore in project flink by apache.

the class JobDispatcherLeaderProcessFactoryFactory method createFactory.

/**
 * Builds a {@link JobDispatcherLeaderProcessFactory} from the retrieved {@link JobGraph} and
 * the dirty job results obtained through a newly created {@link JobResultStore}.
 *
 * <p>The job graph and the recovered dirty job result are each handed to the resulting factory
 * as {@code null} when the corresponding lookup yields an empty {@code Optional}.
 *
 * @throws FlinkRuntimeException if retrieving the job graph fails with a {@link FlinkException}
 */
@Override
public JobDispatcherLeaderProcessFactory createFactory(JobPersistenceComponentFactory jobPersistenceComponentFactory, Executor ioExecutor, RpcService rpcService, PartialDispatcherServices partialDispatcherServices, FatalErrorHandler fatalErrorHandler) {
    final JobGraph retrievedJobGraph;
    try {
        retrievedJobGraph =
                Preconditions.checkNotNull(
                        jobGraphRetriever.retrieveJobGraph(
                                partialDispatcherServices.getConfiguration()));
    } catch (FlinkException e) {
        throw new FlinkRuntimeException("Could not retrieve the JobGraph.", e);
    }

    final JobResultStore store = jobPersistenceComponentFactory.createJobResultStore();
    final Collection<JobResult> dirtyResults = getDirtyJobResults(store);

    // Resolve both lookups up front; either one may come back empty.
    final JobResult dirtyResultOrNull =
            extractDirtyJobResult(dirtyResults, retrievedJobGraph).orElse(null);
    final JobGraph jobGraphOrNull =
            getJobGraphBasedOnDirtyJobResults(retrievedJobGraph, dirtyResults).orElse(null);

    final DefaultDispatcherGatewayServiceFactory gatewayServiceFactory =
            new DefaultDispatcherGatewayServiceFactory(
                    JobDispatcherFactory.INSTANCE, rpcService, partialDispatcherServices);

    return new JobDispatcherLeaderProcessFactory(
            gatewayServiceFactory,
            jobGraphOrNull,
            dirtyResultOrNull,
            store,
            fatalErrorHandler);
}
Also used : JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) JobResult(org.apache.flink.runtime.jobmaster.JobResult) FlinkRuntimeException(org.apache.flink.util.FlinkRuntimeException) JobResultStore(org.apache.flink.runtime.highavailability.JobResultStore) FlinkException(org.apache.flink.util.FlinkException)

Example 3 with JobResultStore

use of org.apache.flink.runtime.highavailability.JobResultStore in project flink by apache.

the class ApplicationDispatcherGatewayServiceFactory method create.

/**
 * Creates a {@link Dispatcher} that bootstraps the application via an
 * {@link ApplicationDispatcherBootstrap}, starts it, and wraps it in a
 * {@link DefaultDispatcherGatewayService}.
 *
 * @throws FlinkRuntimeException if creating the dispatcher throws any exception
 */
@Override
public AbstractDispatcherLeaderProcess.DispatcherGatewayService create(DispatcherId fencingToken, Collection<JobGraph> recoveredJobs, Collection<JobResult> recoveredDirtyJobResults, JobGraphWriter jobGraphWriter, JobResultStore jobResultStore) {
    final List<JobID> recoveredJobIds = getRecoveredJobIds(recoveredJobs);

    final Dispatcher createdDispatcher;
    try {
        createdDispatcher =
                dispatcherFactory.createDispatcher(
                        rpcService,
                        fencingToken,
                        recoveredJobs,
                        recoveredDirtyJobResults,
                        (dispatcherGateway, scheduledExecutor, errorHandler) ->
                                new ApplicationDispatcherBootstrap(
                                        application,
                                        recoveredJobIds,
                                        configuration,
                                        dispatcherGateway,
                                        scheduledExecutor,
                                        errorHandler),
                        PartialDispatcherServicesWithJobPersistenceComponents.from(
                                partialDispatcherServices, jobGraphWriter, jobResultStore));
    } catch (Exception e) {
        throw new FlinkRuntimeException("Could not create the Dispatcher rpc endpoint.", e);
    }

    // Starting happens outside the try block, so start failures are not wrapped.
    createdDispatcher.start();
    return DefaultDispatcherGatewayService.from(createdDispatcher);
}
Also used : DispatcherId(org.apache.flink.runtime.dispatcher.DispatcherId) Dispatcher(org.apache.flink.runtime.dispatcher.Dispatcher) PartialDispatcherServices(org.apache.flink.runtime.dispatcher.PartialDispatcherServices) FlinkRuntimeException(org.apache.flink.util.FlinkRuntimeException) PartialDispatcherServicesWithJobPersistenceComponents(org.apache.flink.runtime.dispatcher.PartialDispatcherServicesWithJobPersistenceComponents) Collection(java.util.Collection) Configuration(org.apache.flink.configuration.Configuration) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) AbstractDispatcherLeaderProcess(org.apache.flink.runtime.dispatcher.runner.AbstractDispatcherLeaderProcess) Collectors(java.util.stream.Collectors) JobResult(org.apache.flink.runtime.jobmaster.JobResult) List(java.util.List) JobID(org.apache.flink.api.common.JobID) RpcService(org.apache.flink.runtime.rpc.RpcService) Internal(org.apache.flink.annotation.Internal) PackagedProgram(org.apache.flink.client.program.PackagedProgram) DispatcherFactory(org.apache.flink.runtime.dispatcher.DispatcherFactory) JobResultStore(org.apache.flink.runtime.highavailability.JobResultStore) Preconditions.checkNotNull(org.apache.flink.util.Preconditions.checkNotNull) JobGraphWriter(org.apache.flink.runtime.jobmanager.JobGraphWriter) DefaultDispatcherGatewayService(org.apache.flink.runtime.dispatcher.runner.DefaultDispatcherGatewayService) FlinkRuntimeException(org.apache.flink.util.FlinkRuntimeException) Dispatcher(org.apache.flink.runtime.dispatcher.Dispatcher) JobID(org.apache.flink.api.common.JobID) FlinkRuntimeException(org.apache.flink.util.FlinkRuntimeException)

Example 4 with JobResultStore

use of org.apache.flink.runtime.highavailability.JobResultStore in project flink by apache.

the class ApplicationDispatcherBootstrapITCase method testDirtyJobResultRecoveryInApplicationMode.

/**
 * Starts a mini cluster in application mode with a dirty {@link JobResultStore} entry already
 * present for {@code ApplicationDispatcherBootstrap.ZERO_JOB_ID}, then checks that the
 * submission failed with a {@code DuplicateJobSubmissionException} in its cause chain and that
 * the entry ended up clean rather than dirty.
 */
@Test
public void testDirtyJobResultRecoveryInApplicationMode() throws Exception {
    final Deadline deadline = Deadline.fromNow(TIMEOUT);

    final Configuration configuration = new Configuration();
    configuration.set(HighAvailabilityOptions.HA_MODE, HighAvailabilityMode.ZOOKEEPER.name());
    configuration.set(DeploymentOptions.TARGET, EmbeddedExecutor.NAME);
    configuration.set(ClientOptions.CLIENT_RETRY_PERIOD, Duration.ofMillis(100));
    final TestingMiniClusterConfiguration clusterConfiguration =
            TestingMiniClusterConfiguration.newBuilder()
                    .setConfiguration(configuration)
                    .build();

    // having a dirty entry in the JobResultStore should make the ApplicationDispatcherBootstrap
    // implementation fail to submit the job
    final JobResultStore jobResultStore = new EmbeddedJobResultStore();
    jobResultStore.createDirtyResult(
            new JobResultEntry(
                    TestingJobResultStore.createSuccessfulJobResult(
                            ApplicationDispatcherBootstrap.ZERO_JOB_ID)));

    // HA services that always expose the pre-populated store created above.
    final EmbeddedHaServicesWithLeadershipControl haServices = new EmbeddedHaServicesWithLeadershipControl(TestingUtils.defaultExecutor()) {

        @Override
        public JobResultStore getJobResultStore() {
            return jobResultStore;
        }
    };

    final TestingMiniCluster.Builder clusterBuilder =
            TestingMiniCluster.newBuilder(clusterConfiguration)
                    .setHighAvailabilityServicesSupplier(() -> haServices)
                    .setDispatcherResourceManagerComponentFactorySupplier(
                            createApplicationModeDispatcherResourceManagerComponentFactorySupplier(
                                    clusterConfiguration.getConfiguration(),
                                    ErrorHandlingSubmissionJob.createPackagedProgram()));

    try (final MiniCluster cluster = clusterBuilder.build()) {
        // start mini cluster and submit the job
        cluster.start();
        // the cluster should shut down automatically once the application completes
        awaitClusterStopped(cluster, deadline);
    }

    FlinkAssertions.assertThatChainOfCauses(ErrorHandlingSubmissionJob.getSubmissionException())
            .as("The job's main method shouldn't have been succeeded due to a DuplicateJobSubmissionException.")
            .hasAtLeastOneElementOfType(DuplicateJobSubmissionException.class);

    assertThat(jobResultStore.hasDirtyJobResultEntry(ApplicationDispatcherBootstrap.ZERO_JOB_ID))
            .isFalse();
    assertThat(jobResultStore.hasCleanJobResultEntry(ApplicationDispatcherBootstrap.ZERO_JOB_ID))
            .isTrue();
}
Also used : TestingMiniCluster(org.apache.flink.runtime.minicluster.TestingMiniCluster) TestingMiniClusterConfiguration(org.apache.flink.runtime.minicluster.TestingMiniClusterConfiguration) Configuration(org.apache.flink.configuration.Configuration) TestingMiniClusterConfiguration(org.apache.flink.runtime.minicluster.TestingMiniClusterConfiguration) Deadline(org.apache.flink.api.common.time.Deadline) JobResultEntry(org.apache.flink.runtime.highavailability.JobResultEntry) EmbeddedHaServicesWithLeadershipControl(org.apache.flink.runtime.highavailability.nonha.embedded.EmbeddedHaServicesWithLeadershipControl) MiniCluster(org.apache.flink.runtime.minicluster.MiniCluster) TestingMiniCluster(org.apache.flink.runtime.minicluster.TestingMiniCluster) EmbeddedJobResultStore(org.apache.flink.runtime.highavailability.nonha.embedded.EmbeddedJobResultStore) TestingJobResultStore(org.apache.flink.runtime.testutils.TestingJobResultStore) JobResultStore(org.apache.flink.runtime.highavailability.JobResultStore) EmbeddedJobResultStore(org.apache.flink.runtime.highavailability.nonha.embedded.EmbeddedJobResultStore) Test(org.junit.jupiter.api.Test)

Example 5 with JobResultStore

use of org.apache.flink.runtime.highavailability.JobResultStore in project flink by apache.

the class DefaultDispatcherGatewayServiceFactory method create.

/**
 * Creates a {@link Dispatcher} with a {@link NoOpDispatcherBootstrap}, starts it, and wraps it
 * in a {@link DefaultDispatcherGatewayService}.
 *
 * @throws FlinkRuntimeException if creating the dispatcher throws any exception
 */
@Override
public AbstractDispatcherLeaderProcess.DispatcherGatewayService create(DispatcherId fencingToken, Collection<JobGraph> recoveredJobs, Collection<JobResult> recoveredDirtyJobResults, JobGraphWriter jobGraphWriter, JobResultStore jobResultStore) {
    final Dispatcher createdDispatcher;
    try {
        createdDispatcher =
                dispatcherFactory.createDispatcher(
                        rpcService,
                        fencingToken,
                        recoveredJobs,
                        recoveredDirtyJobResults,
                        (dispatcherGateway, scheduledExecutor, errorHandler) ->
                                new NoOpDispatcherBootstrap(),
                        PartialDispatcherServicesWithJobPersistenceComponents.from(
                                partialDispatcherServices, jobGraphWriter, jobResultStore));
    } catch (Exception e) {
        throw new FlinkRuntimeException("Could not create the Dispatcher rpc endpoint.", e);
    }

    // Starting happens outside the try block, so start failures are not wrapped.
    createdDispatcher.start();
    return DefaultDispatcherGatewayService.from(createdDispatcher);
}
Also used : DispatcherId(org.apache.flink.runtime.dispatcher.DispatcherId) Dispatcher(org.apache.flink.runtime.dispatcher.Dispatcher) PartialDispatcherServices(org.apache.flink.runtime.dispatcher.PartialDispatcherServices) FlinkRuntimeException(org.apache.flink.util.FlinkRuntimeException) PartialDispatcherServicesWithJobPersistenceComponents(org.apache.flink.runtime.dispatcher.PartialDispatcherServicesWithJobPersistenceComponents) Collection(java.util.Collection) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) JobResult(org.apache.flink.runtime.jobmaster.JobResult) RpcService(org.apache.flink.runtime.rpc.RpcService) NoOpDispatcherBootstrap(org.apache.flink.runtime.dispatcher.NoOpDispatcherBootstrap) DispatcherFactory(org.apache.flink.runtime.dispatcher.DispatcherFactory) JobResultStore(org.apache.flink.runtime.highavailability.JobResultStore) JobGraphWriter(org.apache.flink.runtime.jobmanager.JobGraphWriter) NoOpDispatcherBootstrap(org.apache.flink.runtime.dispatcher.NoOpDispatcherBootstrap) FlinkRuntimeException(org.apache.flink.util.FlinkRuntimeException) Dispatcher(org.apache.flink.runtime.dispatcher.Dispatcher) FlinkRuntimeException(org.apache.flink.util.FlinkRuntimeException)

Aggregations

JobResultStore (org.apache.flink.runtime.highavailability.JobResultStore)10 JobGraph (org.apache.flink.runtime.jobgraph.JobGraph)9 CompletableFuture (java.util.concurrent.CompletableFuture)6 JobID (org.apache.flink.api.common.JobID)6 Configuration (org.apache.flink.configuration.Configuration)6 Optional (java.util.Optional)5 ExecutionException (java.util.concurrent.ExecutionException)5 JobStatus (org.apache.flink.api.common.JobStatus)5 OneShotLatch (org.apache.flink.core.testutils.OneShotLatch)5 JobResultEntry (org.apache.flink.runtime.highavailability.JobResultEntry)5 TestingJobResultStore (org.apache.flink.runtime.testutils.TestingJobResultStore)5 ExceptionUtils (org.apache.flink.util.ExceptionUtils)5 After (org.junit.After)5 Before (org.junit.Before)5 Test (org.junit.Test)5 IOException (java.io.IOException)4 Deadline (org.apache.flink.api.common.time.Deadline)4 Time (org.apache.flink.api.common.time.Time)4 BlobServer (org.apache.flink.runtime.blob.BlobServer)4 BlobUtils (org.apache.flink.runtime.blob.BlobUtils)4