Use of org.apache.flink.runtime.highavailability.nonha.embedded.EmbeddedHaServicesWithLeadershipControl in the Apache Flink project.
From the class JobDispatcherITCase, method testRecoverFromCheckpointAfterLosingAndRegainingLeadership:
@Test
public void testRecoverFromCheckpointAfterLosingAndRegainingLeadership(@TempDir Path tmpPath)
        throws Exception {
    final Deadline deadline = Deadline.fromNow(TIMEOUT);
    final Configuration configuration = new Configuration();
    configuration.set(HighAvailabilityOptions.HA_MODE, HighAvailabilityMode.ZOOKEEPER.name());
    final TestingMiniClusterConfiguration clusterConfiguration =
            TestingMiniClusterConfiguration.newBuilder().setConfiguration(configuration).build();
    final EmbeddedHaServicesWithLeadershipControl haServices =
            new EmbeddedHaServicesWithLeadershipControl(TestingUtils.defaultExecutor());
    final Configuration newConfiguration =
            new Configuration(clusterConfiguration.getConfiguration());
    final long checkpointInterval = 100;
    final JobID jobID = generateAndPersistJobGraph(newConfiguration, checkpointInterval, tmpPath);
    final TestingMiniCluster.Builder clusterBuilder =
            TestingMiniCluster.newBuilder(clusterConfiguration)
                    .setHighAvailabilityServicesSupplier(() -> haServices)
                    .setDispatcherResourceManagerComponentFactorySupplier(
                            createJobModeDispatcherResourceManagerComponentFactorySupplier(
                                    newConfiguration));
    AtLeastOneCheckpointInvokable.reset();
    try (final MiniCluster cluster = clusterBuilder.build()) {
        // start mini cluster and submit the job
        cluster.start();
        AtLeastOneCheckpointInvokable.atLeastOneCheckpointCompleted.await();
        final CompletableFuture<JobResult> firstJobResult = cluster.requestJobResult(jobID);
        haServices.revokeDispatcherLeadership();
        // make sure the leadership is revoked to avoid race conditions
        Assertions.assertEquals(
                ApplicationStatus.UNKNOWN, firstJobResult.get().getApplicationStatus());
        haServices.grantDispatcherLeadership();
        // job is suspended, wait until it's running
        awaitJobStatus(cluster, jobID, JobStatus.RUNNING, deadline);
        CommonTestUtils.waitUntilCondition(
                () ->
                        cluster.getArchivedExecutionGraph(jobID)
                                .get()
                                .getCheckpointStatsSnapshot()
                                .getLatestRestoredCheckpoint()
                                != null,
                deadline);
    }
}
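This test (and the ones that follow) relies on a small awaitJobStatus helper. A minimal sketch of such a helper is shown below, assuming it simply polls the MiniCluster via CommonTestUtils.waitUntilCondition until the expected JobStatus is reached; the actual helper in the Flink test classes may differ in detail.

// Hedged sketch of an awaitJobStatus helper, not the verified Flink implementation.
private static void awaitJobStatus(
        MiniCluster cluster, JobID jobId, JobStatus status, Deadline deadline) throws Exception {
    CommonTestUtils.waitUntilCondition(
            () -> {
                try {
                    return cluster.getJobStatus(jobId).get() == status;
                } catch (ExecutionException e) {
                    // the Dispatcher may not know the job yet, e.g. right after a leader change
                    return false;
                }
            },
            deadline);
}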
Another use of EmbeddedHaServicesWithLeadershipControl, from the class ApplicationDispatcherBootstrapITCase, method testDispatcherRecoversAfterLosingAndRegainingLeadership:
@Test
public void testDispatcherRecoversAfterLosingAndRegainingLeadership() throws Exception {
    final String blockId = UUID.randomUUID().toString();
    final Deadline deadline = Deadline.fromNow(TIMEOUT);
    final Configuration configuration = new Configuration();
    configuration.set(HighAvailabilityOptions.HA_MODE, HighAvailabilityMode.ZOOKEEPER.name());
    configuration.set(DeploymentOptions.TARGET, EmbeddedExecutor.NAME);
    configuration.set(ClientOptions.CLIENT_RETRY_PERIOD, Duration.ofMillis(100));
    final TestingMiniClusterConfiguration clusterConfiguration =
            TestingMiniClusterConfiguration.newBuilder().setConfiguration(configuration).build();
    final EmbeddedHaServicesWithLeadershipControl haServices =
            new EmbeddedHaServicesWithLeadershipControl(TestingUtils.defaultExecutor());
    final TestingMiniCluster.Builder clusterBuilder =
            TestingMiniCluster.newBuilder(clusterConfiguration)
                    .setHighAvailabilityServicesSupplier(() -> haServices)
                    .setDispatcherResourceManagerComponentFactorySupplier(
                            createApplicationModeDispatcherResourceManagerComponentFactorySupplier(
                                    clusterConfiguration.getConfiguration(),
                                    BlockingJob.getProgram(blockId)));
    try (final MiniCluster cluster = clusterBuilder.build()) {
        // start mini cluster and submit the job
        cluster.start();
        // wait until job is running
        awaitJobStatus(
                cluster, ApplicationDispatcherBootstrap.ZERO_JOB_ID, JobStatus.RUNNING, deadline);
        // make sure the operator is actually running
        BlockingJob.awaitRunning(blockId);
        final CompletableFuture<JobResult> firstJobResult =
                cluster.requestJobResult(ApplicationDispatcherBootstrap.ZERO_JOB_ID);
        haServices.revokeDispatcherLeadership();
        // make sure the leadership is revoked to avoid race conditions
        assertThat(firstJobResult.get())
                .extracting(JobResult::getApplicationStatus)
                .isEqualTo(ApplicationStatus.UNKNOWN);
        haServices.grantDispatcherLeadership();
        // job is suspended, wait until it's running
        awaitJobStatus(
                cluster, ApplicationDispatcherBootstrap.ZERO_JOB_ID, JobStatus.RUNNING, deadline);
        // unblock processing so the job can finish
        BlockingJob.unblock(blockId);
        // and wait for it to actually finish
        final JobResult secondJobResult =
                cluster.requestJobResult(ApplicationDispatcherBootstrap.ZERO_JOB_ID).get();
        assertThat(secondJobResult.isSuccess()).isTrue();
        assertThat(secondJobResult.getApplicationStatus()).isEqualTo(ApplicationStatus.SUCCEEDED);
        // the cluster should shut down automatically once the application completes
        awaitClusterStopped(cluster, deadline);
    } finally {
        BlockingJob.cleanUp(blockId);
    }
}
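BlockingJob is a small program from Flink's test sources. As an illustration of the pattern it relies on, the hypothetical sketch below shows blockId-keyed latch coordination between the job's user function and the test thread; the class and method structure here are assumptions, not the actual Flink test class.

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.CountDownLatch;

// Hypothetical sketch of the coordination behind a BlockingJob-style test program.
final class BlockingCoordination {
    private static final Map<String, CountDownLatch> RUNNING = new ConcurrentHashMap<>();
    private static final Map<String, CountDownLatch> UNBLOCKED = new ConcurrentHashMap<>();

    private static CountDownLatch latch(Map<String, CountDownLatch> latches, String blockId) {
        return latches.computeIfAbsent(blockId, ignored -> new CountDownLatch(1));
    }

    // called inside the job's user function: announce that the job is running, then park
    static void signalRunningAndBlock(String blockId) throws InterruptedException {
        latch(RUNNING, blockId).countDown();
        latch(UNBLOCKED, blockId).await();
    }

    // called from the test thread: wait until the user function has actually started
    static void awaitRunning(String blockId) throws InterruptedException {
        latch(RUNNING, blockId).await();
    }

    // called from the test thread: let the user function proceed so the job can finish
    static void unblock(String blockId) {
        latch(UNBLOCKED, blockId).countDown();
    }

    // drop the per-blockId state once the test is done
    static void cleanUp(String blockId) {
        RUNNING.remove(blockId);
        UNBLOCKED.remove(blockId);
    }
}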
Another use of EmbeddedHaServicesWithLeadershipControl, from the class ApplicationDispatcherBootstrapITCase, method testDirtyJobResultRecoveryInApplicationMode:
@Test
public void testDirtyJobResultRecoveryInApplicationMode() throws Exception {
    final Deadline deadline = Deadline.fromNow(TIMEOUT);
    final Configuration configuration = new Configuration();
    configuration.set(HighAvailabilityOptions.HA_MODE, HighAvailabilityMode.ZOOKEEPER.name());
    configuration.set(DeploymentOptions.TARGET, EmbeddedExecutor.NAME);
    configuration.set(ClientOptions.CLIENT_RETRY_PERIOD, Duration.ofMillis(100));
    final TestingMiniClusterConfiguration clusterConfiguration =
            TestingMiniClusterConfiguration.newBuilder().setConfiguration(configuration).build();
    // having a dirty entry in the JobResultStore should make the
    // ApplicationDispatcherBootstrap implementation fail to submit the job
    final JobResultStore jobResultStore = new EmbeddedJobResultStore();
    jobResultStore.createDirtyResult(
            new JobResultEntry(
                    TestingJobResultStore.createSuccessfulJobResult(
                            ApplicationDispatcherBootstrap.ZERO_JOB_ID)));
    final EmbeddedHaServicesWithLeadershipControl haServices =
            new EmbeddedHaServicesWithLeadershipControl(TestingUtils.defaultExecutor()) {
                @Override
                public JobResultStore getJobResultStore() {
                    return jobResultStore;
                }
            };
    final TestingMiniCluster.Builder clusterBuilder =
            TestingMiniCluster.newBuilder(clusterConfiguration)
                    .setHighAvailabilityServicesSupplier(() -> haServices)
                    .setDispatcherResourceManagerComponentFactorySupplier(
                            createApplicationModeDispatcherResourceManagerComponentFactorySupplier(
                                    clusterConfiguration.getConfiguration(),
                                    ErrorHandlingSubmissionJob.createPackagedProgram()));
    try (final MiniCluster cluster = clusterBuilder.build()) {
        // start mini cluster and submit the job
        cluster.start();
        // the cluster should shut down automatically once the application completes
        awaitClusterStopped(cluster, deadline);
    }
    FlinkAssertions.assertThatChainOfCauses(ErrorHandlingSubmissionJob.getSubmissionException())
            .as(
                    "The job's main method shouldn't have succeeded due to a"
                            + " DuplicateJobSubmissionException.")
            .hasAtLeastOneElementOfType(DuplicateJobSubmissionException.class);
    assertThat(jobResultStore.hasDirtyJobResultEntry(ApplicationDispatcherBootstrap.ZERO_JOB_ID))
            .isFalse();
    assertThat(jobResultStore.hasCleanJobResultEntry(ApplicationDispatcherBootstrap.ZERO_JOB_ID))
            .isTrue();
}
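TestingJobResultStore.createSuccessfulJobResult is a small test utility. A plausible sketch of what it produces is shown below, assuming it constructs a successful JobResult via JobResult.Builder; this is an assumption about the utility, not its verified source.

// Hedged sketch: a successful JobResult for the given job id, built with JobResult.Builder.
static JobResult createSuccessfulJobResult(JobID jobId) {
    return new JobResult.Builder()
            .jobId(jobId)
            .applicationStatus(ApplicationStatus.SUCCEEDED)
            .netRuntime(1L)
            .build();
}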
Another use of EmbeddedHaServicesWithLeadershipControl, from the class ApplicationDispatcherBootstrapITCase, method testSubmitFailedJobOnApplicationError:
@Test
public void testSubmitFailedJobOnApplicationError() throws Exception {
    final Deadline deadline = Deadline.fromNow(TIMEOUT);
    final JobID jobId = new JobID();
    final Configuration configuration = new Configuration();
    configuration.set(HighAvailabilityOptions.HA_MODE, HighAvailabilityMode.ZOOKEEPER.name());
    configuration.set(DeploymentOptions.TARGET, EmbeddedExecutor.NAME);
    configuration.set(ClientOptions.CLIENT_RETRY_PERIOD, Duration.ofMillis(100));
    configuration.set(DeploymentOptions.SHUTDOWN_ON_APPLICATION_FINISH, false);
    configuration.set(DeploymentOptions.SUBMIT_FAILED_JOB_ON_APPLICATION_ERROR, true);
    configuration.set(PipelineOptionsInternal.PIPELINE_FIXED_JOB_ID, jobId.toHexString());
    final TestingMiniClusterConfiguration clusterConfiguration =
            TestingMiniClusterConfiguration.newBuilder().setConfiguration(configuration).build();
    final EmbeddedHaServicesWithLeadershipControl haServices =
            new EmbeddedHaServicesWithLeadershipControl(TestingUtils.defaultExecutor());
    final TestingMiniCluster.Builder clusterBuilder =
            TestingMiniCluster.newBuilder(clusterConfiguration)
                    .setHighAvailabilityServicesSupplier(() -> haServices)
                    .setDispatcherResourceManagerComponentFactorySupplier(
                            createApplicationModeDispatcherResourceManagerComponentFactorySupplier(
                                    clusterConfiguration.getConfiguration(),
                                    FailingJob.getProgram()));
    try (final MiniCluster cluster = clusterBuilder.build()) {
        // start mini cluster and submit the job
        cluster.start();
        // wait until the failed job has been submitted
        awaitJobStatus(cluster, jobId, JobStatus.FAILED, deadline);
        final ArchivedExecutionGraph graph = cluster.getArchivedExecutionGraph(jobId).get();
        assertThat(graph.getJobID()).isEqualTo(jobId);
        assertThat(graph.getJobName()).isEqualTo(ApplicationDispatcherBootstrap.FAILED_JOB_NAME);
        assertThat(graph.getFailureInfo())
                .isNotNull()
                .extracting(ErrorInfo::getException)
                .extracting(
                        e -> e.deserializeError(Thread.currentThread().getContextClassLoader()))
                .satisfies(
                        e ->
                                assertThat(e)
                                        .isInstanceOf(ProgramInvocationException.class)
                                        .hasRootCauseInstanceOf(RuntimeException.class)
                                        .hasRootCauseMessage(FailingJob.EXCEPTION_MESSAGE));
    }
}
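FailingJob is another small program from Flink's test sources. The hypothetical sketch below illustrates the idea: a main method that throws before any pipeline is built, so that with SUBMIT_FAILED_JOB_ON_APPLICATION_ERROR enabled the bootstrap submits a synthetic FAILED job carrying that error. The class name and message constant here are assumptions for illustration only.

// Hypothetical sketch of a FailingJob-style program; not the actual Flink test class.
public final class FailingJobSketch {
    // assumed message constant, playing the role of FailingJob.EXCEPTION_MESSAGE above
    static final String EXCEPTION_MESSAGE = "Application failed on purpose";

    public static void main(String[] args) {
        // fail the application's main method before a job graph is ever produced
        throw new RuntimeException(EXCEPTION_MESSAGE);
    }
}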
Another use of EmbeddedHaServicesWithLeadershipControl, from the class LeaderChangeClusterComponentsTest, method setupClass:
@BeforeClass
public static void setupClass() throws Exception {
    highAvailabilityServices =
            new EmbeddedHaServicesWithLeadershipControl(TestingUtils.defaultExecutor());
    miniCluster =
            TestingMiniCluster.newBuilder(
                            TestingMiniClusterConfiguration.newBuilder()
                                    .setNumTaskManagers(NUM_TMS)
                                    .setNumSlotsPerTaskManager(SLOTS_PER_TM)
                                    .build())
                    .setHighAvailabilityServicesSupplier(() -> highAvailabilityServices)
                    .build();
    miniCluster.start();
}
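With the mini cluster and HA services set up this way, tests in the class can force leader changes through the HaLeadershipControl methods exposed by EmbeddedHaServicesWithLeadershipControl. The sketch below is an illustrative test flow, not a method from LeaderChangeClusterComponentsTest; createJobGraph() stands in for whatever finite job the test actually submits.

// Illustrative sketch only: force a JobMaster leader change and check that the job
// still completes. createJobGraph() is a hypothetical helper producing a finite job.
@Test
public void testJobSurvivesJobMasterLeaderChange() throws Exception {
    final JobGraph jobGraph = createJobGraph();
    miniCluster.submitJob(jobGraph).get();
    highAvailabilityServices.revokeJobMasterLeadership(jobGraph.getJobID()).get();
    highAvailabilityServices.grantJobMasterLeadership(jobGraph.getJobID()).get();
    assertThat(miniCluster.requestJobResult(jobGraph.getJobID()).get().isSuccess()).isTrue();
}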