use of org.apache.flink.runtime.checkpoint.CompletedCheckpointStore in project flink by apache.
the class CheckpointResourcesCleanupRunner method cleanupCheckpoints.
/**
 * Shuts down the completed checkpoint store and the checkpoint ID counter of this job.
 *
 * <p>Both shutdowns are always attempted, even if the first one fails. Failures are merged
 * through {@link ExceptionUtils#firstOrSuppressed} and rethrown once both attempts are done.
 *
 * @throws Exception if shutting down either component failed
 */
private void cleanupCheckpoints() throws Exception {
    final CompletedCheckpointStore store = createCompletedCheckpointStore();
    final CheckpointIDCounter idCounter = createCheckpointIDCounter();

    Exception failure = null;

    // Step 1: release the completed checkpoint store.
    try {
        store.shutdown(getJobStatus(), checkpointsCleaner);
    } catch (Exception storeFailure) {
        failure = storeFailure;
    }

    // Step 2: release the ID counter regardless of the outcome of step 1.
    try {
        idCounter.shutdown(getJobStatus());
    } catch (Exception counterFailure) {
        failure = ExceptionUtils.firstOrSuppressed(counterFailure, failure);
    }

    if (failure == null) {
        return;
    }
    throw failure;
}
use of org.apache.flink.runtime.checkpoint.CompletedCheckpointStore in project flink by apache.
the class ZooKeeperUtils method createCompletedCheckpoints.
/**
 * Builds a {@link DefaultCompletedCheckpointStore} that is backed by a {@link
 * ZooKeeperStateHandleStore}.
 *
 * <p>The checkpoints already present in the state handle store are retrieved first and are
 * handed to both the new store and the shared state registry created for it.
 *
 * @param client The {@link CuratorFramework} ZooKeeper client to use
 * @param configuration {@link Configuration} object; must not be {@code null}
 * @param maxNumberOfCheckpointsToRetain The maximum number of checkpoints to retain
 * @param sharedStateRegistryFactory factory creating the shared state registry that is passed
 *     to the store
 * @param ioExecutor executor handed to {@code sharedStateRegistryFactory} when creating the
 *     registry
 * @param executor to run ZooKeeper callbacks
 * @return {@link DefaultCompletedCheckpointStore} instance
 * @throws Exception if the completed checkpoint store cannot be created
 */
public static CompletedCheckpointStore createCompletedCheckpoints(
        CuratorFramework client,
        Configuration configuration,
        int maxNumberOfCheckpointsToRetain,
        SharedStateRegistryFactory sharedStateRegistryFactory,
        Executor ioExecutor,
        Executor executor)
        throws Exception {
    checkNotNull(configuration, "Configuration");

    // File-system backed storage for the checkpoint metadata referenced from ZooKeeper.
    final RetrievableStateStorageHelper<CompletedCheckpoint> storageHelper =
            createFileSystemStateStorage(configuration, HA_STORAGE_COMPLETED_CHECKPOINT);

    final ZooKeeperStateHandleStore<CompletedCheckpoint> handleStore =
            createZooKeeperStateHandleStore(client, getCheckpointsPath(), storageHelper);

    // Recover whatever checkpoints are already registered before constructing the store.
    final Collection<CompletedCheckpoint> recoveredCheckpoints =
            DefaultCompletedCheckpointStoreUtils.retrieveCompletedCheckpoints(
                    handleStore, ZooKeeperCheckpointStoreUtil.INSTANCE);

    final CompletedCheckpointStore checkpointStore =
            new DefaultCompletedCheckpointStore<>(
                    maxNumberOfCheckpointsToRetain,
                    handleStore,
                    ZooKeeperCheckpointStoreUtil.INSTANCE,
                    recoveredCheckpoints,
                    sharedStateRegistryFactory.create(ioExecutor, recoveredCheckpoints),
                    executor);

    LOG.info(
            "Initialized {} in '{}' with {}.",
            DefaultCompletedCheckpointStore.class.getSimpleName(),
            handleStore,
            getCheckpointsPath());

    return checkpointStore;
}
use of org.apache.flink.runtime.checkpoint.CompletedCheckpointStore in project flink by apache.
the class SchedulerUtilsTest method testSettingMaxNumberOfCheckpointsToRetain.
/**
 * Verifies that the value configured under {@link
 * CheckpointingOptions#MAX_RETAINED_CHECKPOINTS} is reported by the store created through
 * {@link SchedulerUtils#createCompletedCheckpointStore}.
 */
@Test
public void testSettingMaxNumberOfCheckpointsToRetain() throws Exception {
    final int expectedRetainedCheckpoints = 10;

    // Configure the retention limit on the job manager configuration.
    final Configuration config = new Configuration();
    config.setInteger(
            CheckpointingOptions.MAX_RETAINED_CHECKPOINTS, expectedRetainedCheckpoints);

    final CompletedCheckpointStore store =
            SchedulerUtils.createCompletedCheckpointStore(
                    config,
                    new StandaloneCheckpointRecoveryFactory(),
                    Executors.directExecutor(),
                    log,
                    new JobID());

    assertEquals(expectedRetainedCheckpoints, store.getMaxNumberOfRetainedCheckpoints());
}
use of org.apache.flink.runtime.checkpoint.CompletedCheckpointStore in project flink by apache.
the class AdaptiveSchedulerTest method testCloseShutsDownCheckpointingComponents.
/**
 * Verifies that closing the scheduler after a global failure shuts down both the completed
 * checkpoint store and the checkpoint ID counter with {@link JobStatus#FAILED}.
 */
@Test
public void testCloseShutsDownCheckpointingComponents() throws Exception {
    // Each future is completed with the job status passed to the component's shutdown.
    final CompletableFuture<JobStatus> storeShutdown = new CompletableFuture<>();
    final CompletedCheckpointStore store =
            TestingCompletedCheckpointStore
                    .createStoreWithShutdownCheckAndNoCompletedCheckpoints(storeShutdown);

    final CompletableFuture<JobStatus> counterShutdown = new CompletableFuture<>();
    final CheckpointIDCounter counter =
            TestingCheckpointIDCounter.createStoreWithShutdownCheckAndNoStartAction(
                    counterShutdown);

    final JobGraph jobGraph = createJobGraph();
    // checkpointing components are only created if checkpointing is enabled
    jobGraph.setSnapshotSettings(
            new JobCheckpointingSettings(
                    CheckpointCoordinatorConfiguration.builder().build(), null));

    final TestingCheckpointRecoveryFactory recoveryFactory =
            new TestingCheckpointRecoveryFactory(store, counter);
    final AdaptiveScheduler scheduler =
            new AdaptiveSchedulerBuilder(jobGraph, singleThreadMainThreadExecutor)
                    .setCheckpointRecoveryFactory(recoveryFactory)
                    .build();

    singleThreadMainThreadExecutor.execute(
            () -> {
                scheduler.startScheduling();
                // transition into the FAILED state
                scheduler.handleGlobalFailure(new FlinkException("Test exception"));
                scheduler.closeAsync();
            });

    assertThat(storeShutdown.get()).isEqualTo(JobStatus.FAILED);
    assertThat(counterShutdown.get()).isEqualTo(JobStatus.FAILED);
}
use of org.apache.flink.runtime.checkpoint.CompletedCheckpointStore in project flink by apache.
the class AdaptiveSchedulerTest method runExceptionHistoryTests.
/**
 * Drives an {@link AdaptiveScheduler} through a complete run and returns the exception history
 * it recorded.
 *
 * <p>The scheduler is started and offered slots on the main thread executor. Once the job is
 * reported as running, the attempt IDs of all execution vertices are collected and passed to
 * {@code testLogic}. Afterwards every attempt ID that reached the task manager gateway's
 * cancel consumer is reported as {@link ExecutionState#CANCELED}, and the method waits for
 * the job to reach a terminal state.
 *
 * @param testLogic test-specific actions, run on the main thread executor with the scheduler
 *     and the collected attempt IDs
 * @param setupScheduler hook to customize the scheduler builder before the scheduler is built
 * @param setupJobGraph hook to customize the job graph before it is used
 * @return the exception history of the job as returned by the scheduler
 * @throws Exception if any step of the run fails
 */
private Iterable<RootExceptionHistoryEntry> runExceptionHistoryTests(
        BiConsumer<AdaptiveScheduler, List<ExecutionAttemptID>> testLogic,
        Consumer<AdaptiveSchedulerBuilder> setupScheduler,
        Consumer<JobGraph> setupJobGraph)
        throws Exception {
    final int numAvailableSlots = 4;

    final JobGraph jobGraph = createJobGraph();
    setupJobGraph.accept(jobGraph);

    final RunFailedJobListener jobStatusListener = new RunFailedJobListener();
    final List<ExecutionAttemptID> tasksToCancel = new ArrayList<>();

    // Standalone checkpointing components wired in through a testing recovery factory.
    final CompletedCheckpointStore checkpointStore = new StandaloneCompletedCheckpointStore(1);
    final CheckpointIDCounter idCounter = new StandaloneCheckpointIDCounter();
    final CheckpointsCleaner cleaner = new CheckpointsCleaner();
    final TestingCheckpointRecoveryFactory recoveryFactory =
            new TestingCheckpointRecoveryFactory(checkpointStore, idCounter);

    final DefaultDeclarativeSlotPool slotPool = createDeclarativeSlotPool(jobGraph.getJobID());

    final Configuration jobMasterConfig = new Configuration();
    jobMasterConfig.set(JobManagerOptions.RESOURCE_WAIT_TIMEOUT, Duration.ofMillis(1L));

    final AdaptiveSchedulerBuilder schedulerBuilder =
            new AdaptiveSchedulerBuilder(jobGraph, singleThreadMainThreadExecutor)
                    .setJobMasterConfiguration(jobMasterConfig)
                    .setDeclarativeSlotPool(slotPool)
                    .setCheckpointRecoveryFactory(recoveryFactory)
                    .setCheckpointCleaner(cleaner)
                    .setJobStatusListener(jobStatusListener);
    setupScheduler.accept(schedulerBuilder);
    final AdaptiveScheduler scheduler = schedulerBuilder.build();

    // Every attempt ID handed to the gateway's cancel consumer is remembered for later.
    final SubmissionBufferingTaskManagerGateway gateway =
            new SubmissionBufferingTaskManagerGateway(numAvailableSlots);
    gateway.setCancelConsumer(tasksToCancel::add);

    singleThreadMainThreadExecutor.execute(
            () -> {
                scheduler.startScheduling();
                offerSlots(
                        slotPool,
                        createSlotOffersForResourceRequirements(
                                ResourceCounter.withResource(
                                        ResourceProfile.UNKNOWN, numAvailableSlots)),
                        gateway);
            });
    jobStatusListener.waitForRunning();

    // Read the execution vertices on the main thread executor.
    final CompletableFuture<Iterable<ArchivedExecutionVertex>> verticesFuture =
            new CompletableFuture<>();
    singleThreadMainThreadExecutor.execute(
            () ->
                    verticesFuture.complete(
                            scheduler
                                    .requestJob()
                                    .getArchivedExecutionGraph()
                                    .getAllExecutionVertices()));
    final Iterable<ArchivedExecutionVertex> vertices = verticesFuture.get();

    final List<ExecutionAttemptID> attemptIds =
            IterableUtils.toStream(vertices)
                    .map(vertex -> vertex.getCurrentExecutionAttempt().getAttemptId())
                    .collect(Collectors.toList());

    // Run the test-specific logic and block until it has finished.
    CompletableFuture.runAsync(
                    () -> testLogic.accept(scheduler, attemptIds),
                    singleThreadMainThreadExecutor)
            .get();

    // Acknowledge every recorded cancellation by reporting the task as CANCELED.
    final Consumer<ExecutionAttemptID> markCanceled =
            attemptId ->
                    scheduler.updateTaskExecutionState(
                            new TaskExecutionStateTransition(
                                    new TaskExecutionState(
                                            attemptId, ExecutionState.CANCELED, null)));
    CompletableFuture.runAsync(
                    () -> tasksToCancel.forEach(markCanceled), singleThreadMainThreadExecutor)
            .get();

    jobStatusListener.waitForTerminal();

    return scheduler.requestJob().getExceptionHistory();
}
Aggregations