use of org.apache.flink.runtime.checkpoint.CheckpointsCleaner in project flink by apache.
the class DefaultExecutionGraphFactoryTest method testRestoringModifiedJobFromSavepointFails.
/**
 * Checks that creating and restoring an execution graph fails with a
 * "Failed to rollback to checkpoint/savepoint" message for a job graph built via
 * {@code createJobGraphWithSavepoint(false, 42L)}.
 *
 * <p>NOTE(review): the {@code false} flag presumably means "do not allow non-restored state";
 * confirm against the helper.
 */
@Test
public void testRestoringModifiedJobFromSavepointFails() throws Exception {
    final JobGraph modifiedJobGraph = createJobGraphWithSavepoint(false, 42L);
    final ExecutionGraphFactory factory = createExecutionGraphFactory();

    try {
        // Collaborators are created inside the try so that any failure while building them
        // is routed through the same catch block as a failure of the factory call itself.
        final StandaloneCompletedCheckpointStore checkpointStore =
                new StandaloneCompletedCheckpointStore(1);
        final CheckpointsCleaner cleaner = new CheckpointsCleaner();
        final StandaloneCheckpointIDCounter idCounter = new StandaloneCheckpointIDCounter();
        final DefaultVertexAttemptNumberStore attemptNumberStore =
                new DefaultVertexAttemptNumberStore();

        factory.createAndRestoreExecutionGraph(
                modifiedJobGraph,
                checkpointStore,
                cleaner,
                idCounter,
                TaskDeploymentDescriptorFactory.PartitionLocationConstraint.CAN_BE_UNKNOWN,
                0L,
                attemptNumberStore,
                SchedulerBase.computeVertexParallelismStore(modifiedJobGraph),
                // execution state updates are irrelevant for this test
                (execution, previousState, newState) -> {},
                log);

        // Reaching this line means the restore unexpectedly succeeded.
        // NOTE(review): assumes fail(...) raises an AssertionError (JUnit), which the
        // catch (Exception) below does not intercept.
        fail("Expected ExecutionGraph creation to fail because of non restored state.");
    } catch (Exception e) {
        assertThat(
                e,
                FlinkMatchers.containsMessage("Failed to rollback to checkpoint/savepoint"));
    }
}
use of org.apache.flink.runtime.checkpoint.CheckpointsCleaner in project flink by apache.
the class DefaultExecutionGraphFactoryTest method testRestoringModifiedJobFromSavepointWithAllowNonRestoredStateSucceeds.
/**
 * Checks that, for a job graph built via {@code createJobGraphWithSavepoint(true, 42L)},
 * creating and restoring the execution graph leaves a checkpoint with id 42 as the latest
 * entry of the completed checkpoint store.
 *
 * <p>NOTE(review): the {@code true} flag presumably enables "allow non-restored state";
 * confirm against the helper.
 */
@Test
public void testRestoringModifiedJobFromSavepointWithAllowNonRestoredStateSucceeds() throws Exception {
    // id the savepoint is created with and later expected to be restored under
    final long expectedSavepointId = 42L;
    final JobGraph modifiedJobGraph = createJobGraphWithSavepoint(true, expectedSavepointId);
    final ExecutionGraphFactory factory = createExecutionGraphFactory();

    final StandaloneCompletedCheckpointStore checkpointStore =
            new StandaloneCompletedCheckpointStore(1);
    final CheckpointsCleaner cleaner = new CheckpointsCleaner();
    final StandaloneCheckpointIDCounter idCounter = new StandaloneCheckpointIDCounter();
    final DefaultVertexAttemptNumberStore attemptNumberStore =
            new DefaultVertexAttemptNumberStore();

    factory.createAndRestoreExecutionGraph(
            modifiedJobGraph,
            checkpointStore,
            cleaner,
            idCounter,
            TaskDeploymentDescriptorFactory.PartitionLocationConstraint.CAN_BE_UNKNOWN,
            0L,
            attemptNumberStore,
            SchedulerBase.computeVertexParallelismStore(modifiedJobGraph),
            // execution state updates are irrelevant for this test
            (execution, previousState, newState) -> {},
            log);

    // The store was empty before the call, so its latest entry must stem from the restore.
    final CompletedCheckpoint restored = checkpointStore.getLatestCheckpoint();
    MatcherAssert.assertThat(restored, notNullValue());
    MatcherAssert.assertThat(restored.getCheckpointID(), Matchers.is(expectedSavepointId));
}
use of org.apache.flink.runtime.checkpoint.CheckpointsCleaner in project flink by apache.
the class DefaultSchedulerFactory method createInstance.
/**
 * Creates a {@code DefaultScheduler} for the given job graph.
 *
 * <p>The slot pool service must be castable to {@code SlotPool}; otherwise an
 * {@code IllegalStateException} is thrown. The restart back-off strategy is derived from the
 * job's serialized execution config, the job master configuration and whether checkpointing
 * is enabled. The supplied {@code jobStatusListener} is wrapped so that the slot pool is told
 * whether the job is currently restarting before the status change is forwarded.
 */
@Override
public SchedulerNG createInstance(final Logger log, final JobGraph jobGraph, final Executor ioExecutor, final Configuration jobMasterConfiguration, final SlotPoolService slotPoolService, final ScheduledExecutorService futureExecutor, final ClassLoader userCodeLoader, final CheckpointRecoveryFactory checkpointRecoveryFactory, final Time rpcTimeout, final BlobWriter blobWriter, final JobManagerJobMetricGroup jobManagerJobMetricGroup, final Time slotRequestTimeout, final ShuffleMaster<?> shuffleMaster, final JobMasterPartitionTracker partitionTracker, final ExecutionDeploymentTracker executionDeploymentTracker, long initializationTimestamp, final ComponentMainThreadExecutor mainThreadExecutor, final FatalErrorHandler fatalErrorHandler, final JobStatusListener jobStatusListener) throws Exception {
    final SlotPool slotPool =
            slotPoolService
                    .castInto(SlotPool.class)
                    .orElseThrow(
                            () ->
                                    new IllegalStateException(
                                            "The DefaultScheduler requires a SlotPool."));

    final DefaultSchedulerComponents schedulerComponents =
            createSchedulerComponents(
                    jobGraph.getJobType(),
                    jobGraph.isApproximateLocalRecoveryEnabled(),
                    jobMasterConfiguration,
                    slotPool,
                    slotRequestTimeout);

    final RestartBackoffTimeStrategy restartBackoffTimeStrategy =
            RestartBackoffTimeStrategyFactoryLoader.createRestartBackoffTimeStrategyFactory(
                            jobGraph.getSerializedExecutionConfig()
                                    .deserializeValue(userCodeLoader)
                                    .getRestartStrategy(),
                            jobMasterConfiguration,
                            jobGraph.isCheckpointingEnabled())
                    .create();
    log.info("Using restart back off time strategy {} for {} ({}).", restartBackoffTimeStrategy, jobGraph.getName(), jobGraph.getJobID());

    final ExecutionGraphFactory executionGraphFactory =
            new DefaultExecutionGraphFactory(
                    jobMasterConfiguration,
                    userCodeLoader,
                    executionDeploymentTracker,
                    futureExecutor,
                    ioExecutor,
                    rpcTimeout,
                    jobManagerJobMetricGroup,
                    blobWriter,
                    shuffleMaster,
                    partitionTracker);

    return new DefaultScheduler(
            log,
            jobGraph,
            ioExecutor,
            jobMasterConfiguration,
            schedulerComponents.getStartUpAction(),
            new ScheduledExecutorServiceAdapter(futureExecutor),
            userCodeLoader,
            new CheckpointsCleaner(),
            checkpointRecoveryFactory,
            jobManagerJobMetricGroup,
            schedulerComponents.getSchedulingStrategyFactory(),
            FailoverStrategyFactoryLoader.loadFailoverStrategyFactory(jobMasterConfiguration),
            restartBackoffTimeStrategy,
            new DefaultExecutionVertexOperations(),
            new ExecutionVertexVersioner(),
            schedulerComponents.getAllocatorFactory(),
            initializationTimestamp,
            mainThreadExecutor,
            (jobId, jobStatus, timestamp) -> {
                // Update the slot pool first, then forward to the caller's listener.
                slotPool.setIsJobRestarting(jobStatus == JobStatus.RESTARTING);
                jobStatusListener.jobStatusChanges(jobId, jobStatus, timestamp);
            },
            executionGraphFactory,
            shuffleMaster,
            rpcTimeout);
}
use of org.apache.flink.runtime.checkpoint.CheckpointsCleaner in project flink by apache.
the class JobMasterTest method testCheckpointPrecedesSavepointRecovery.
/**
 * Tests that an existing checkpoint takes precedence over a savepoint: after the JobMaster is
 * created with both savepoint restore settings and a pre-populated checkpoint store, the
 * store's latest checkpoint must still be the pre-existing checkpoint.
 */
@Test
public void testCheckpointPrecedesSavepointRecovery() throws Exception {
    // Savepoint that the job graph is configured to restore from.
    final long savepointId = 42L;
    final File savepointFile = createSavepoint(savepointId);
    final SavepointRestoreSettings restoreSettings =
            SavepointRestoreSettings.forPath(savepointFile.getAbsolutePath(), true);
    final JobGraph jobGraph = createJobGraphWithCheckpointing(restoreSettings);

    // Checkpoint that already sits in the store before the JobMaster comes up.
    final long checkpointId = 1L;
    final CompletedCheckpoint existingCheckpoint =
            new CompletedCheckpoint(
                    jobGraph.getJobID(),
                    checkpointId,
                    1L,
                    1L,
                    Collections.emptyMap(),
                    null,
                    CheckpointProperties.forCheckpoint(
                            CheckpointRetentionPolicy.NEVER_RETAIN_AFTER_TERMINATION),
                    new DummyCheckpointStorageLocation());
    final StandaloneCompletedCheckpointStore checkpointStore =
            new StandaloneCompletedCheckpointStore(1);
    checkpointStore.addCheckpointAndSubsumeOldestOne(
            existingCheckpoint, new CheckpointsCleaner(), () -> {});

    // Hand the pre-populated store to the HA services.
    final CheckpointRecoveryFactory recoveryFactory =
            PerJobCheckpointRecoveryFactory.withoutCheckpointStoreRecovery(
                    maxCheckpoints -> checkpointStore);
    haServices.setCheckpointRecoveryFactory(recoveryFactory);

    final JobMaster jobMaster = new JobMasterBuilder(jobGraph, rpcService).createJobMaster();
    try {
        // The latest entry must be the pre-existing checkpoint (id 1), not the savepoint (id 42).
        final CompletedCheckpoint latestCheckpoint = checkpointStore.getLatestCheckpoint();
        assertThat(latestCheckpoint, Matchers.notNullValue());
        assertThat(latestCheckpoint.getCheckpointID(), is(checkpointId));
    } finally {
        RpcUtils.terminateRpcEndpoint(jobMaster, testingTimeout);
    }
}
use of org.apache.flink.runtime.checkpoint.CheckpointsCleaner in project flink by apache.
the class AdaptiveSchedulerTest method runExceptionHistoryTests.
/**
 * Shared driver for exception-history tests: builds an {@code AdaptiveScheduler}, brings the job
 * to running, executes the supplied test logic, reports every cancelled attempt as
 * {@code CANCELED}, waits for the listener's terminal signal and returns the exception history
 * obtained from {@code scheduler.requestJob()}.
 *
 * @param testLogic invoked on {@code singleThreadMainThreadExecutor} with the scheduler and the
 *     attempt ids of the current execution attempts of all execution vertices
 * @param setupScheduler hook to customize the scheduler builder before {@code build()} is called
 * @param setupJobGraph hook to customize the job graph before it is handed to the builder
 * @return the exception history of the job as reported by {@code requestJob().getExceptionHistory()}
 * @throws Exception if any of the awaited futures or setup steps fails
 */
private Iterable<RootExceptionHistoryEntry> runExceptionHistoryTests(BiConsumer<AdaptiveScheduler, List<ExecutionAttemptID>> testLogic, Consumer<AdaptiveSchedulerBuilder> setupScheduler, Consumer<JobGraph> setupJobGraph) throws Exception {
final int numAvailableSlots = 4;
final JobGraph jobGraph = createJobGraph();
setupJobGraph.accept(jobGraph);
RunFailedJobListener listener = new RunFailedJobListener();
// filled by the task manager gateway's cancel consumer registered further below
List<ExecutionAttemptID> cancelledTasks = new ArrayList<>();
final CompletedCheckpointStore completedCheckpointStore = new StandaloneCompletedCheckpointStore(1);
final CheckpointIDCounter checkpointIDCounter = new StandaloneCheckpointIDCounter();
final CheckpointsCleaner checkpointCleaner = new CheckpointsCleaner();
TestingCheckpointRecoveryFactory checkpointRecoveryFactory = new TestingCheckpointRecoveryFactory(completedCheckpointStore, checkpointIDCounter);
final DefaultDeclarativeSlotPool declarativeSlotPool = createDeclarativeSlotPool(jobGraph.getJobID());
final Configuration configuration = new Configuration();
// resource wait timeout is set to 1 ms for this test
configuration.set(JobManagerOptions.RESOURCE_WAIT_TIMEOUT, Duration.ofMillis(1L));
AdaptiveSchedulerBuilder builder = new AdaptiveSchedulerBuilder(jobGraph, singleThreadMainThreadExecutor).setJobMasterConfiguration(configuration).setDeclarativeSlotPool(declarativeSlotPool).setCheckpointRecoveryFactory(checkpointRecoveryFactory).setCheckpointCleaner(checkpointCleaner).setJobStatusListener(listener);
// caller-supplied customization runs after the defaults above, so it can override them
setupScheduler.accept(builder);
final AdaptiveScheduler scheduler = builder.build();
final SubmissionBufferingTaskManagerGateway taskManagerGateway = new SubmissionBufferingTaskManagerGateway(numAvailableSlots);
taskManagerGateway.setCancelConsumer(cancelledTasks::add);
// start scheduling and offer the slots on the main thread executor
singleThreadMainThreadExecutor.execute(() -> {
scheduler.startScheduling();
offerSlots(declarativeSlotPool, createSlotOffersForResourceRequirements(ResourceCounter.withResource(ResourceProfile.UNKNOWN, numAvailableSlots)), taskManagerGateway);
});
// block the calling thread until the listener reports the job as running
listener.waitForRunning();
// fetch the execution vertices via the main thread executor and hand them back through a future
CompletableFuture<Iterable<ArchivedExecutionVertex>> vertexFuture = new CompletableFuture<>();
singleThreadMainThreadExecutor.execute(() -> vertexFuture.complete(scheduler.requestJob().getArchivedExecutionGraph().getAllExecutionVertices()));
final Iterable<ArchivedExecutionVertex> executionVertices = vertexFuture.get();
final List<ExecutionAttemptID> attemptIds = IterableUtils.toStream(executionVertices).map(ArchivedExecutionVertex::getCurrentExecutionAttempt).map(ArchivedExecution::getAttemptId).collect(Collectors.toList());
// run the test-specific logic on the main thread executor and wait for it to finish
CompletableFuture<Void> runTestLogicFuture = CompletableFuture.runAsync(() -> testLogic.accept(scheduler, attemptIds), singleThreadMainThreadExecutor);
runTestLogicFuture.get();
// report CANCELED for every attempt the gateway recorded a cancel request for
Consumer<ExecutionAttemptID> canceller = attemptId -> scheduler.updateTaskExecutionState(new TaskExecutionStateTransition(new TaskExecutionState(attemptId, ExecutionState.CANCELED, null)));
CompletableFuture<Void> cancelFuture = CompletableFuture.runAsync(() -> cancelledTasks.forEach(canceller), singleThreadMainThreadExecutor);
cancelFuture.get();
listener.waitForTerminal();
// NOTE(review): unlike the earlier requestJob() call, this one runs on the calling thread
// rather than on singleThreadMainThreadExecutor — confirm this is intended.
return scheduler.requestJob().getExceptionHistory();
}
Aggregations