use of org.apache.flink.runtime.checkpoint.CompletedCheckpointStore in project flink by apache.
the class ExecutionGraphBuilder method buildGraph.
/**
 * Builds the ExecutionGraph from the JobGraph.
 * If a prior execution graph exists, the JobGraph will be attached to it. If no prior
 * execution graph exists, the JobGraph will be attached to a new, empty execution graph.
 */
public static ExecutionGraph buildGraph(
        @Nullable ExecutionGraph prior,
        JobGraph jobGraph,
        Configuration jobManagerConfig,
        ScheduledExecutorService futureExecutor,
        Executor ioExecutor,
        SlotProvider slotProvider,
        ClassLoader classLoader,
        CheckpointRecoveryFactory recoveryFactory,
        Time timeout,
        RestartStrategy restartStrategy,
        MetricGroup metrics,
        int parallelismForAutoMax,
        Logger log) throws JobExecutionException, JobException {

    checkNotNull(jobGraph, "job graph cannot be null");

    final String jobName = jobGraph.getName();
    final JobID jobId = jobGraph.getJobID();

    // create a new execution graph, if none exists so far
    final ExecutionGraph executionGraph;
    try {
        executionGraph = (prior != null)
                ? prior
                : new ExecutionGraph(
                        futureExecutor,
                        ioExecutor,
                        jobId,
                        jobName,
                        jobGraph.getJobConfiguration(),
                        jobGraph.getSerializedExecutionConfig(),
                        timeout,
                        restartStrategy,
                        jobGraph.getUserJarBlobKeys(),
                        jobGraph.getClasspaths(),
                        slotProvider,
                        classLoader,
                        metrics);
    } catch (IOException e) {
        throw new JobException("Could not create the execution graph.", e);
    }

    // set the basic properties
    executionGraph.setScheduleMode(jobGraph.getScheduleMode());
    executionGraph.setQueuedSchedulingAllowed(jobGraph.getAllowQueuedScheduling());
    try {
        executionGraph.setJsonPlan(JsonPlanGenerator.generatePlan(jobGraph));
    } catch (Throwable t) {
        log.warn("Cannot create JSON plan for job", t);
        // give the graph an empty plan
        executionGraph.setJsonPlan("{}");
    }

    // initialize the vertices that have a master initialization hook
    // file output formats create directories here, input formats create splits
    final long initMasterStart = System.nanoTime();
    log.info("Running initialization on master for job {} ({}).", jobName, jobId);
    for (JobVertex vertex : jobGraph.getVertices()) {
        String executableClass = vertex.getInvokableClassName();
        if (executableClass == null || executableClass.isEmpty()) {
            throw new JobSubmissionException(jobId,
                    "The vertex " + vertex.getID() + " (" + vertex.getName() + ") has no invokable class.");
        }
        if (vertex.getParallelism() == ExecutionConfig.PARALLELISM_AUTO_MAX) {
            vertex.setParallelism(parallelismForAutoMax);
        }
        try {
            vertex.initializeOnMaster(classLoader);
        } catch (Throwable t) {
            throw new JobExecutionException(jobId,
                    "Cannot initialize task '" + vertex.getName() + "': " + t.getMessage(), t);
        }
    }
    log.info("Successfully ran initialization on master in {} ms.",
            (System.nanoTime() - initMasterStart) / 1_000_000);

    // topologically sort the job vertices and attach the graph to the existing one
    List<JobVertex> sortedTopology = jobGraph.getVerticesSortedTopologicallyFromSources();
    if (log.isDebugEnabled()) {
        log.debug("Adding {} vertices from job graph {} ({}).", sortedTopology.size(), jobName, jobId);
    }
    executionGraph.attachJobGraph(sortedTopology);
    if (log.isDebugEnabled()) {
        log.debug("Successfully created execution graph from job graph {} ({}).", jobName, jobId);
    }

    // configure the state checkpointing
    JobSnapshottingSettings snapshotSettings = jobGraph.getSnapshotSettings();
    if (snapshotSettings != null) {
        List<ExecutionJobVertex> triggerVertices = idToVertex(snapshotSettings.getVerticesToTrigger(), executionGraph);
        List<ExecutionJobVertex> ackVertices = idToVertex(snapshotSettings.getVerticesToAcknowledge(), executionGraph);
        List<ExecutionJobVertex> confirmVertices = idToVertex(snapshotSettings.getVerticesToConfirm(), executionGraph);

        CompletedCheckpointStore completedCheckpoints;
        CheckpointIDCounter checkpointIdCounter;
        try {
            int maxNumberOfCheckpointsToRetain =
                    jobManagerConfig.getInteger(CoreOptions.MAX_RETAINED_CHECKPOINTS);
            if (maxNumberOfCheckpointsToRetain <= 0) {
                // log a warning and fall back to the default value if the setting in
                // state.checkpoints.max-retained-checkpoints is not greater than 0
                log.warn("The setting for '{} : {}' is invalid. Using default value of {}",
                        CoreOptions.MAX_RETAINED_CHECKPOINTS.key(),
                        maxNumberOfCheckpointsToRetain,
                        CoreOptions.MAX_RETAINED_CHECKPOINTS.defaultValue());
                maxNumberOfCheckpointsToRetain = CoreOptions.MAX_RETAINED_CHECKPOINTS.defaultValue();
            }
            completedCheckpoints = recoveryFactory.createCheckpointStore(jobId, maxNumberOfCheckpointsToRetain, classLoader);
            checkpointIdCounter = recoveryFactory.createCheckpointIDCounter(jobId);
        } catch (Exception e) {
            throw new JobExecutionException(jobId, "Failed to initialize high-availability checkpoint handler", e);
        }

        // Maximum number of remembered checkpoints
        int historySize = jobManagerConfig.getInteger(
                ConfigConstants.JOB_MANAGER_WEB_CHECKPOINTS_HISTORY_SIZE,
                ConfigConstants.DEFAULT_JOB_MANAGER_WEB_CHECKPOINTS_HISTORY_SIZE);

        CheckpointStatsTracker checkpointStatsTracker =
                new CheckpointStatsTracker(historySize, ackVertices, snapshotSettings, metrics);

        // The default directory for externalized checkpoints
        String externalizedCheckpointsDir = jobManagerConfig.getString(ConfigConstants.CHECKPOINTS_DIRECTORY_KEY, null);

        // load the state backend for checkpoint metadata:
        // if specified in the application, use it from there; otherwise load it from the configuration
        final StateBackend metadataBackend;
        final StateBackend applicationConfiguredBackend = snapshotSettings.getDefaultStateBackend();
        if (applicationConfiguredBackend != null) {
            metadataBackend = applicationConfiguredBackend;
            log.info("Using application-defined state backend for checkpoint/savepoint metadata: {}.",
                    applicationConfiguredBackend);
        } else {
            try {
                metadataBackend = AbstractStateBackend.loadStateBackendFromConfigOrCreateDefault(
                        jobManagerConfig, classLoader, log);
            } catch (IllegalConfigurationException | IOException | DynamicCodeLoadingException e) {
                throw new JobExecutionException(jobId, "Could not instantiate configured state backend", e);
            }
        }

        executionGraph.enableCheckpointing(
                snapshotSettings.getCheckpointInterval(),
                snapshotSettings.getCheckpointTimeout(),
                snapshotSettings.getMinPauseBetweenCheckpoints(),
                snapshotSettings.getMaxConcurrentCheckpoints(),
                snapshotSettings.getExternalizedCheckpointSettings(),
                triggerVertices,
                ackVertices,
                confirmVertices,
                checkpointIdCounter,
                completedCheckpoints,
                externalizedCheckpointsDir,
                metadataBackend,
                checkpointStatsTracker);
    }

    return executionGraph;
}
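For context on what recoveryFactory returns: in a non-HA deployment, the checkpoint services are typically the in-memory standalone variants, the same classes that the tests further down this page construct directly. A minimal sketch of that non-HA wiring follows; the wrapper class and method names are hypothetical, while StandaloneCompletedCheckpointStore and StandaloneCheckpointIDCounter are real Flink classes.

import org.apache.flink.runtime.checkpoint.CheckpointIDCounter;
import org.apache.flink.runtime.checkpoint.CompletedCheckpointStore;
import org.apache.flink.runtime.checkpoint.StandaloneCheckpointIDCounter;
import org.apache.flink.runtime.checkpoint.StandaloneCompletedCheckpointStore;

// Hypothetical helper class; only the two Flink classes it constructs are real.
public final class StandaloneCheckpointServicesSketch {

    /** Builds the in-memory (non-HA) checkpoint store, retaining {@code maxRetained} checkpoints. */
    static CompletedCheckpointStore createStore(int maxRetained) {
        // Completed checkpoints are kept in memory only and are lost with the JobManager process;
        // HA setups instead use a durable (e.g. ZooKeeper-backed) store.
        return new StandaloneCompletedCheckpointStore(maxRetained);
    }

    /** Builds the in-memory (non-HA) checkpoint ID counter. */
    static CheckpointIDCounter createCounter() {
        return new StandaloneCheckpointIDCounter();
    }
}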
use of org.apache.flink.runtime.checkpoint.CompletedCheckpointStore in project flink by apache.
the class DefaultExecutionGraphFactory method createAndRestoreExecutionGraph.
@Override
public ExecutionGraph createAndRestoreExecutionGraph(
        JobGraph jobGraph,
        CompletedCheckpointStore completedCheckpointStore,
        CheckpointsCleaner checkpointsCleaner,
        CheckpointIDCounter checkpointIdCounter,
        TaskDeploymentDescriptorFactory.PartitionLocationConstraint partitionLocationConstraint,
        long initializationTimestamp,
        VertexAttemptNumberStore vertexAttemptNumberStore,
        VertexParallelismStore vertexParallelismStore,
        ExecutionStateUpdateListener executionStateUpdateListener,
        Logger log) throws Exception {

    ExecutionDeploymentListener executionDeploymentListener =
            new ExecutionDeploymentTrackerDeploymentListenerAdapter(executionDeploymentTracker);

    ExecutionStateUpdateListener combinedExecutionStateUpdateListener =
            (execution, previousState, newState) -> {
                executionStateUpdateListener.onStateUpdate(execution, previousState, newState);
                if (newState.isTerminal()) {
                    executionDeploymentTracker.stopTrackingDeploymentOf(execution);
                }
            };

    final ExecutionGraph newExecutionGraph = DefaultExecutionGraphBuilder.buildGraph(
            jobGraph,
            configuration,
            futureExecutor,
            ioExecutor,
            userCodeClassLoader,
            completedCheckpointStore,
            checkpointsCleaner,
            checkpointIdCounter,
            rpcTimeout,
            blobWriter,
            log,
            shuffleMaster,
            jobMasterPartitionTracker,
            partitionLocationConstraint,
            executionDeploymentListener,
            combinedExecutionStateUpdateListener,
            initializationTimestamp,
            vertexAttemptNumberStore,
            vertexParallelismStore,
            checkpointStatsTrackerFactory,
            isDynamicGraph);

    final CheckpointCoordinator checkpointCoordinator = newExecutionGraph.getCheckpointCoordinator();
    if (checkpointCoordinator != null) {
        // check whether we find a valid checkpoint
        if (!checkpointCoordinator.restoreInitialCheckpointIfPresent(
                new HashSet<>(newExecutionGraph.getAllVertices().values()))) {
            // check whether we can restore from a savepoint
            tryRestoreExecutionGraphFromSavepoint(newExecutionGraph, jobGraph.getSavepointRestoreSettings());
        }
    }

    return newExecutionGraph;
}
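The combinedExecutionStateUpdateListener above is a small composition: forward every update to the caller's listener, and additionally stop tracking the deployment once the execution reaches a terminal state. The idiom itself is plain Java; the following self-contained sketch shows the same shape with simplified, hypothetical types rather than the real Flink interfaces.

import java.util.HashSet;
import java.util.Set;

// Minimal illustration of the listener composition above (simplified types, not Flink API).
public final class StateListenerDemo {

    enum State { DEPLOYING, RUNNING, FINISHED, FAILED;
        boolean isTerminal() { return this == FINISHED || this == FAILED; }
    }

    interface StateUpdateListener {
        void onStateUpdate(String executionId, State oldState, State newState);
    }

    public static void main(String[] args) {
        Set<String> trackedDeployments = new HashSet<>();
        trackedDeployments.add("exec-1");

        StateUpdateListener delegate = (id, oldS, newS) ->
                System.out.printf("%s: %s -> %s%n", id, oldS, newS);

        // Compose: always forward to the delegate, and additionally drop the
        // deployment from tracking once the execution reaches a terminal state.
        StateUpdateListener combined = (id, oldS, newS) -> {
            delegate.onStateUpdate(id, oldS, newS);
            if (newS.isTerminal()) {
                trackedDeployments.remove(id);
            }
        };

        combined.onStateUpdate("exec-1", State.RUNNING, State.FINISHED);
        System.out.println("still tracked: " + trackedDeployments); // prints []
    }
}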
use of org.apache.flink.runtime.checkpoint.CompletedCheckpointStore in project flink by apache.
the class DefaultSchedulerTest method doTestCheckpointCleanerIsClosedAfterCheckpointServices.
/**
* Visible for re-use in {@link
* org.apache.flink.runtime.scheduler.adaptive.AdaptiveSchedulerTest}.
*/
public static void doTestCheckpointCleanerIsClosedAfterCheckpointServices(
        BiFunction<CheckpointRecoveryFactory, CheckpointsCleaner, SchedulerNG> schedulerFactory,
        ScheduledExecutorService executorService) throws Exception {

    final CountDownLatch checkpointServicesShutdownBlocked = new CountDownLatch(1);
    final CountDownLatch cleanerClosed = new CountDownLatch(1);

    final CompletedCheckpointStore completedCheckpointStore = new StandaloneCompletedCheckpointStore(1) {
        @Override
        public void shutdown(JobStatus jobStatus, CheckpointsCleaner checkpointsCleaner) throws Exception {
            checkpointServicesShutdownBlocked.await();
            super.shutdown(jobStatus, checkpointsCleaner);
        }
    };

    final CheckpointIDCounter checkpointIDCounter = new StandaloneCheckpointIDCounter() {
        @Override
        public void shutdown(JobStatus jobStatus) throws Exception {
            checkpointServicesShutdownBlocked.await();
            super.shutdown(jobStatus);
        }
    };

    final CheckpointsCleaner checkpointsCleaner = new CheckpointsCleaner() {
        @Override
        public synchronized CompletableFuture<Void> closeAsync() {
            cleanerClosed.countDown();
            return super.closeAsync();
        }
    };

    final SchedulerNG scheduler = schedulerFactory.apply(
            new TestingCheckpointRecoveryFactory(completedCheckpointStore, checkpointIDCounter),
            checkpointsCleaner);

    final CompletableFuture<Void> schedulerClosed = new CompletableFuture<>();
    final CountDownLatch schedulerClosing = new CountDownLatch(1);

    executorService.submit(() -> {
        scheduler.closeAsync().thenRun(() -> schedulerClosed.complete(null));
        schedulerClosing.countDown();
    });

    // Wait for scheduler to start closing.
    schedulerClosing.await();
    assertFalse(
            "CheckpointCleaner should not close before checkpoint services.",
            cleanerClosed.await(10, TimeUnit.MILLISECONDS));

    checkpointServicesShutdownBlocked.countDown();
    cleanerClosed.await();
    schedulerClosed.get();
}
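The ordering assertion in this test is a general latch-gating pattern: block the earlier shutdown step on a latch, use a short timed await to verify the later step has not yet run, then release the latch and let both complete. A minimal, self-contained sketch of that pattern, with hypothetical names and no Flink types:

import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

public final class ShutdownOrderDemo {
    public static void main(String[] args) throws Exception {
        CountDownLatch firstStepBlocked = new CountDownLatch(1);
        CountDownLatch secondStepDone = new CountDownLatch(1);

        ExecutorService pool = Executors.newSingleThreadExecutor();
        pool.submit(() -> {
            try {
                firstStepBlocked.await();   // simulate checkpoint services shutdown blocking
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                return;
            }
            secondStepDone.countDown();     // the "cleaner" closes only afterwards
        });

        // While the first step is blocked, the second step must not have happened yet.
        boolean closedTooEarly = secondStepDone.await(10, TimeUnit.MILLISECONDS);
        System.out.println("closed too early? " + closedTooEarly); // false

        firstStepBlocked.countDown();       // unblock; the ordering now allows step two
        secondStepDone.await();
        pool.shutdown();
    }
}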
use of org.apache.flink.runtime.checkpoint.CompletedCheckpointStore in project flink by apache.
the class AdaptiveSchedulerTest method testExceptionHistoryWithTaskFailureFromStopWithSavepoint.
@Test
public void testExceptionHistoryWithTaskFailureFromStopWithSavepoint() throws Exception {
    final Exception expectedException = new Exception("Expected Local Exception");

    Consumer<JobGraph> setupJobGraph = jobGraph ->
            jobGraph.setSnapshotSettings(new JobCheckpointingSettings(
                    CheckpointCoordinatorConfiguration.builder().build(), null));

    final CompletedCheckpointStore completedCheckpointStore = new StandaloneCompletedCheckpointStore(1);
    final CheckpointIDCounter checkpointIDCounter = new StandaloneCheckpointIDCounter();
    final CheckpointsCleaner checkpointCleaner = new CheckpointsCleaner();

    TestingCheckpointRecoveryFactory checkpointRecoveryFactory =
            new TestingCheckpointRecoveryFactory(completedCheckpointStore, checkpointIDCounter);

    Consumer<AdaptiveSchedulerBuilder> setupScheduler = builder ->
            builder.setCheckpointRecoveryFactory(checkpointRecoveryFactory)
                    .setCheckpointCleaner(checkpointCleaner);

    BiConsumer<AdaptiveScheduler, List<ExecutionAttemptID>> testLogic = (scheduler, attemptIds) -> {
        final ExecutionAttemptID attemptId = attemptIds.get(1);
        scheduler.stopWithSavepoint("file:///tmp/target", true, SavepointFormatType.CANONICAL);
        scheduler.updateTaskExecutionState(new TaskExecutionStateTransition(
                new TaskExecutionState(attemptId, ExecutionState.FAILED, expectedException)));
    };

    final Iterable<RootExceptionHistoryEntry> actualExceptionHistory =
            runExceptionHistoryTests(testLogic, setupScheduler, setupJobGraph);

    assertThat(actualExceptionHistory).hasSize(1);
    final RootExceptionHistoryEntry failure = actualExceptionHistory.iterator().next();
    assertThat(failure.getException().deserializeError(classLoader)).isEqualTo(expectedException);
}
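The final assertion goes through org.apache.flink.util.SerializedThrowable, which wraps a throwable for transport between cluster components and exposes deserializeError(ClassLoader) to recover it; as far as I can tell, the original instance is cached within the same JVM, which is why the equality check against expectedException can hold. A small sketch of that round trip:

import org.apache.flink.util.SerializedThrowable;

public final class SerializedThrowableRoundTrip {
    public static void main(String[] args) {
        Exception original = new Exception("Expected Local Exception");

        // Wrap the exception the way Flink ships failure causes around the cluster.
        SerializedThrowable serialized = new SerializedThrowable(original);

        // Recover the cause; within the same JVM this can return the cached original instance.
        Throwable restored = serialized.deserializeError(SerializedThrowableRoundTrip.class.getClassLoader());
        System.out.println(restored.getMessage()); // Expected Local Exception
    }
}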
use of org.apache.flink.runtime.checkpoint.CompletedCheckpointStore in project flink by apache.
the class SchedulerUtilsTest method testSharedStateRegistration.
/**
* Check that a {@link SharedStateRegistryFactory} used by {@link SchedulerUtils} registers
* shared checkpoint state on restore.
*/
@Test
public void testSharedStateRegistration() throws Exception {
    UUID backendId = UUID.randomUUID();
    StateHandleID key = new StateHandleID("k0");
    StreamStateHandle handle = new ByteStreamStateHandle("h0", new byte[] { 1, 2, 3 });

    CheckpointRecoveryFactory recoveryFactory =
            buildRecoveryFactory(buildCheckpoint(buildIncrementalHandle(key, handle, backendId)));

    CompletedCheckpointStore checkpointStore = SchedulerUtils.createCompletedCheckpointStore(
            new Configuration(), recoveryFactory, Executors.directExecutor(), log, new JobID());

    SharedStateRegistry sharedStateRegistry = checkpointStore.getSharedStateRegistry();

    IncrementalRemoteKeyedStateHandle newHandle =
            buildIncrementalHandle(key, new PlaceholderStreamStateHandle(1L), backendId);
    newHandle.registerSharedStates(sharedStateRegistry, 1L);

    assertSame(handle, newHandle.getSharedState().get(key));
}
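What assertSame verifies: registering shared state under a key the registry already knows deduplicates the entry, so the placeholder handle resolves to the concrete handle that was registered first on restore. The following toy model shows just that deduplication rule; it deliberately does not use Flink's SharedStateRegistry API, and all names are hypothetical:

import java.util.HashMap;
import java.util.Map;

// Toy model of shared-state deduplication; not Flink's SharedStateRegistry.
public final class ToySharedStateRegistry {

    private final Map<String, Object> registered = new HashMap<>();

    /** Registers state under a key; if the key is already known, the existing state wins. */
    public Object registerReference(String key, Object state) {
        Object existing = registered.putIfAbsent(key, state);
        return existing != null ? existing : state;
    }

    public static void main(String[] args) {
        ToySharedStateRegistry registry = new ToySharedStateRegistry();
        Object realHandle = "h0";           // stands in for the ByteStreamStateHandle
        Object placeholder = "placeholder"; // stands in for the PlaceholderStreamStateHandle

        // The first registration stores the real handle; the second resolves to it.
        Object first = registry.registerReference("k0", realHandle);
        Object second = registry.registerReference("k0", placeholder);
        System.out.println(first == second); // true: the placeholder was deduplicated away
    }
}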