Search in sources:

Example 1 with CompletedCheckpointStore

Use of org.apache.flink.runtime.checkpoint.CompletedCheckpointStore in project flink by apache.

From class ExecutionGraphBuilder, method buildGraph.

/**
 * Builds the ExecutionGraph from the JobGraph.
 * If a prior execution graph exists, the JobGraph will be attached to it. If no prior execution
 * graph exists, the JobGraph will be attached to a new, empty execution graph.
 */
public static ExecutionGraph buildGraph(@Nullable ExecutionGraph prior, JobGraph jobGraph, Configuration jobManagerConfig, ScheduledExecutorService futureExecutor, Executor ioExecutor, SlotProvider slotProvider, ClassLoader classLoader, CheckpointRecoveryFactory recoveryFactory, Time timeout, RestartStrategy restartStrategy, MetricGroup metrics, int parallelismForAutoMax, Logger log) throws JobExecutionException, JobException {
    checkNotNull(jobGraph, "job graph cannot be null");
    final String jobName = jobGraph.getName();
    final JobID jobId = jobGraph.getJobID();
    // create a new execution graph, if none exists so far
    final ExecutionGraph executionGraph;
    try {
        executionGraph = (prior != null) ? prior : new ExecutionGraph(futureExecutor, ioExecutor, jobId, jobName, jobGraph.getJobConfiguration(), jobGraph.getSerializedExecutionConfig(), timeout, restartStrategy, jobGraph.getUserJarBlobKeys(), jobGraph.getClasspaths(), slotProvider, classLoader, metrics);
    } catch (IOException e) {
        throw new JobException("Could not create the execution graph.", e);
    }
    // set the basic properties
    executionGraph.setScheduleMode(jobGraph.getScheduleMode());
    executionGraph.setQueuedSchedulingAllowed(jobGraph.getAllowQueuedScheduling());
    try {
        executionGraph.setJsonPlan(JsonPlanGenerator.generatePlan(jobGraph));
    } catch (Throwable t) {
        log.warn("Cannot create JSON plan for job", t);
        // give the graph an empty plan
        executionGraph.setJsonPlan("{}");
    }
    // initialize the vertices that have a master initialization hook
    // file output formats create directories here, input formats create splits
    final long initMasterStart = System.nanoTime();
    log.info("Running initialization on master for job {} ({}).", jobName, jobId);
    for (JobVertex vertex : jobGraph.getVertices()) {
        String executableClass = vertex.getInvokableClassName();
        if (executableClass == null || executableClass.isEmpty()) {
            throw new JobSubmissionException(jobId, "The vertex " + vertex.getID() + " (" + vertex.getName() + ") has no invokable class.");
        }
        if (vertex.getParallelism() == ExecutionConfig.PARALLELISM_AUTO_MAX) {
            vertex.setParallelism(parallelismForAutoMax);
        }
        try {
            vertex.initializeOnMaster(classLoader);
        } catch (Throwable t) {
            throw new JobExecutionException(jobId, "Cannot initialize task '" + vertex.getName() + "': " + t.getMessage(), t);
        }
    }
    log.info("Successfully ran initialization on master in {} ms.", (System.nanoTime() - initMasterStart) / 1_000_000);
    // topologically sort the job vertices and attach the graph to the existing one
    List<JobVertex> sortedTopology = jobGraph.getVerticesSortedTopologicallyFromSources();
    if (log.isDebugEnabled()) {
        log.debug("Adding {} vertices from job graph {} ({}).", sortedTopology.size(), jobName, jobId);
    }
    executionGraph.attachJobGraph(sortedTopology);
    if (log.isDebugEnabled()) {
        log.debug("Successfully created execution graph from job graph {} ({}).", jobName, jobId);
    }
    // configure the state checkpointing
    JobSnapshottingSettings snapshotSettings = jobGraph.getSnapshotSettings();
    if (snapshotSettings != null) {
        List<ExecutionJobVertex> triggerVertices = idToVertex(snapshotSettings.getVerticesToTrigger(), executionGraph);
        List<ExecutionJobVertex> ackVertices = idToVertex(snapshotSettings.getVerticesToAcknowledge(), executionGraph);
        List<ExecutionJobVertex> confirmVertices = idToVertex(snapshotSettings.getVerticesToConfirm(), executionGraph);
        CompletedCheckpointStore completedCheckpoints;
        CheckpointIDCounter checkpointIdCounter;
        try {
            int maxNumberOfCheckpointsToRetain = jobManagerConfig.getInteger(CoreOptions.MAX_RETAINED_CHECKPOINTS);
            if (maxNumberOfCheckpointsToRetain <= 0) {
                // log a warning and fall back to the option's default value if the configured
                // state.checkpoints.max-retained-checkpoints is not greater than 0.
                log.warn("The setting for '{} : {}' is invalid. Using default value of {}", CoreOptions.MAX_RETAINED_CHECKPOINTS.key(), maxNumberOfCheckpointsToRetain, CoreOptions.MAX_RETAINED_CHECKPOINTS.defaultValue());
                maxNumberOfCheckpointsToRetain = CoreOptions.MAX_RETAINED_CHECKPOINTS.defaultValue();
            }
            completedCheckpoints = recoveryFactory.createCheckpointStore(jobId, maxNumberOfCheckpointsToRetain, classLoader);
            checkpointIdCounter = recoveryFactory.createCheckpointIDCounter(jobId);
        } catch (Exception e) {
            throw new JobExecutionException(jobId, "Failed to initialize high-availability checkpoint handler", e);
        }
        // Maximum number of remembered checkpoints
        int historySize = jobManagerConfig.getInteger(ConfigConstants.JOB_MANAGER_WEB_CHECKPOINTS_HISTORY_SIZE, ConfigConstants.DEFAULT_JOB_MANAGER_WEB_CHECKPOINTS_HISTORY_SIZE);
        CheckpointStatsTracker checkpointStatsTracker = new CheckpointStatsTracker(historySize, ackVertices, snapshotSettings, metrics);
        // The default directory for externalized checkpoints
        String externalizedCheckpointsDir = jobManagerConfig.getString(ConfigConstants.CHECKPOINTS_DIRECTORY_KEY, null);
        // load the state backend for checkpoint metadata.
        // if specified in the application, use from there, otherwise load from configuration
        final StateBackend metadataBackend;
        final StateBackend applicationConfiguredBackend = snapshotSettings.getDefaultStateBackend();
        if (applicationConfiguredBackend != null) {
            metadataBackend = applicationConfiguredBackend;
            log.info("Using application-defined state backend for checkpoint/savepoint metadata: {}.", applicationConfiguredBackend);
        } else {
            try {
                metadataBackend = AbstractStateBackend.loadStateBackendFromConfigOrCreateDefault(jobManagerConfig, classLoader, log);
            } catch (IllegalConfigurationException | IOException | DynamicCodeLoadingException e) {
                throw new JobExecutionException(jobId, "Could not instantiate configured state backend", e);
            }
        }
        executionGraph.enableCheckpointing(snapshotSettings.getCheckpointInterval(), snapshotSettings.getCheckpointTimeout(), snapshotSettings.getMinPauseBetweenCheckpoints(), snapshotSettings.getMaxConcurrentCheckpoints(), snapshotSettings.getExternalizedCheckpointSettings(), triggerVertices, ackVertices, confirmVertices, checkpointIdCounter, completedCheckpoints, externalizedCheckpointsDir, metadataBackend, checkpointStatsTracker);
    }
    return executionGraph;
}
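The checkpoint-store wiring shown above can be exercised on its own. Below is a minimal sketch that assumes a non-HA setup and uses StandaloneCheckpointRecoveryFactory; the factory calls mirror the ones in buildGraph, but exact signatures vary across Flink versions, so treat this as illustrative rather than canonical.

import org.apache.flink.api.common.JobID;
import org.apache.flink.runtime.checkpoint.CheckpointIDCounter;
import org.apache.flink.runtime.checkpoint.CheckpointRecoveryFactory;
import org.apache.flink.runtime.checkpoint.CompletedCheckpointStore;
import org.apache.flink.runtime.checkpoint.StandaloneCheckpointRecoveryFactory;

public class CheckpointStoreWiringSketch {

    public static void main(String[] args) throws Exception {
        JobID jobId = new JobID();
        CheckpointRecoveryFactory recoveryFactory = new StandaloneCheckpointRecoveryFactory();

        // Mirror the validation in buildGraph: never retain fewer than one checkpoint.
        int maxRetained = 0; // hypothetical misconfigured value
        if (maxRetained <= 0) {
            maxRetained = 1; // fall back, as the builder falls back to the option's default
        }

        // Same calls as in the example above (signatures per that Flink version).
        CompletedCheckpointStore store = recoveryFactory.createCheckpointStore(
                jobId, maxRetained, CheckpointStoreWiringSketch.class.getClassLoader());
        CheckpointIDCounter counter = recoveryFactory.createCheckpointIDCounter(jobId);

        System.out.println("created " + store + " and " + counter);
    }
}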

Example 2 with CompletedCheckpointStore

Use of org.apache.flink.runtime.checkpoint.CompletedCheckpointStore in project flink by apache.

From class DefaultExecutionGraphFactory, method createAndRestoreExecutionGraph.

@Override
public ExecutionGraph createAndRestoreExecutionGraph(JobGraph jobGraph, CompletedCheckpointStore completedCheckpointStore, CheckpointsCleaner checkpointsCleaner, CheckpointIDCounter checkpointIdCounter, TaskDeploymentDescriptorFactory.PartitionLocationConstraint partitionLocationConstraint, long initializationTimestamp, VertexAttemptNumberStore vertexAttemptNumberStore, VertexParallelismStore vertexParallelismStore, ExecutionStateUpdateListener executionStateUpdateListener, Logger log) throws Exception {
    ExecutionDeploymentListener executionDeploymentListener = new ExecutionDeploymentTrackerDeploymentListenerAdapter(executionDeploymentTracker);
    ExecutionStateUpdateListener combinedExecutionStateUpdateListener = (execution, previousState, newState) -> {
        executionStateUpdateListener.onStateUpdate(execution, previousState, newState);
        if (newState.isTerminal()) {
            executionDeploymentTracker.stopTrackingDeploymentOf(execution);
        }
    };
    final ExecutionGraph newExecutionGraph = DefaultExecutionGraphBuilder.buildGraph(jobGraph, configuration, futureExecutor, ioExecutor, userCodeClassLoader, completedCheckpointStore, checkpointsCleaner, checkpointIdCounter, rpcTimeout, blobWriter, log, shuffleMaster, jobMasterPartitionTracker, partitionLocationConstraint, executionDeploymentListener, combinedExecutionStateUpdateListener, initializationTimestamp, vertexAttemptNumberStore, vertexParallelismStore, checkpointStatsTrackerFactory, isDynamicGraph);
    final CheckpointCoordinator checkpointCoordinator = newExecutionGraph.getCheckpointCoordinator();
    if (checkpointCoordinator != null) {
        // check whether we find a valid checkpoint
        if (!checkpointCoordinator.restoreInitialCheckpointIfPresent(new HashSet<>(newExecutionGraph.getAllVertices().values()))) {
            // check whether we can restore from a savepoint
            tryRestoreExecutionGraphFromSavepoint(newExecutionGraph, jobGraph.getSavepointRestoreSettings());
        }
    }
    return newExecutionGraph;
}
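The combined listener built in this factory is a small decorator: every state update is forwarded to the wrapped listener, and deployment tracking stops once the execution reaches a terminal state. The following self-contained sketch shows the same composition with hypothetical stand-in types (StateListener, Tracker) instead of Flink's ExecutionStateUpdateListener and ExecutionDeploymentTracker.

import java.util.HashSet;
import java.util.Set;

public class ListenerCompositionSketch {

    // Hypothetical stand-in for Flink's ExecutionStateUpdateListener.
    @FunctionalInterface
    interface StateListener {
        void onStateUpdate(String executionId, String previousState, String newState);
    }

    // Hypothetical stand-in for the deployment tracker.
    static class Tracker {
        private final Set<String> tracked = new HashSet<>();

        void startTracking(String id) {
            tracked.add(id);
        }

        void stopTracking(String id) {
            tracked.remove(id);
            System.out.println("stopped tracking " + id);
        }
    }

    public static void main(String[] args) {
        Tracker tracker = new Tracker();
        StateListener base = (id, prev, next) -> System.out.println(id + ": " + prev + " -> " + next);

        // Same shape as combinedExecutionStateUpdateListener: delegate first,
        // then stop tracking when the new state is terminal.
        StateListener combined = (id, prev, next) -> {
            base.onStateUpdate(id, prev, next);
            boolean terminal = "FINISHED".equals(next) || "FAILED".equals(next) || "CANCELED".equals(next);
            if (terminal) {
                tracker.stopTracking(id);
            }
        };

        tracker.startTracking("attempt-1");
        combined.onStateUpdate("attempt-1", "RUNNING", "FINISHED");
    }
}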

Example 3 with CompletedCheckpointStore

Use of org.apache.flink.runtime.checkpoint.CompletedCheckpointStore in project flink by apache.

From class DefaultSchedulerTest, method doTestCheckpointCleanerIsClosedAfterCheckpointServices.

/**
 * Visible for re-use in {@link
 * org.apache.flink.runtime.scheduler.adaptive.AdaptiveSchedulerTest}.
 */
public static void doTestCheckpointCleanerIsClosedAfterCheckpointServices(BiFunction<CheckpointRecoveryFactory, CheckpointsCleaner, SchedulerNG> schedulerFactory, ScheduledExecutorService executorService) throws Exception {
    final CountDownLatch checkpointServicesShutdownBlocked = new CountDownLatch(1);
    final CountDownLatch cleanerClosed = new CountDownLatch(1);
    final CompletedCheckpointStore completedCheckpointStore = new StandaloneCompletedCheckpointStore(1) {

        @Override
        public void shutdown(JobStatus jobStatus, CheckpointsCleaner checkpointsCleaner) throws Exception {
            checkpointServicesShutdownBlocked.await();
            super.shutdown(jobStatus, checkpointsCleaner);
        }
    };
    final CheckpointIDCounter checkpointIDCounter = new StandaloneCheckpointIDCounter() {

        @Override
        public void shutdown(JobStatus jobStatus) throws Exception {
            checkpointServicesShutdownBlocked.await();
            super.shutdown(jobStatus);
        }
    };
    final CheckpointsCleaner checkpointsCleaner = new CheckpointsCleaner() {

        @Override
        public synchronized CompletableFuture<Void> closeAsync() {
            cleanerClosed.countDown();
            return super.closeAsync();
        }
    };
    final SchedulerNG scheduler = schedulerFactory.apply(new TestingCheckpointRecoveryFactory(completedCheckpointStore, checkpointIDCounter), checkpointsCleaner);
    final CompletableFuture<Void> schedulerClosed = new CompletableFuture<>();
    final CountDownLatch schedulerClosing = new CountDownLatch(1);
    executorService.submit(() -> {
        scheduler.closeAsync().thenRun(() -> schedulerClosed.complete(null));
        schedulerClosing.countDown();
    });
    // Wait for scheduler to start closing.
    schedulerClosing.await();
    assertFalse("CheckpointCleaner should not close before checkpoint services.", cleanerClosed.await(10, TimeUnit.MILLISECONDS));
    checkpointServicesShutdownBlocked.countDown();
    cleanerClosed.await();
    schedulerClosed.get();
}
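The ordering guarantee this test checks, that the CheckpointsCleaner may only close after the checkpoint services have shut down, rests on plain java.util.concurrent coordination. A minimal sketch of that latch pattern with hypothetical stand-ins for the services and the cleaner (no Flink classes involved):

import java.util.concurrent.CompletableFuture;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

public class CloseOrderSketch {

    public static void main(String[] args) throws Exception {
        CountDownLatch servicesShutdownBlocked = new CountDownLatch(1);
        CountDownLatch cleanerClosed = new CountDownLatch(1);
        ExecutorService executor = Executors.newSingleThreadExecutor();

        // Hypothetical "close the scheduler" task: the checkpoint services shut
        // down first (held open by the latch), then the cleaner is closed.
        CompletableFuture<Void> schedulerClosed = CompletableFuture.runAsync(() -> {
            try {
                servicesShutdownBlocked.await(); // checkpoint services shut down here
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
            }
            cleanerClosed.countDown(); // the cleaner closes only afterwards
        }, executor);

        // While the services are still blocked, the cleaner must not have closed.
        boolean closedTooEarly = cleanerClosed.await(10, TimeUnit.MILLISECONDS);
        System.out.println("cleaner closed before services: " + closedTooEarly); // expected: false

        servicesShutdownBlocked.countDown(); // let the services finish shutting down
        cleanerClosed.await();               // now the cleaner may close
        schedulerClosed.get();
        executor.shutdown();
    }
}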

Example 4 with CompletedCheckpointStore

Use of org.apache.flink.runtime.checkpoint.CompletedCheckpointStore in project flink by apache.

From class AdaptiveSchedulerTest, method testExceptionHistoryWithTaskFailureFromStopWithSavepoint.

@Test
public void testExceptionHistoryWithTaskFailureFromStopWithSavepoint() throws Exception {
    final Exception expectedException = new Exception("Expected Local Exception");
    Consumer<JobGraph> setupJobGraph = jobGraph -> jobGraph.setSnapshotSettings(new JobCheckpointingSettings(CheckpointCoordinatorConfiguration.builder().build(), null));
    final CompletedCheckpointStore completedCheckpointStore = new StandaloneCompletedCheckpointStore(1);
    final CheckpointIDCounter checkpointIDCounter = new StandaloneCheckpointIDCounter();
    final CheckpointsCleaner checkpointCleaner = new CheckpointsCleaner();
    TestingCheckpointRecoveryFactory checkpointRecoveryFactory = new TestingCheckpointRecoveryFactory(completedCheckpointStore, checkpointIDCounter);
    Consumer<AdaptiveSchedulerBuilder> setupScheduler = builder -> builder.setCheckpointRecoveryFactory(checkpointRecoveryFactory).setCheckpointCleaner(checkpointCleaner);
    BiConsumer<AdaptiveScheduler, List<ExecutionAttemptID>> testLogic = (scheduler, attemptIds) -> {
        final ExecutionAttemptID attemptId = attemptIds.get(1);
        scheduler.stopWithSavepoint("file:///tmp/target", true, SavepointFormatType.CANONICAL);
        scheduler.updateTaskExecutionState(new TaskExecutionStateTransition(new TaskExecutionState(attemptId, ExecutionState.FAILED, expectedException)));
    };
    final Iterable<RootExceptionHistoryEntry> actualExceptionHistory = runExceptionHistoryTests(testLogic, setupScheduler, setupJobGraph);
    assertThat(actualExceptionHistory).hasSize(1);
    final RootExceptionHistoryEntry failure = actualExceptionHistory.iterator().next();
    assertThat(failure.getException().deserializeError(classLoader)).isEqualTo(expectedException);
}

Example 5 with CompletedCheckpointStore

Use of org.apache.flink.runtime.checkpoint.CompletedCheckpointStore in project flink by apache.

From class SchedulerUtilsTest, method testSharedStateRegistration.

/**
 * Check that a {@link SharedStateRegistryFactory} used by {@link SchedulerUtils} registers
 * shared checkpoint state on restore.
 */
@Test
public void testSharedStateRegistration() throws Exception {
    UUID backendId = UUID.randomUUID();
    StateHandleID key = new StateHandleID("k0");
    StreamStateHandle handle = new ByteStreamStateHandle("h0", new byte[] { 1, 2, 3 });
    CheckpointRecoveryFactory recoveryFactory = buildRecoveryFactory(buildCheckpoint(buildIncrementalHandle(key, handle, backendId)));
    CompletedCheckpointStore checkpointStore = SchedulerUtils.createCompletedCheckpointStore(new Configuration(), recoveryFactory, Executors.directExecutor(), log, new JobID());
    SharedStateRegistry sharedStateRegistry = checkpointStore.getSharedStateRegistry();
    IncrementalRemoteKeyedStateHandle newHandle = buildIncrementalHandle(key, new PlaceholderStreamStateHandle(1L), backendId);
    newHandle.registerSharedStates(sharedStateRegistry, 1L);
    assertSame(handle, newHandle.getSharedState().get(key));
}
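The behavior asserted at the end of this test, where a placeholder handle registered under an already-known key resolves to the concrete handle registered earlier, can be illustrated without Flink's types. Below is a hypothetical, minimal registry sketch; it is not Flink's SharedStateRegistry API, only the idea the test exercises.

import java.util.HashMap;
import java.util.Map;

public class SharedStateRegistrySketch {

    // Hypothetical state handle; "placeholder" marks data that an earlier
    // checkpoint already uploaded and this checkpoint merely references.
    static class Handle {
        final String name;
        final boolean placeholder;

        Handle(String name, boolean placeholder) {
            this.name = name;
            this.placeholder = placeholder;
        }
    }

    static class Registry {
        private final Map<String, Handle> byKey = new HashMap<>();

        // Register a handle under a key; an existing concrete handle wins,
        // so placeholders resolve to the handle registered first.
        Handle register(String key, Handle handle) {
            Handle existing = byKey.get(key);
            if (existing != null && !existing.placeholder) {
                return existing;
            }
            byKey.put(key, handle);
            return handle;
        }
    }

    public static void main(String[] args) {
        Registry registry = new Registry();
        Handle concrete = new Handle("h0", false);
        registry.register("k0", concrete);

        // A restored incremental handle references "k0" through a placeholder ...
        Handle resolved = registry.register("k0", new Handle("placeholder", true));

        // ... and resolves to the concrete handle, which is what
        // testSharedStateRegistration asserts with assertSame.
        System.out.println(resolved == concrete); // true
    }
}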

Aggregations

CompletedCheckpointStore (org.apache.flink.runtime.checkpoint.CompletedCheckpointStore): 10
CheckpointIDCounter (org.apache.flink.runtime.checkpoint.CheckpointIDCounter): 7
JobID (org.apache.flink.api.common.JobID): 5
Configuration (org.apache.flink.configuration.Configuration): 5
Test (org.junit.Test): 5
CompletableFuture (java.util.concurrent.CompletableFuture): 4
JobStatus (org.apache.flink.api.common.JobStatus): 4
StandaloneCheckpointIDCounter (org.apache.flink.runtime.checkpoint.StandaloneCheckpointIDCounter): 4
StandaloneCompletedCheckpointStore (org.apache.flink.runtime.checkpoint.StandaloneCompletedCheckpointStore): 4
TestingCheckpointRecoveryFactory (org.apache.flink.runtime.checkpoint.TestingCheckpointRecoveryFactory): 4
JobGraph (org.apache.flink.runtime.jobgraph.JobGraph): 4
IOException (java.io.IOException): 3
ScheduledExecutorService (java.util.concurrent.ScheduledExecutorService): 3
Time (org.apache.flink.api.common.time.Time): 3
CheckpointsCleaner (org.apache.flink.runtime.checkpoint.CheckpointsCleaner): 3
Duration (java.time.Duration): 2
ArrayList (java.util.ArrayList): 2
Arrays (java.util.Arrays): 2
List (java.util.List): 2
Optional (java.util.Optional): 2