Search in sources :

Example 1 with StandaloneCheckpointIDCounter

use of org.apache.flink.runtime.checkpoint.StandaloneCheckpointIDCounter in project flink by apache.

the class JobManagerHARecoveryTest method testJobRecoveryWhenLosingLeadership.

/**
	 * Tests that the persisted job is not removed from the SubmittedJobGraphStore if the JobManager
	 * loses its leadership. Furthermore, it tests that the job manager can recover the job from
	 * the SubmittedJobGraphStore and checkpoint state is recovered as well.
	 */
@Test
public void testJobRecoveryWhenLosingLeadership() throws Exception {
    FiniteDuration timeout = new FiniteDuration(30, TimeUnit.SECONDS);
    FiniteDuration jobRecoveryTimeout = new FiniteDuration(3, TimeUnit.SECONDS);
    Deadline deadline = new FiniteDuration(2, TimeUnit.MINUTES).fromNow();
    Configuration flinkConfiguration = new Configuration();
    UUID leaderSessionID = UUID.randomUUID();
    UUID newLeaderSessionID = UUID.randomUUID();
    int slots = 2;
    ActorRef archive = null;
    ActorRef jobManager = null;
    ActorRef taskManager = null;
    flinkConfiguration.setString(HighAvailabilityOptions.HA_MODE, "zookeeper");
    flinkConfiguration.setString(HighAvailabilityOptions.HA_STORAGE_PATH, temporaryFolder.newFolder().toString());
    flinkConfiguration.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, slots);
    try {
        Scheduler scheduler = new Scheduler(TestingUtils.defaultExecutionContext());
        MySubmittedJobGraphStore mySubmittedJobGraphStore = new MySubmittedJobGraphStore();
        MyCheckpointStore checkpointStore = new MyCheckpointStore();
        CheckpointIDCounter checkpointCounter = new StandaloneCheckpointIDCounter();
        CheckpointRecoveryFactory checkpointStateFactory = new MyCheckpointRecoveryFactory(checkpointStore, checkpointCounter);
        TestingLeaderElectionService myLeaderElectionService = new TestingLeaderElectionService();
        TestingLeaderRetrievalService myLeaderRetrievalService = new TestingLeaderRetrievalService();
        InstanceManager instanceManager = new InstanceManager();
        instanceManager.addInstanceListener(scheduler);
        archive = system.actorOf(Props.create(MemoryArchivist.class, 10));
        Props jobManagerProps = Props.create(TestingJobManager.class, flinkConfiguration, TestingUtils.defaultExecutor(), TestingUtils.defaultExecutor(), instanceManager, scheduler, new BlobLibraryCacheManager(new BlobServer(flinkConfiguration), 3600000), archive, new FixedDelayRestartStrategy.FixedDelayRestartStrategyFactory(Int.MaxValue(), 100), timeout, myLeaderElectionService, mySubmittedJobGraphStore, checkpointStateFactory, jobRecoveryTimeout, Option.apply(null));
        jobManager = system.actorOf(jobManagerProps);
        ActorGateway gateway = new AkkaActorGateway(jobManager, leaderSessionID);
        taskManager = TaskManager.startTaskManagerComponentsAndActor(flinkConfiguration, ResourceID.generate(), system, "localhost", Option.apply("taskmanager"), Option.apply((LeaderRetrievalService) myLeaderRetrievalService), true, TestingTaskManager.class);
        ActorGateway tmGateway = new AkkaActorGateway(taskManager, leaderSessionID);
        Future<Object> tmAlive = tmGateway.ask(TestingMessages.getAlive(), deadline.timeLeft());
        Await.ready(tmAlive, deadline.timeLeft());
        JobVertex sourceJobVertex = new JobVertex("Source");
        sourceJobVertex.setInvokableClass(BlockingStatefulInvokable.class);
        sourceJobVertex.setParallelism(slots);
        JobGraph jobGraph = new JobGraph("TestingJob", sourceJobVertex);
        List<JobVertexID> vertexId = Collections.singletonList(sourceJobVertex.getID());
        jobGraph.setSnapshotSettings(new JobSnapshottingSettings(vertexId, vertexId, vertexId, 100, 10 * 60 * 1000, 0, 1, ExternalizedCheckpointSettings.none(), null, true));
        BlockingStatefulInvokable.initializeStaticHelpers(slots);
        Future<Object> isLeader = gateway.ask(TestingJobManagerMessages.getNotifyWhenLeader(), deadline.timeLeft());
        Future<Object> isConnectedToJobManager = tmGateway.ask(new TestingTaskManagerMessages.NotifyWhenRegisteredAtJobManager(jobManager), deadline.timeLeft());
        // tell jobManager that he's the leader
        myLeaderElectionService.isLeader(leaderSessionID);
        // tell taskManager who's the leader
        myLeaderRetrievalService.notifyListener(gateway.path(), leaderSessionID);
        Await.ready(isLeader, deadline.timeLeft());
        Await.ready(isConnectedToJobManager, deadline.timeLeft());
        // submit blocking job
        Future<Object> jobSubmitted = gateway.ask(new JobManagerMessages.SubmitJob(jobGraph, ListeningBehaviour.DETACHED), deadline.timeLeft());
        Await.ready(jobSubmitted, deadline.timeLeft());
        // Wait for some checkpoints to complete
        BlockingStatefulInvokable.awaitCompletedCheckpoints();
        Future<Object> jobRemoved = gateway.ask(new TestingJobManagerMessages.NotifyWhenJobRemoved(jobGraph.getJobID()), deadline.timeLeft());
        // Revoke leadership
        myLeaderElectionService.notLeader();
        // check that the job gets removed from the JobManager
        Await.ready(jobRemoved, deadline.timeLeft());
        // but stays in the submitted job graph store
        assertTrue(mySubmittedJobGraphStore.contains(jobGraph.getJobID()));
        Future<Object> jobRunning = gateway.ask(new TestingJobManagerMessages.NotifyWhenJobStatus(jobGraph.getJobID(), JobStatus.RUNNING), deadline.timeLeft());
        // Make JobManager again a leader
        myLeaderElectionService.isLeader(newLeaderSessionID);
        // tell the TaskManager about it
        myLeaderRetrievalService.notifyListener(gateway.path(), newLeaderSessionID);
        // wait that the job is recovered and reaches state RUNNING
        Await.ready(jobRunning, deadline.timeLeft());
        Future<Object> jobFinished = gateway.ask(new TestingJobManagerMessages.NotifyWhenJobRemoved(jobGraph.getJobID()), deadline.timeLeft());
        BlockingInvokable.unblock();
        // wait til the job has finished
        Await.ready(jobFinished, deadline.timeLeft());
        // check that the job has been removed from the submitted job graph store
        assertFalse(mySubmittedJobGraphStore.contains(jobGraph.getJobID()));
        // Check that state has been recovered
        long[] recoveredStates = BlockingStatefulInvokable.getRecoveredStates();
        for (long state : recoveredStates) {
            boolean isExpected = state >= BlockingStatefulInvokable.NUM_CHECKPOINTS_TO_COMPLETE;
            assertTrue("Did not recover checkpoint state correctly, expecting >= " + BlockingStatefulInvokable.NUM_CHECKPOINTS_TO_COMPLETE + ", but state was " + state, isExpected);
        }
    } finally {
        if (archive != null) {
            archive.tell(PoisonPill.getInstance(), ActorRef.noSender());
        }
        if (jobManager != null) {
            jobManager.tell(PoisonPill.getInstance(), ActorRef.noSender());
        }
        if (taskManager != null) {
            taskManager.tell(PoisonPill.getInstance(), ActorRef.noSender());
        }
    }
}
Also used : AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) BlobLibraryCacheManager(org.apache.flink.runtime.execution.librarycache.BlobLibraryCacheManager) Configuration(org.apache.flink.configuration.Configuration) TestingLeaderRetrievalService(org.apache.flink.runtime.leaderelection.TestingLeaderRetrievalService) FixedDelayRestartStrategy(org.apache.flink.runtime.executiongraph.restart.FixedDelayRestartStrategy) ActorRef(akka.actor.ActorRef) Scheduler(org.apache.flink.runtime.jobmanager.scheduler.Scheduler) InstanceManager(org.apache.flink.runtime.instance.InstanceManager) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) Props(akka.actor.Props) TestingJobManagerMessages(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) TestingTaskManager(org.apache.flink.runtime.testingUtils.TestingTaskManager) BlobServer(org.apache.flink.runtime.blob.BlobServer) CheckpointIDCounter(org.apache.flink.runtime.checkpoint.CheckpointIDCounter) StandaloneCheckpointIDCounter(org.apache.flink.runtime.checkpoint.StandaloneCheckpointIDCounter) UUID(java.util.UUID) TestingTaskManagerMessages(org.apache.flink.runtime.testingUtils.TestingTaskManagerMessages) TestingLeaderElectionService(org.apache.flink.runtime.leaderelection.TestingLeaderElectionService) Deadline(scala.concurrent.duration.Deadline) JobSnapshottingSettings(org.apache.flink.runtime.jobgraph.tasks.JobSnapshottingSettings) JobManagerMessages(org.apache.flink.runtime.messages.JobManagerMessages) TestingJobManagerMessages(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages) FiniteDuration(scala.concurrent.duration.FiniteDuration) CheckpointRecoveryFactory(org.apache.flink.runtime.checkpoint.CheckpointRecoveryFactory) CompletedCheckpoint(org.apache.flink.runtime.checkpoint.CompletedCheckpoint) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) JobVertex(org.apache.flink.runtime.jobgraph.JobVertex) StandaloneCheckpointIDCounter(org.apache.flink.runtime.checkpoint.StandaloneCheckpointIDCounter) Test(org.junit.Test)

Example 2 with StandaloneCheckpointIDCounter

use of org.apache.flink.runtime.checkpoint.StandaloneCheckpointIDCounter in project flink by apache.

the class DefaultSchedulerTest method doTestCheckpointCleanerIsClosedAfterCheckpointServices.

/**
 * Visible for re-use in {@link
 * org.apache.flink.runtime.scheduler.adaptive.AdaptiveSchedulerTest}.
 */
public static void doTestCheckpointCleanerIsClosedAfterCheckpointServices(BiFunction<CheckpointRecoveryFactory, CheckpointsCleaner, SchedulerNG> schedulerFactory, ScheduledExecutorService executorService) throws Exception {
    final CountDownLatch checkpointServicesShutdownBlocked = new CountDownLatch(1);
    final CountDownLatch cleanerClosed = new CountDownLatch(1);
    final CompletedCheckpointStore completedCheckpointStore = new StandaloneCompletedCheckpointStore(1) {

        @Override
        public void shutdown(JobStatus jobStatus, CheckpointsCleaner checkpointsCleaner) throws Exception {
            checkpointServicesShutdownBlocked.await();
            super.shutdown(jobStatus, checkpointsCleaner);
        }
    };
    final CheckpointIDCounter checkpointIDCounter = new StandaloneCheckpointIDCounter() {

        @Override
        public void shutdown(JobStatus jobStatus) throws Exception {
            checkpointServicesShutdownBlocked.await();
            super.shutdown(jobStatus);
        }
    };
    final CheckpointsCleaner checkpointsCleaner = new CheckpointsCleaner() {

        @Override
        public synchronized CompletableFuture<Void> closeAsync() {
            cleanerClosed.countDown();
            return super.closeAsync();
        }
    };
    final SchedulerNG scheduler = schedulerFactory.apply(new TestingCheckpointRecoveryFactory(completedCheckpointStore, checkpointIDCounter), checkpointsCleaner);
    final CompletableFuture<Void> schedulerClosed = new CompletableFuture<>();
    final CountDownLatch schedulerClosing = new CountDownLatch(1);
    executorService.submit(() -> {
        scheduler.closeAsync().thenRun(() -> schedulerClosed.complete(null));
        schedulerClosing.countDown();
    });
    // Wait for scheduler to start closing.
    schedulerClosing.await();
    assertFalse("CheckpointCleaner should not close before checkpoint services.", cleanerClosed.await(10, TimeUnit.MILLISECONDS));
    checkpointServicesShutdownBlocked.countDown();
    cleanerClosed.await();
    schedulerClosed.get();
}
Also used : JobStatus(org.apache.flink.api.common.JobStatus) CompletableFuture(java.util.concurrent.CompletableFuture) StandaloneCompletedCheckpointStore(org.apache.flink.runtime.checkpoint.StandaloneCompletedCheckpointStore) CheckpointsCleaner(org.apache.flink.runtime.checkpoint.CheckpointsCleaner) CheckpointIDCounter(org.apache.flink.runtime.checkpoint.CheckpointIDCounter) StandaloneCheckpointIDCounter(org.apache.flink.runtime.checkpoint.StandaloneCheckpointIDCounter) CountDownLatch(java.util.concurrent.CountDownLatch) TestingCheckpointRecoveryFactory(org.apache.flink.runtime.checkpoint.TestingCheckpointRecoveryFactory) StandaloneCompletedCheckpointStore(org.apache.flink.runtime.checkpoint.StandaloneCompletedCheckpointStore) CompletedCheckpointStore(org.apache.flink.runtime.checkpoint.CompletedCheckpointStore) StandaloneCheckpointIDCounter(org.apache.flink.runtime.checkpoint.StandaloneCheckpointIDCounter)

Example 3 with StandaloneCheckpointIDCounter

use of org.apache.flink.runtime.checkpoint.StandaloneCheckpointIDCounter in project flink by apache.

the class AdaptiveSchedulerTest method testExceptionHistoryWithTaskFailureFromStopWithSavepoint.

@Test
public void testExceptionHistoryWithTaskFailureFromStopWithSavepoint() throws Exception {
    final Exception expectedException = new Exception("Expected Local Exception");
    Consumer<JobGraph> setupJobGraph = jobGraph -> jobGraph.setSnapshotSettings(new JobCheckpointingSettings(CheckpointCoordinatorConfiguration.builder().build(), null));
    final CompletedCheckpointStore completedCheckpointStore = new StandaloneCompletedCheckpointStore(1);
    final CheckpointIDCounter checkpointIDCounter = new StandaloneCheckpointIDCounter();
    final CheckpointsCleaner checkpointCleaner = new CheckpointsCleaner();
    TestingCheckpointRecoveryFactory checkpointRecoveryFactory = new TestingCheckpointRecoveryFactory(completedCheckpointStore, checkpointIDCounter);
    Consumer<AdaptiveSchedulerBuilder> setupScheduler = builder -> builder.setCheckpointRecoveryFactory(checkpointRecoveryFactory).setCheckpointCleaner(checkpointCleaner);
    BiConsumer<AdaptiveScheduler, List<ExecutionAttemptID>> testLogic = (scheduler, attemptIds) -> {
        final ExecutionAttemptID attemptId = attemptIds.get(1);
        scheduler.stopWithSavepoint("file:///tmp/target", true, SavepointFormatType.CANONICAL);
        scheduler.updateTaskExecutionState(new TaskExecutionStateTransition(new TaskExecutionState(attemptId, ExecutionState.FAILED, expectedException)));
    };
    final Iterable<RootExceptionHistoryEntry> actualExceptionHistory = runExceptionHistoryTests(testLogic, setupScheduler, setupJobGraph);
    assertThat(actualExceptionHistory).hasSize(1);
    final RootExceptionHistoryEntry failure = actualExceptionHistory.iterator().next();
    assertThat(failure.getException().deserializeError(classLoader)).isEqualTo(expectedException);
}
Also used : Arrays(java.util.Arrays) ResourceRequirement(org.apache.flink.runtime.slots.ResourceRequirement) TaskNotRunningException(org.apache.flink.runtime.operators.coordination.TaskNotRunningException) ArchivedExecution(org.apache.flink.runtime.executiongraph.ArchivedExecution) TestingSlotAllocator(org.apache.flink.runtime.scheduler.adaptive.allocator.TestingSlotAllocator) CheckpointException(org.apache.flink.runtime.checkpoint.CheckpointException) ResultPartitionID(org.apache.flink.runtime.io.network.partition.ResultPartitionID) TestingFatalErrorHandler(org.apache.flink.runtime.util.TestingFatalErrorHandler) MetricRegistry(org.apache.flink.runtime.metrics.MetricRegistry) Duration(java.time.Duration) ClassRule(org.junit.ClassRule) TestingCheckpointRecoveryFactory(org.apache.flink.runtime.checkpoint.TestingCheckpointRecoveryFactory) ManuallyTriggeredComponentMainThreadExecutor(org.apache.flink.runtime.concurrent.ManuallyTriggeredComponentMainThreadExecutor) BlockingQueue(java.util.concurrent.BlockingQueue) MetricOptions(org.apache.flink.configuration.MetricOptions) JobManagerOptions(org.apache.flink.configuration.JobManagerOptions) Executors(java.util.concurrent.Executors) MetricNames(org.apache.flink.runtime.metrics.MetricNames) ArrayBlockingQueue(java.util.concurrent.ArrayBlockingQueue) VertexParallelismStore(org.apache.flink.runtime.scheduler.VertexParallelismStore) Time(org.apache.flink.api.common.time.Time) RootExceptionHistoryEntry(org.apache.flink.runtime.scheduler.exceptionhistory.RootExceptionHistoryEntry) FlinkException(org.apache.flink.util.FlinkException) ComponentMainThreadExecutor(org.apache.flink.runtime.concurrent.ComponentMainThreadExecutor) UpTimeGauge(org.apache.flink.runtime.executiongraph.metrics.UpTimeGauge) ExceptionHistoryEntry(org.apache.flink.runtime.scheduler.exceptionhistory.ExceptionHistoryEntry) LocalTaskManagerLocation(org.apache.flink.runtime.taskmanager.LocalTaskManagerLocation) ResourceCounter(org.apache.flink.runtime.util.ResourceCounter) JobStatus(org.apache.flink.api.common.JobStatus) DefaultAllocatedSlotPool(org.apache.flink.runtime.jobmaster.slotpool.DefaultAllocatedSlotPool) ArrayList(java.util.ArrayList) SchedulerNG(org.apache.flink.runtime.scheduler.SchedulerNG) DownTimeGauge(org.apache.flink.runtime.executiongraph.metrics.DownTimeGauge) PartitionProducerDisposedException(org.apache.flink.runtime.jobmanager.PartitionProducerDisposedException) Gauge(org.apache.flink.metrics.Gauge) ScheduledExecutorService(java.util.concurrent.ScheduledExecutorService) BiConsumer(java.util.function.BiConsumer) FixedDelayRestartBackoffTimeStrategy(org.apache.flink.runtime.executiongraph.failover.flip1.FixedDelayRestartBackoffTimeStrategy) Nullable(javax.annotation.Nullable) ArchivedExecutionVertex(org.apache.flink.runtime.executiongraph.ArchivedExecutionVertex) TestExecutorResource(org.apache.flink.testutils.executor.TestExecutorResource) ExecutionState(org.apache.flink.runtime.execution.ExecutionState) CheckpointsCleaner(org.apache.flink.runtime.checkpoint.CheckpointsCleaner) TestOperatorEvent(org.apache.flink.runtime.operators.coordination.TestOperatorEvent) Test(org.junit.Test) IOException(java.io.IOException) IterableUtils(org.apache.flink.util.IterableUtils) SimpleAckingTaskManagerGateway(org.apache.flink.runtime.executiongraph.utils.SimpleAckingTaskManagerGateway) ExecutionException(java.util.concurrent.ExecutionException) TaskExecutionStateTransition(org.apache.flink.runtime.executiongraph.TaskExecutionStateTransition) JobID(org.apache.flink.api.common.JobID) NoRestartBackoffTimeStrategy(org.apache.flink.runtime.executiongraph.failover.flip1.NoRestartBackoffTimeStrategy) SlotPoolTestUtils.offerSlots(org.apache.flink.runtime.jobmaster.slotpool.SlotPoolTestUtils.offerSlots) JobManagerMetricGroup(org.apache.flink.runtime.metrics.groups.JobManagerMetricGroup) ComponentMainThreadExecutorServiceAdapter(org.apache.flink.runtime.concurrent.ComponentMainThreadExecutorServiceAdapter) Assertions.assertThat(org.assertj.core.api.Assertions.assertThat) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) CheckpointCoordinatorConfiguration(org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration) SuppressRestartsException(org.apache.flink.runtime.execution.SuppressRestartsException) ArchivedExecutionGraphTest(org.apache.flink.runtime.executiongraph.ArchivedExecutionGraphTest) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) TestLogger(org.apache.flink.util.TestLogger) JobCheckpointingSettings(org.apache.flink.runtime.jobgraph.tasks.JobCheckpointingSettings) CheckpointIDCounter(org.apache.flink.runtime.checkpoint.CheckpointIDCounter) ArchivedExecutionJobVertex(org.apache.flink.runtime.executiongraph.ArchivedExecutionJobVertex) TestingCompletedCheckpointStore(org.apache.flink.runtime.checkpoint.TestingCompletedCheckpointStore) TestRestartBackoffTimeStrategy(org.apache.flink.runtime.executiongraph.failover.flip1.TestRestartBackoffTimeStrategy) IntermediateDataSetID(org.apache.flink.runtime.jobgraph.IntermediateDataSetID) Collectors(java.util.stream.Collectors) ResourceProfile(org.apache.flink.runtime.clusterframework.types.ResourceProfile) DefaultDeclarativeSlotPool(org.apache.flink.runtime.jobmaster.slotpool.DefaultDeclarativeSlotPool) List(java.util.List) StandaloneCheckpointIDCounter(org.apache.flink.runtime.checkpoint.StandaloneCheckpointIDCounter) CoordinationRequest(org.apache.flink.runtime.operators.coordination.CoordinationRequest) OperatorID(org.apache.flink.runtime.jobgraph.OperatorID) Optional(java.util.Optional) SchedulerExecutionMode(org.apache.flink.configuration.SchedulerExecutionMode) KeyGroupRangeAssignment(org.apache.flink.runtime.state.KeyGroupRangeAssignment) OneShotLatch(org.apache.flink.core.testutils.OneShotLatch) SchedulerBase(org.apache.flink.runtime.scheduler.SchedulerBase) SavepointFormatType(org.apache.flink.core.execution.SavepointFormatType) JobVertex(org.apache.flink.runtime.jobgraph.JobVertex) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) CompletableFuture(java.util.concurrent.CompletableFuture) VertexParallelismInformation(org.apache.flink.runtime.scheduler.VertexParallelismInformation) TaskDeploymentDescriptor(org.apache.flink.runtime.deployment.TaskDeploymentDescriptor) TestingCheckpointIDCounter(org.apache.flink.runtime.checkpoint.TestingCheckpointIDCounter) TestingMetricRegistry(org.apache.flink.runtime.metrics.util.TestingMetricRegistry) Nonnull(javax.annotation.Nonnull) StandaloneCompletedCheckpointStore(org.apache.flink.runtime.checkpoint.StandaloneCompletedCheckpointStore) DefaultSchedulerTest(org.apache.flink.runtime.scheduler.DefaultSchedulerTest) JobGraphTestUtils.streamingJobGraph(org.apache.flink.runtime.jobgraph.JobGraphTestUtils.streamingJobGraph) ArchivedExecutionGraph(org.apache.flink.runtime.executiongraph.ArchivedExecutionGraph) ArchivedExecutionGraphBuilder(org.apache.flink.runtime.rest.handler.legacy.utils.ArchivedExecutionGraphBuilder) Logger(org.slf4j.Logger) Configuration(org.apache.flink.configuration.Configuration) CompletedCheckpointStore(org.apache.flink.runtime.checkpoint.CompletedCheckpointStore) JobStatusListener(org.apache.flink.runtime.executiongraph.JobStatusListener) DefaultDeclarativeSlotPoolTest.createSlotOffersForResourceRequirements(org.apache.flink.runtime.jobmaster.slotpool.DefaultDeclarativeSlotPoolTest.createSlotOffersForResourceRequirements) TimeUnit(java.util.concurrent.TimeUnit) Consumer(java.util.function.Consumer) ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) ExecutionGraphTestUtils.createNoOpVertex(org.apache.flink.runtime.executiongraph.ExecutionGraphTestUtils.createNoOpVertex) SchedulerTestingUtils.enableCheckpointing(org.apache.flink.runtime.scheduler.SchedulerTestingUtils.enableCheckpointing) TaskExecutionState(org.apache.flink.runtime.taskmanager.TaskExecutionState) TemporaryFolder(org.junit.rules.TemporaryFolder) ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) TaskExecutionStateTransition(org.apache.flink.runtime.executiongraph.TaskExecutionStateTransition) RootExceptionHistoryEntry(org.apache.flink.runtime.scheduler.exceptionhistory.RootExceptionHistoryEntry) JobCheckpointingSettings(org.apache.flink.runtime.jobgraph.tasks.JobCheckpointingSettings) TaskNotRunningException(org.apache.flink.runtime.operators.coordination.TaskNotRunningException) CheckpointException(org.apache.flink.runtime.checkpoint.CheckpointException) FlinkException(org.apache.flink.util.FlinkException) PartitionProducerDisposedException(org.apache.flink.runtime.jobmanager.PartitionProducerDisposedException) IOException(java.io.IOException) ExecutionException(java.util.concurrent.ExecutionException) SuppressRestartsException(org.apache.flink.runtime.execution.SuppressRestartsException) TestingCheckpointRecoveryFactory(org.apache.flink.runtime.checkpoint.TestingCheckpointRecoveryFactory) TaskExecutionState(org.apache.flink.runtime.taskmanager.TaskExecutionState) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) JobGraphTestUtils.streamingJobGraph(org.apache.flink.runtime.jobgraph.JobGraphTestUtils.streamingJobGraph) StandaloneCompletedCheckpointStore(org.apache.flink.runtime.checkpoint.StandaloneCompletedCheckpointStore) CheckpointsCleaner(org.apache.flink.runtime.checkpoint.CheckpointsCleaner) ArrayList(java.util.ArrayList) List(java.util.List) CheckpointIDCounter(org.apache.flink.runtime.checkpoint.CheckpointIDCounter) StandaloneCheckpointIDCounter(org.apache.flink.runtime.checkpoint.StandaloneCheckpointIDCounter) TestingCheckpointIDCounter(org.apache.flink.runtime.checkpoint.TestingCheckpointIDCounter) TestingCompletedCheckpointStore(org.apache.flink.runtime.checkpoint.TestingCompletedCheckpointStore) StandaloneCompletedCheckpointStore(org.apache.flink.runtime.checkpoint.StandaloneCompletedCheckpointStore) CompletedCheckpointStore(org.apache.flink.runtime.checkpoint.CompletedCheckpointStore) StandaloneCheckpointIDCounter(org.apache.flink.runtime.checkpoint.StandaloneCheckpointIDCounter) Test(org.junit.Test) ArchivedExecutionGraphTest(org.apache.flink.runtime.executiongraph.ArchivedExecutionGraphTest) DefaultSchedulerTest(org.apache.flink.runtime.scheduler.DefaultSchedulerTest)

Example 4 with StandaloneCheckpointIDCounter

use of org.apache.flink.runtime.checkpoint.StandaloneCheckpointIDCounter in project flink by apache.

the class DefaultExecutionGraphFactoryTest method testRestoringModifiedJobFromSavepointFails.

@Test
public void testRestoringModifiedJobFromSavepointFails() throws Exception {
    final JobGraph jobGraphWithNewOperator = createJobGraphWithSavepoint(false, 42L);
    final ExecutionGraphFactory executionGraphFactory = createExecutionGraphFactory();
    try {
        executionGraphFactory.createAndRestoreExecutionGraph(jobGraphWithNewOperator, new StandaloneCompletedCheckpointStore(1), new CheckpointsCleaner(), new StandaloneCheckpointIDCounter(), TaskDeploymentDescriptorFactory.PartitionLocationConstraint.CAN_BE_UNKNOWN, 0L, new DefaultVertexAttemptNumberStore(), SchedulerBase.computeVertexParallelismStore(jobGraphWithNewOperator), (execution, previousState, newState) -> {
        }, log);
        fail("Expected ExecutionGraph creation to fail because of non restored state.");
    } catch (Exception e) {
        assertThat(e, FlinkMatchers.containsMessage("Failed to rollback to checkpoint/savepoint"));
    }
}
Also used : DefaultVertexAttemptNumberStore(org.apache.flink.runtime.executiongraph.DefaultVertexAttemptNumberStore) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) StandaloneCompletedCheckpointStore(org.apache.flink.runtime.checkpoint.StandaloneCompletedCheckpointStore) CheckpointsCleaner(org.apache.flink.runtime.checkpoint.CheckpointsCleaner) IOException(java.io.IOException) StandaloneCheckpointIDCounter(org.apache.flink.runtime.checkpoint.StandaloneCheckpointIDCounter) Test(org.junit.Test)

Example 5 with StandaloneCheckpointIDCounter

use of org.apache.flink.runtime.checkpoint.StandaloneCheckpointIDCounter in project flink by apache.

the class DefaultExecutionGraphFactoryTest method testRestoringModifiedJobFromSavepointWithAllowNonRestoredStateSucceeds.

@Test
public void testRestoringModifiedJobFromSavepointWithAllowNonRestoredStateSucceeds() throws Exception {
    // create savepoint data
    final long savepointId = 42L;
    final JobGraph jobGraphWithNewOperator = createJobGraphWithSavepoint(true, savepointId);
    final ExecutionGraphFactory executionGraphFactory = createExecutionGraphFactory();
    final StandaloneCompletedCheckpointStore completedCheckpointStore = new StandaloneCompletedCheckpointStore(1);
    executionGraphFactory.createAndRestoreExecutionGraph(jobGraphWithNewOperator, completedCheckpointStore, new CheckpointsCleaner(), new StandaloneCheckpointIDCounter(), TaskDeploymentDescriptorFactory.PartitionLocationConstraint.CAN_BE_UNKNOWN, 0L, new DefaultVertexAttemptNumberStore(), SchedulerBase.computeVertexParallelismStore(jobGraphWithNewOperator), (execution, previousState, newState) -> {
    }, log);
    final CompletedCheckpoint savepoint = completedCheckpointStore.getLatestCheckpoint();
    MatcherAssert.assertThat(savepoint, notNullValue());
    MatcherAssert.assertThat(savepoint.getCheckpointID(), Matchers.is(savepointId));
}
Also used : DefaultVertexAttemptNumberStore(org.apache.flink.runtime.executiongraph.DefaultVertexAttemptNumberStore) CompletedCheckpoint(org.apache.flink.runtime.checkpoint.CompletedCheckpoint) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) StandaloneCompletedCheckpointStore(org.apache.flink.runtime.checkpoint.StandaloneCompletedCheckpointStore) CheckpointsCleaner(org.apache.flink.runtime.checkpoint.CheckpointsCleaner) StandaloneCheckpointIDCounter(org.apache.flink.runtime.checkpoint.StandaloneCheckpointIDCounter) Test(org.junit.Test)

Aggregations

StandaloneCheckpointIDCounter (org.apache.flink.runtime.checkpoint.StandaloneCheckpointIDCounter)6 CheckpointsCleaner (org.apache.flink.runtime.checkpoint.CheckpointsCleaner)5 StandaloneCompletedCheckpointStore (org.apache.flink.runtime.checkpoint.StandaloneCompletedCheckpointStore)5 JobGraph (org.apache.flink.runtime.jobgraph.JobGraph)5 Test (org.junit.Test)5 CheckpointIDCounter (org.apache.flink.runtime.checkpoint.CheckpointIDCounter)4 IOException (java.io.IOException)3 CompletableFuture (java.util.concurrent.CompletableFuture)3 JobStatus (org.apache.flink.api.common.JobStatus)3 Configuration (org.apache.flink.configuration.Configuration)3 CompletedCheckpointStore (org.apache.flink.runtime.checkpoint.CompletedCheckpointStore)3 Duration (java.time.Duration)2 ArrayList (java.util.ArrayList)2 Arrays (java.util.Arrays)2 List (java.util.List)2 Optional (java.util.Optional)2 ArrayBlockingQueue (java.util.concurrent.ArrayBlockingQueue)2 BlockingQueue (java.util.concurrent.BlockingQueue)2 ExecutionException (java.util.concurrent.ExecutionException)2 Executors (java.util.concurrent.Executors)2