Search in sources :

Example 1 with CheckpointRecoveryFactory

use of org.apache.flink.runtime.checkpoint.CheckpointRecoveryFactory in project flink by apache.

the class JobManagerHARecoveryTest method testJobRecoveryWhenLosingLeadership.

/**
	 * Tests that the persisted job is not removed from the SubmittedJobGraphStore if the JobManager
	 * loses its leadership. Furthermore, it tests that the job manager can recover the job from
	 * the SubmittedJobGraphStore and checkpoint state is recovered as well.
	 */
@Test
public void testJobRecoveryWhenLosingLeadership() throws Exception {
    FiniteDuration timeout = new FiniteDuration(30, TimeUnit.SECONDS);
    FiniteDuration jobRecoveryTimeout = new FiniteDuration(3, TimeUnit.SECONDS);
    Deadline deadline = new FiniteDuration(2, TimeUnit.MINUTES).fromNow();
    Configuration flinkConfiguration = new Configuration();
    UUID leaderSessionID = UUID.randomUUID();
    UUID newLeaderSessionID = UUID.randomUUID();
    int slots = 2;
    ActorRef archive = null;
    ActorRef jobManager = null;
    ActorRef taskManager = null;
    flinkConfiguration.setString(HighAvailabilityOptions.HA_MODE, "zookeeper");
    flinkConfiguration.setString(HighAvailabilityOptions.HA_STORAGE_PATH, temporaryFolder.newFolder().toString());
    flinkConfiguration.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, slots);
    try {
        Scheduler scheduler = new Scheduler(TestingUtils.defaultExecutionContext());
        MySubmittedJobGraphStore mySubmittedJobGraphStore = new MySubmittedJobGraphStore();
        MyCheckpointStore checkpointStore = new MyCheckpointStore();
        CheckpointIDCounter checkpointCounter = new StandaloneCheckpointIDCounter();
        CheckpointRecoveryFactory checkpointStateFactory = new MyCheckpointRecoveryFactory(checkpointStore, checkpointCounter);
        TestingLeaderElectionService myLeaderElectionService = new TestingLeaderElectionService();
        TestingLeaderRetrievalService myLeaderRetrievalService = new TestingLeaderRetrievalService();
        InstanceManager instanceManager = new InstanceManager();
        instanceManager.addInstanceListener(scheduler);
        archive = system.actorOf(Props.create(MemoryArchivist.class, 10));
        Props jobManagerProps = Props.create(TestingJobManager.class, flinkConfiguration, TestingUtils.defaultExecutor(), TestingUtils.defaultExecutor(), instanceManager, scheduler, new BlobLibraryCacheManager(new BlobServer(flinkConfiguration), 3600000), archive, new FixedDelayRestartStrategy.FixedDelayRestartStrategyFactory(Int.MaxValue(), 100), timeout, myLeaderElectionService, mySubmittedJobGraphStore, checkpointStateFactory, jobRecoveryTimeout, Option.apply(null));
        jobManager = system.actorOf(jobManagerProps);
        ActorGateway gateway = new AkkaActorGateway(jobManager, leaderSessionID);
        taskManager = TaskManager.startTaskManagerComponentsAndActor(flinkConfiguration, ResourceID.generate(), system, "localhost", Option.apply("taskmanager"), Option.apply((LeaderRetrievalService) myLeaderRetrievalService), true, TestingTaskManager.class);
        ActorGateway tmGateway = new AkkaActorGateway(taskManager, leaderSessionID);
        Future<Object> tmAlive = tmGateway.ask(TestingMessages.getAlive(), deadline.timeLeft());
        Await.ready(tmAlive, deadline.timeLeft());
        JobVertex sourceJobVertex = new JobVertex("Source");
        sourceJobVertex.setInvokableClass(BlockingStatefulInvokable.class);
        sourceJobVertex.setParallelism(slots);
        JobGraph jobGraph = new JobGraph("TestingJob", sourceJobVertex);
        List<JobVertexID> vertexId = Collections.singletonList(sourceJobVertex.getID());
        jobGraph.setSnapshotSettings(new JobSnapshottingSettings(vertexId, vertexId, vertexId, 100, 10 * 60 * 1000, 0, 1, ExternalizedCheckpointSettings.none(), null, true));
        BlockingStatefulInvokable.initializeStaticHelpers(slots);
        Future<Object> isLeader = gateway.ask(TestingJobManagerMessages.getNotifyWhenLeader(), deadline.timeLeft());
        Future<Object> isConnectedToJobManager = tmGateway.ask(new TestingTaskManagerMessages.NotifyWhenRegisteredAtJobManager(jobManager), deadline.timeLeft());
        // tell jobManager that he's the leader
        myLeaderElectionService.isLeader(leaderSessionID);
        // tell taskManager who's the leader
        myLeaderRetrievalService.notifyListener(gateway.path(), leaderSessionID);
        Await.ready(isLeader, deadline.timeLeft());
        Await.ready(isConnectedToJobManager, deadline.timeLeft());
        // submit blocking job
        Future<Object> jobSubmitted = gateway.ask(new JobManagerMessages.SubmitJob(jobGraph, ListeningBehaviour.DETACHED), deadline.timeLeft());
        Await.ready(jobSubmitted, deadline.timeLeft());
        // Wait for some checkpoints to complete
        BlockingStatefulInvokable.awaitCompletedCheckpoints();
        Future<Object> jobRemoved = gateway.ask(new TestingJobManagerMessages.NotifyWhenJobRemoved(jobGraph.getJobID()), deadline.timeLeft());
        // Revoke leadership
        myLeaderElectionService.notLeader();
        // check that the job gets removed from the JobManager
        Await.ready(jobRemoved, deadline.timeLeft());
        // but stays in the submitted job graph store
        assertTrue(mySubmittedJobGraphStore.contains(jobGraph.getJobID()));
        Future<Object> jobRunning = gateway.ask(new TestingJobManagerMessages.NotifyWhenJobStatus(jobGraph.getJobID(), JobStatus.RUNNING), deadline.timeLeft());
        // Make JobManager again a leader
        myLeaderElectionService.isLeader(newLeaderSessionID);
        // tell the TaskManager about it
        myLeaderRetrievalService.notifyListener(gateway.path(), newLeaderSessionID);
        // wait that the job is recovered and reaches state RUNNING
        Await.ready(jobRunning, deadline.timeLeft());
        Future<Object> jobFinished = gateway.ask(new TestingJobManagerMessages.NotifyWhenJobRemoved(jobGraph.getJobID()), deadline.timeLeft());
        BlockingInvokable.unblock();
        // wait til the job has finished
        Await.ready(jobFinished, deadline.timeLeft());
        // check that the job has been removed from the submitted job graph store
        assertFalse(mySubmittedJobGraphStore.contains(jobGraph.getJobID()));
        // Check that state has been recovered
        long[] recoveredStates = BlockingStatefulInvokable.getRecoveredStates();
        for (long state : recoveredStates) {
            boolean isExpected = state >= BlockingStatefulInvokable.NUM_CHECKPOINTS_TO_COMPLETE;
            assertTrue("Did not recover checkpoint state correctly, expecting >= " + BlockingStatefulInvokable.NUM_CHECKPOINTS_TO_COMPLETE + ", but state was " + state, isExpected);
        }
    } finally {
        if (archive != null) {
            archive.tell(PoisonPill.getInstance(), ActorRef.noSender());
        }
        if (jobManager != null) {
            jobManager.tell(PoisonPill.getInstance(), ActorRef.noSender());
        }
        if (taskManager != null) {
            taskManager.tell(PoisonPill.getInstance(), ActorRef.noSender());
        }
    }
}
Also used : AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) BlobLibraryCacheManager(org.apache.flink.runtime.execution.librarycache.BlobLibraryCacheManager) Configuration(org.apache.flink.configuration.Configuration) TestingLeaderRetrievalService(org.apache.flink.runtime.leaderelection.TestingLeaderRetrievalService) FixedDelayRestartStrategy(org.apache.flink.runtime.executiongraph.restart.FixedDelayRestartStrategy) ActorRef(akka.actor.ActorRef) Scheduler(org.apache.flink.runtime.jobmanager.scheduler.Scheduler) InstanceManager(org.apache.flink.runtime.instance.InstanceManager) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) Props(akka.actor.Props) TestingJobManagerMessages(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) TestingTaskManager(org.apache.flink.runtime.testingUtils.TestingTaskManager) BlobServer(org.apache.flink.runtime.blob.BlobServer) CheckpointIDCounter(org.apache.flink.runtime.checkpoint.CheckpointIDCounter) StandaloneCheckpointIDCounter(org.apache.flink.runtime.checkpoint.StandaloneCheckpointIDCounter) UUID(java.util.UUID) TestingTaskManagerMessages(org.apache.flink.runtime.testingUtils.TestingTaskManagerMessages) TestingLeaderElectionService(org.apache.flink.runtime.leaderelection.TestingLeaderElectionService) Deadline(scala.concurrent.duration.Deadline) JobSnapshottingSettings(org.apache.flink.runtime.jobgraph.tasks.JobSnapshottingSettings) JobManagerMessages(org.apache.flink.runtime.messages.JobManagerMessages) TestingJobManagerMessages(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages) FiniteDuration(scala.concurrent.duration.FiniteDuration) CheckpointRecoveryFactory(org.apache.flink.runtime.checkpoint.CheckpointRecoveryFactory) CompletedCheckpoint(org.apache.flink.runtime.checkpoint.CompletedCheckpoint) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) JobVertex(org.apache.flink.runtime.jobgraph.JobVertex) StandaloneCheckpointIDCounter(org.apache.flink.runtime.checkpoint.StandaloneCheckpointIDCounter) Test(org.junit.Test)

Example 2 with CheckpointRecoveryFactory

use of org.apache.flink.runtime.checkpoint.CheckpointRecoveryFactory in project flink by apache.

the class JobManagerLeaderElectionTest method createJobManagerProps.

private Props createJobManagerProps(Configuration configuration) throws Exception {
    LeaderElectionService leaderElectionService;
    if (HighAvailabilityMode.fromConfig(configuration) == HighAvailabilityMode.NONE) {
        leaderElectionService = new StandaloneLeaderElectionService();
    } else {
        CuratorFramework client = ZooKeeperUtils.startCuratorFramework(configuration);
        leaderElectionService = ZooKeeperUtils.createLeaderElectionService(client, configuration);
    }
    // We don't need recovery in this test
    SubmittedJobGraphStore submittedJobGraphStore = new StandaloneSubmittedJobGraphStore();
    CheckpointRecoveryFactory checkpointRecoveryFactory = new StandaloneCheckpointRecoveryFactory();
    return Props.create(TestingJobManager.class, configuration, TestingUtils.defaultExecutor(), TestingUtils.defaultExecutor(), new InstanceManager(), new Scheduler(TestingUtils.defaultExecutionContext()), new BlobLibraryCacheManager(new BlobServer(configuration), 10L), ActorRef.noSender(), new NoRestartStrategy.NoRestartStrategyFactory(), AkkaUtils.getDefaultTimeoutAsFiniteDuration(), leaderElectionService, submittedJobGraphStore, checkpointRecoveryFactory, AkkaUtils.getDefaultTimeoutAsFiniteDuration(), Option.apply(null));
}
Also used : StandaloneSubmittedJobGraphStore(org.apache.flink.runtime.jobmanager.StandaloneSubmittedJobGraphStore) SubmittedJobGraphStore(org.apache.flink.runtime.jobmanager.SubmittedJobGraphStore) BlobLibraryCacheManager(org.apache.flink.runtime.execution.librarycache.BlobLibraryCacheManager) StandaloneCheckpointRecoveryFactory(org.apache.flink.runtime.checkpoint.StandaloneCheckpointRecoveryFactory) InstanceManager(org.apache.flink.runtime.instance.InstanceManager) Scheduler(org.apache.flink.runtime.jobmanager.scheduler.Scheduler) StandaloneSubmittedJobGraphStore(org.apache.flink.runtime.jobmanager.StandaloneSubmittedJobGraphStore) CheckpointRecoveryFactory(org.apache.flink.runtime.checkpoint.CheckpointRecoveryFactory) StandaloneCheckpointRecoveryFactory(org.apache.flink.runtime.checkpoint.StandaloneCheckpointRecoveryFactory) NoRestartStrategy(org.apache.flink.runtime.executiongraph.restart.NoRestartStrategy) CuratorFramework(org.apache.curator.framework.CuratorFramework) BlobServer(org.apache.flink.runtime.blob.BlobServer)

Example 3 with CheckpointRecoveryFactory

use of org.apache.flink.runtime.checkpoint.CheckpointRecoveryFactory in project flink by apache.

the class SchedulerUtilsTest method testSharedStateRegistration.

/**
 * Check that a {@link SharedStateRegistryFactory} used by {@link SchedulerUtils} registers
 * shared checkpoint state on restore.
 */
@Test
public void testSharedStateRegistration() throws Exception {
    UUID backendId = UUID.randomUUID();
    StateHandleID key = new StateHandleID("k0");
    StreamStateHandle handle = new ByteStreamStateHandle("h0", new byte[] { 1, 2, 3 });
    CheckpointRecoveryFactory recoveryFactory = buildRecoveryFactory(buildCheckpoint(buildIncrementalHandle(key, handle, backendId)));
    CompletedCheckpointStore checkpointStore = SchedulerUtils.createCompletedCheckpointStore(new Configuration(), recoveryFactory, Executors.directExecutor(), log, new JobID());
    SharedStateRegistry sharedStateRegistry = checkpointStore.getSharedStateRegistry();
    IncrementalRemoteKeyedStateHandle newHandle = buildIncrementalHandle(key, new PlaceholderStreamStateHandle(1L), backendId);
    newHandle.registerSharedStates(sharedStateRegistry, 1L);
    assertSame(handle, newHandle.getSharedState().get(key));
}
Also used : PlaceholderStreamStateHandle(org.apache.flink.runtime.state.PlaceholderStreamStateHandle) StreamStateHandle(org.apache.flink.runtime.state.StreamStateHandle) PlaceholderStreamStateHandle(org.apache.flink.runtime.state.PlaceholderStreamStateHandle) ByteStreamStateHandle(org.apache.flink.runtime.state.memory.ByteStreamStateHandle) Configuration(org.apache.flink.configuration.Configuration) StateHandleID(org.apache.flink.runtime.state.StateHandleID) IncrementalRemoteKeyedStateHandle(org.apache.flink.runtime.state.IncrementalRemoteKeyedStateHandle) ByteStreamStateHandle(org.apache.flink.runtime.state.memory.ByteStreamStateHandle) UUID(java.util.UUID) CheckpointRecoveryFactory(org.apache.flink.runtime.checkpoint.CheckpointRecoveryFactory) StandaloneCheckpointRecoveryFactory(org.apache.flink.runtime.checkpoint.StandaloneCheckpointRecoveryFactory) JobID(org.apache.flink.api.common.JobID) CompletedCheckpointStore(org.apache.flink.runtime.checkpoint.CompletedCheckpointStore) EmbeddedCompletedCheckpointStore(org.apache.flink.runtime.checkpoint.EmbeddedCompletedCheckpointStore) SharedStateRegistry(org.apache.flink.runtime.state.SharedStateRegistry) Test(org.junit.Test)

Example 4 with CheckpointRecoveryFactory

use of org.apache.flink.runtime.checkpoint.CheckpointRecoveryFactory in project flink by apache.

the class JobMasterTest method testCheckpointPrecedesSavepointRecovery.

/**
 * Tests that an existing checkpoint will have precedence over an savepoint.
 */
@Test
public void testCheckpointPrecedesSavepointRecovery() throws Exception {
    // create savepoint data
    final long savepointId = 42L;
    final File savepointFile = createSavepoint(savepointId);
    // set savepoint settings
    final SavepointRestoreSettings savepointRestoreSettings = SavepointRestoreSettings.forPath("" + savepointFile.getAbsolutePath(), true);
    final JobGraph jobGraph = createJobGraphWithCheckpointing(savepointRestoreSettings);
    final long checkpointId = 1L;
    final CompletedCheckpoint completedCheckpoint = new CompletedCheckpoint(jobGraph.getJobID(), checkpointId, 1L, 1L, Collections.emptyMap(), null, CheckpointProperties.forCheckpoint(CheckpointRetentionPolicy.NEVER_RETAIN_AFTER_TERMINATION), new DummyCheckpointStorageLocation());
    final StandaloneCompletedCheckpointStore completedCheckpointStore = new StandaloneCompletedCheckpointStore(1);
    completedCheckpointStore.addCheckpointAndSubsumeOldestOne(completedCheckpoint, new CheckpointsCleaner(), () -> {
    });
    final CheckpointRecoveryFactory testingCheckpointRecoveryFactory = PerJobCheckpointRecoveryFactory.withoutCheckpointStoreRecovery(maxCheckpoints -> completedCheckpointStore);
    haServices.setCheckpointRecoveryFactory(testingCheckpointRecoveryFactory);
    final JobMaster jobMaster = new JobMasterBuilder(jobGraph, rpcService).createJobMaster();
    try {
        // starting the JobMaster should have read the savepoint
        final CompletedCheckpoint savepointCheckpoint = completedCheckpointStore.getLatestCheckpoint();
        assertThat(savepointCheckpoint, Matchers.notNullValue());
        assertThat(savepointCheckpoint.getCheckpointID(), is(checkpointId));
    } finally {
        RpcUtils.terminateRpcEndpoint(jobMaster, testingTimeout);
    }
}
Also used : CompletedCheckpoint(org.apache.flink.runtime.checkpoint.CompletedCheckpoint) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) StandaloneCompletedCheckpointStore(org.apache.flink.runtime.checkpoint.StandaloneCompletedCheckpointStore) CheckpointsCleaner(org.apache.flink.runtime.checkpoint.CheckpointsCleaner) PerJobCheckpointRecoveryFactory(org.apache.flink.runtime.checkpoint.PerJobCheckpointRecoveryFactory) StandaloneCheckpointRecoveryFactory(org.apache.flink.runtime.checkpoint.StandaloneCheckpointRecoveryFactory) CheckpointRecoveryFactory(org.apache.flink.runtime.checkpoint.CheckpointRecoveryFactory) File(java.io.File) JobMasterBuilder(org.apache.flink.runtime.jobmaster.utils.JobMasterBuilder) SavepointRestoreSettings(org.apache.flink.runtime.jobgraph.SavepointRestoreSettings) Test(org.junit.Test)

Example 5 with CheckpointRecoveryFactory

use of org.apache.flink.runtime.checkpoint.CheckpointRecoveryFactory in project flink by apache.

the class JobMasterTest method testRestoringFromSavepoint.

/**
 * Tests that a JobMaster will restore the given JobGraph from its savepoint upon initial
 * submission.
 */
@Test
public void testRestoringFromSavepoint() throws Exception {
    // create savepoint data
    final long savepointId = 42L;
    final File savepointFile = createSavepoint(savepointId);
    // set savepoint settings
    final SavepointRestoreSettings savepointRestoreSettings = SavepointRestoreSettings.forPath(savepointFile.getAbsolutePath(), true);
    final JobGraph jobGraph = createJobGraphWithCheckpointing(savepointRestoreSettings);
    final StandaloneCompletedCheckpointStore completedCheckpointStore = new StandaloneCompletedCheckpointStore(1);
    final CheckpointRecoveryFactory testingCheckpointRecoveryFactory = PerJobCheckpointRecoveryFactory.withoutCheckpointStoreRecovery(maxCheckpoints -> completedCheckpointStore);
    haServices.setCheckpointRecoveryFactory(testingCheckpointRecoveryFactory);
    final JobMaster jobMaster = new JobMasterBuilder(jobGraph, rpcService).withHighAvailabilityServices(haServices).createJobMaster();
    try {
        // we need to start and register the required slots to let the adaptive scheduler
        // restore from the savepoint
        jobMaster.start();
        final OneShotLatch taskSubmitLatch = new OneShotLatch();
        registerSlotsAtJobMaster(1, jobMaster.getSelfGateway(JobMasterGateway.class), jobGraph.getJobID(), new TestingTaskExecutorGatewayBuilder().setSubmitTaskConsumer((taskDeploymentDescriptor, jobMasterId) -> {
            taskSubmitLatch.trigger();
            return CompletableFuture.completedFuture(Acknowledge.get());
        }).createTestingTaskExecutorGateway(), new LocalUnresolvedTaskManagerLocation());
        // wait until a task has submitted because this guarantees that the ExecutionGraph has
        // been created
        taskSubmitLatch.await();
        final CompletedCheckpoint savepointCheckpoint = completedCheckpointStore.getLatestCheckpoint();
        assertThat(savepointCheckpoint, Matchers.notNullValue());
        assertThat(savepointCheckpoint.getCheckpointID(), is(savepointId));
    } finally {
        RpcUtils.terminateRpcEndpoint(jobMaster, testingTimeout);
    }
}
Also used : CompletedCheckpoint(org.apache.flink.runtime.checkpoint.CompletedCheckpoint) TestingTaskExecutorGatewayBuilder(org.apache.flink.runtime.taskexecutor.TestingTaskExecutorGatewayBuilder) PerJobCheckpointRecoveryFactory(org.apache.flink.runtime.checkpoint.PerJobCheckpointRecoveryFactory) StandaloneCheckpointRecoveryFactory(org.apache.flink.runtime.checkpoint.StandaloneCheckpointRecoveryFactory) CheckpointRecoveryFactory(org.apache.flink.runtime.checkpoint.CheckpointRecoveryFactory) JobMasterBuilder(org.apache.flink.runtime.jobmaster.utils.JobMasterBuilder) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) StandaloneCompletedCheckpointStore(org.apache.flink.runtime.checkpoint.StandaloneCompletedCheckpointStore) OneShotLatch(org.apache.flink.core.testutils.OneShotLatch) LocalUnresolvedTaskManagerLocation(org.apache.flink.runtime.taskmanager.LocalUnresolvedTaskManagerLocation) File(java.io.File) SavepointRestoreSettings(org.apache.flink.runtime.jobgraph.SavepointRestoreSettings) Test(org.junit.Test)

Aggregations

CheckpointRecoveryFactory (org.apache.flink.runtime.checkpoint.CheckpointRecoveryFactory)5 StandaloneCheckpointRecoveryFactory (org.apache.flink.runtime.checkpoint.StandaloneCheckpointRecoveryFactory)4 Test (org.junit.Test)4 CompletedCheckpoint (org.apache.flink.runtime.checkpoint.CompletedCheckpoint)3 JobGraph (org.apache.flink.runtime.jobgraph.JobGraph)3 File (java.io.File)2 UUID (java.util.UUID)2 Configuration (org.apache.flink.configuration.Configuration)2 BlobServer (org.apache.flink.runtime.blob.BlobServer)2 PerJobCheckpointRecoveryFactory (org.apache.flink.runtime.checkpoint.PerJobCheckpointRecoveryFactory)2 StandaloneCompletedCheckpointStore (org.apache.flink.runtime.checkpoint.StandaloneCompletedCheckpointStore)2 BlobLibraryCacheManager (org.apache.flink.runtime.execution.librarycache.BlobLibraryCacheManager)2 InstanceManager (org.apache.flink.runtime.instance.InstanceManager)2 SavepointRestoreSettings (org.apache.flink.runtime.jobgraph.SavepointRestoreSettings)2 Scheduler (org.apache.flink.runtime.jobmanager.scheduler.Scheduler)2 JobMasterBuilder (org.apache.flink.runtime.jobmaster.utils.JobMasterBuilder)2 ActorRef (akka.actor.ActorRef)1 Props (akka.actor.Props)1 CuratorFramework (org.apache.curator.framework.CuratorFramework)1 JobID (org.apache.flink.api.common.JobID)1