Search in sources :

Example 1 with BlobLibraryCacheManager

use of org.apache.flink.runtime.execution.librarycache.BlobLibraryCacheManager in project flink by apache.

the class JobManagerHARecoveryTest method testJobRecoveryWhenLosingLeadership.

/**
	 * Tests that the persisted job is not removed from the SubmittedJobGraphStore if the JobManager
	 * loses its leadership. Furthermore, it tests that the job manager can recover the job from
	 * the SubmittedJobGraphStore and checkpoint state is recovered as well.
	 */
@Test
public void testJobRecoveryWhenLosingLeadership() throws Exception {
    FiniteDuration timeout = new FiniteDuration(30, TimeUnit.SECONDS);
    FiniteDuration jobRecoveryTimeout = new FiniteDuration(3, TimeUnit.SECONDS);
    Deadline deadline = new FiniteDuration(2, TimeUnit.MINUTES).fromNow();
    Configuration flinkConfiguration = new Configuration();
    UUID leaderSessionID = UUID.randomUUID();
    UUID newLeaderSessionID = UUID.randomUUID();
    int slots = 2;
    ActorRef archive = null;
    ActorRef jobManager = null;
    ActorRef taskManager = null;
    flinkConfiguration.setString(HighAvailabilityOptions.HA_MODE, "zookeeper");
    flinkConfiguration.setString(HighAvailabilityOptions.HA_STORAGE_PATH, temporaryFolder.newFolder().toString());
    flinkConfiguration.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, slots);
    try {
        Scheduler scheduler = new Scheduler(TestingUtils.defaultExecutionContext());
        MySubmittedJobGraphStore mySubmittedJobGraphStore = new MySubmittedJobGraphStore();
        MyCheckpointStore checkpointStore = new MyCheckpointStore();
        CheckpointIDCounter checkpointCounter = new StandaloneCheckpointIDCounter();
        CheckpointRecoveryFactory checkpointStateFactory = new MyCheckpointRecoveryFactory(checkpointStore, checkpointCounter);
        TestingLeaderElectionService myLeaderElectionService = new TestingLeaderElectionService();
        TestingLeaderRetrievalService myLeaderRetrievalService = new TestingLeaderRetrievalService();
        InstanceManager instanceManager = new InstanceManager();
        instanceManager.addInstanceListener(scheduler);
        archive = system.actorOf(Props.create(MemoryArchivist.class, 10));
        Props jobManagerProps = Props.create(TestingJobManager.class, flinkConfiguration, TestingUtils.defaultExecutor(), TestingUtils.defaultExecutor(), instanceManager, scheduler, new BlobLibraryCacheManager(new BlobServer(flinkConfiguration), 3600000), archive, new FixedDelayRestartStrategy.FixedDelayRestartStrategyFactory(Int.MaxValue(), 100), timeout, myLeaderElectionService, mySubmittedJobGraphStore, checkpointStateFactory, jobRecoveryTimeout, Option.apply(null));
        jobManager = system.actorOf(jobManagerProps);
        ActorGateway gateway = new AkkaActorGateway(jobManager, leaderSessionID);
        taskManager = TaskManager.startTaskManagerComponentsAndActor(flinkConfiguration, ResourceID.generate(), system, "localhost", Option.apply("taskmanager"), Option.apply((LeaderRetrievalService) myLeaderRetrievalService), true, TestingTaskManager.class);
        ActorGateway tmGateway = new AkkaActorGateway(taskManager, leaderSessionID);
        Future<Object> tmAlive = tmGateway.ask(TestingMessages.getAlive(), deadline.timeLeft());
        Await.ready(tmAlive, deadline.timeLeft());
        JobVertex sourceJobVertex = new JobVertex("Source");
        sourceJobVertex.setInvokableClass(BlockingStatefulInvokable.class);
        sourceJobVertex.setParallelism(slots);
        JobGraph jobGraph = new JobGraph("TestingJob", sourceJobVertex);
        List<JobVertexID> vertexId = Collections.singletonList(sourceJobVertex.getID());
        jobGraph.setSnapshotSettings(new JobSnapshottingSettings(vertexId, vertexId, vertexId, 100, 10 * 60 * 1000, 0, 1, ExternalizedCheckpointSettings.none(), null, true));
        BlockingStatefulInvokable.initializeStaticHelpers(slots);
        Future<Object> isLeader = gateway.ask(TestingJobManagerMessages.getNotifyWhenLeader(), deadline.timeLeft());
        Future<Object> isConnectedToJobManager = tmGateway.ask(new TestingTaskManagerMessages.NotifyWhenRegisteredAtJobManager(jobManager), deadline.timeLeft());
        // tell jobManager that he's the leader
        myLeaderElectionService.isLeader(leaderSessionID);
        // tell taskManager who's the leader
        myLeaderRetrievalService.notifyListener(gateway.path(), leaderSessionID);
        Await.ready(isLeader, deadline.timeLeft());
        Await.ready(isConnectedToJobManager, deadline.timeLeft());
        // submit blocking job
        Future<Object> jobSubmitted = gateway.ask(new JobManagerMessages.SubmitJob(jobGraph, ListeningBehaviour.DETACHED), deadline.timeLeft());
        Await.ready(jobSubmitted, deadline.timeLeft());
        // Wait for some checkpoints to complete
        BlockingStatefulInvokable.awaitCompletedCheckpoints();
        Future<Object> jobRemoved = gateway.ask(new TestingJobManagerMessages.NotifyWhenJobRemoved(jobGraph.getJobID()), deadline.timeLeft());
        // Revoke leadership
        myLeaderElectionService.notLeader();
        // check that the job gets removed from the JobManager
        Await.ready(jobRemoved, deadline.timeLeft());
        // but stays in the submitted job graph store
        assertTrue(mySubmittedJobGraphStore.contains(jobGraph.getJobID()));
        Future<Object> jobRunning = gateway.ask(new TestingJobManagerMessages.NotifyWhenJobStatus(jobGraph.getJobID(), JobStatus.RUNNING), deadline.timeLeft());
        // Make JobManager again a leader
        myLeaderElectionService.isLeader(newLeaderSessionID);
        // tell the TaskManager about it
        myLeaderRetrievalService.notifyListener(gateway.path(), newLeaderSessionID);
        // wait that the job is recovered and reaches state RUNNING
        Await.ready(jobRunning, deadline.timeLeft());
        Future<Object> jobFinished = gateway.ask(new TestingJobManagerMessages.NotifyWhenJobRemoved(jobGraph.getJobID()), deadline.timeLeft());
        BlockingInvokable.unblock();
        // wait til the job has finished
        Await.ready(jobFinished, deadline.timeLeft());
        // check that the job has been removed from the submitted job graph store
        assertFalse(mySubmittedJobGraphStore.contains(jobGraph.getJobID()));
        // Check that state has been recovered
        long[] recoveredStates = BlockingStatefulInvokable.getRecoveredStates();
        for (long state : recoveredStates) {
            boolean isExpected = state >= BlockingStatefulInvokable.NUM_CHECKPOINTS_TO_COMPLETE;
            assertTrue("Did not recover checkpoint state correctly, expecting >= " + BlockingStatefulInvokable.NUM_CHECKPOINTS_TO_COMPLETE + ", but state was " + state, isExpected);
        }
    } finally {
        if (archive != null) {
            archive.tell(PoisonPill.getInstance(), ActorRef.noSender());
        }
        if (jobManager != null) {
            jobManager.tell(PoisonPill.getInstance(), ActorRef.noSender());
        }
        if (taskManager != null) {
            taskManager.tell(PoisonPill.getInstance(), ActorRef.noSender());
        }
    }
}
Also used : AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) BlobLibraryCacheManager(org.apache.flink.runtime.execution.librarycache.BlobLibraryCacheManager) Configuration(org.apache.flink.configuration.Configuration) TestingLeaderRetrievalService(org.apache.flink.runtime.leaderelection.TestingLeaderRetrievalService) FixedDelayRestartStrategy(org.apache.flink.runtime.executiongraph.restart.FixedDelayRestartStrategy) ActorRef(akka.actor.ActorRef) Scheduler(org.apache.flink.runtime.jobmanager.scheduler.Scheduler) InstanceManager(org.apache.flink.runtime.instance.InstanceManager) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) Props(akka.actor.Props) TestingJobManagerMessages(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) TestingTaskManager(org.apache.flink.runtime.testingUtils.TestingTaskManager) BlobServer(org.apache.flink.runtime.blob.BlobServer) CheckpointIDCounter(org.apache.flink.runtime.checkpoint.CheckpointIDCounter) StandaloneCheckpointIDCounter(org.apache.flink.runtime.checkpoint.StandaloneCheckpointIDCounter) UUID(java.util.UUID) TestingTaskManagerMessages(org.apache.flink.runtime.testingUtils.TestingTaskManagerMessages) TestingLeaderElectionService(org.apache.flink.runtime.leaderelection.TestingLeaderElectionService) Deadline(scala.concurrent.duration.Deadline) JobSnapshottingSettings(org.apache.flink.runtime.jobgraph.tasks.JobSnapshottingSettings) JobManagerMessages(org.apache.flink.runtime.messages.JobManagerMessages) TestingJobManagerMessages(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages) FiniteDuration(scala.concurrent.duration.FiniteDuration) CheckpointRecoveryFactory(org.apache.flink.runtime.checkpoint.CheckpointRecoveryFactory) CompletedCheckpoint(org.apache.flink.runtime.checkpoint.CompletedCheckpoint) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) JobVertex(org.apache.flink.runtime.jobgraph.JobVertex) StandaloneCheckpointIDCounter(org.apache.flink.runtime.checkpoint.StandaloneCheckpointIDCounter) Test(org.junit.Test)

Example 2 with BlobLibraryCacheManager

use of org.apache.flink.runtime.execution.librarycache.BlobLibraryCacheManager in project flink by apache.

the class JobManagerLeaderElectionTest method createJobManagerProps.

private Props createJobManagerProps(Configuration configuration) throws Exception {
    LeaderElectionService leaderElectionService;
    if (HighAvailabilityMode.fromConfig(configuration) == HighAvailabilityMode.NONE) {
        leaderElectionService = new StandaloneLeaderElectionService();
    } else {
        CuratorFramework client = ZooKeeperUtils.startCuratorFramework(configuration);
        leaderElectionService = ZooKeeperUtils.createLeaderElectionService(client, configuration);
    }
    // We don't need recovery in this test
    SubmittedJobGraphStore submittedJobGraphStore = new StandaloneSubmittedJobGraphStore();
    CheckpointRecoveryFactory checkpointRecoveryFactory = new StandaloneCheckpointRecoveryFactory();
    return Props.create(TestingJobManager.class, configuration, TestingUtils.defaultExecutor(), TestingUtils.defaultExecutor(), new InstanceManager(), new Scheduler(TestingUtils.defaultExecutionContext()), new BlobLibraryCacheManager(new BlobServer(configuration), 10L), ActorRef.noSender(), new NoRestartStrategy.NoRestartStrategyFactory(), AkkaUtils.getDefaultTimeoutAsFiniteDuration(), leaderElectionService, submittedJobGraphStore, checkpointRecoveryFactory, AkkaUtils.getDefaultTimeoutAsFiniteDuration(), Option.apply(null));
}
Also used : StandaloneSubmittedJobGraphStore(org.apache.flink.runtime.jobmanager.StandaloneSubmittedJobGraphStore) SubmittedJobGraphStore(org.apache.flink.runtime.jobmanager.SubmittedJobGraphStore) BlobLibraryCacheManager(org.apache.flink.runtime.execution.librarycache.BlobLibraryCacheManager) StandaloneCheckpointRecoveryFactory(org.apache.flink.runtime.checkpoint.StandaloneCheckpointRecoveryFactory) InstanceManager(org.apache.flink.runtime.instance.InstanceManager) Scheduler(org.apache.flink.runtime.jobmanager.scheduler.Scheduler) StandaloneSubmittedJobGraphStore(org.apache.flink.runtime.jobmanager.StandaloneSubmittedJobGraphStore) CheckpointRecoveryFactory(org.apache.flink.runtime.checkpoint.CheckpointRecoveryFactory) StandaloneCheckpointRecoveryFactory(org.apache.flink.runtime.checkpoint.StandaloneCheckpointRecoveryFactory) NoRestartStrategy(org.apache.flink.runtime.executiongraph.restart.NoRestartStrategy) CuratorFramework(org.apache.curator.framework.CuratorFramework) BlobServer(org.apache.flink.runtime.blob.BlobServer)

Example 3 with BlobLibraryCacheManager

use of org.apache.flink.runtime.execution.librarycache.BlobLibraryCacheManager in project flink by apache.

the class JobManagerSharedServices method fromConfiguration.

// ------------------------------------------------------------------------
// Creating the components from a configuration
// ------------------------------------------------------------------------
public static JobManagerSharedServices fromConfiguration(Configuration config, BlobServer blobServer, FatalErrorHandler fatalErrorHandler) throws Exception {
    checkNotNull(config);
    checkNotNull(blobServer);
    final String classLoaderResolveOrder = config.getString(CoreOptions.CLASSLOADER_RESOLVE_ORDER);
    final String[] alwaysParentFirstLoaderPatterns = CoreOptions.getParentFirstLoaderPatterns(config);
    final boolean failOnJvmMetaspaceOomError = config.getBoolean(CoreOptions.FAIL_ON_USER_CLASS_LOADING_METASPACE_OOM);
    final boolean checkClassLoaderLeak = config.getBoolean(CoreOptions.CHECK_LEAKED_CLASSLOADER);
    final BlobLibraryCacheManager libraryCacheManager = new BlobLibraryCacheManager(blobServer, BlobLibraryCacheManager.defaultClassLoaderFactory(FlinkUserCodeClassLoaders.ResolveOrder.fromString(classLoaderResolveOrder), alwaysParentFirstLoaderPatterns, failOnJvmMetaspaceOomError ? fatalErrorHandler : null, checkClassLoaderLeak));
    final int numberCPUCores = Hardware.getNumberCPUCores();
    final int jobManagerFuturePoolSize = config.getInteger(JobManagerOptions.JOB_MANAGER_FUTURE_POOL_SIZE, numberCPUCores);
    final ScheduledExecutorService futureExecutor = Executors.newScheduledThreadPool(jobManagerFuturePoolSize, new ExecutorThreadFactory("jobmanager-future"));
    final int jobManagerIoPoolSize = config.getInteger(JobManagerOptions.JOB_MANAGER_IO_POOL_SIZE, numberCPUCores);
    final ExecutorService ioExecutor = Executors.newFixedThreadPool(jobManagerIoPoolSize, new ExecutorThreadFactory("jobmanager-io"));
    final ShuffleMasterContext shuffleMasterContext = new ShuffleMasterContextImpl(config, fatalErrorHandler);
    final ShuffleMaster<?> shuffleMaster = ShuffleServiceLoader.loadShuffleServiceFactory(config).createShuffleMaster(shuffleMasterContext);
    shuffleMaster.start();
    return new JobManagerSharedServices(futureExecutor, ioExecutor, libraryCacheManager, shuffleMaster, blobServer);
}
Also used : ExecutorThreadFactory(org.apache.flink.util.concurrent.ExecutorThreadFactory) BlobLibraryCacheManager(org.apache.flink.runtime.execution.librarycache.BlobLibraryCacheManager) ScheduledExecutorService(java.util.concurrent.ScheduledExecutorService) ScheduledExecutorService(java.util.concurrent.ScheduledExecutorService) ExecutorService(java.util.concurrent.ExecutorService) ShuffleMasterContext(org.apache.flink.runtime.shuffle.ShuffleMasterContext) ShuffleMasterContextImpl(org.apache.flink.runtime.shuffle.ShuffleMasterContextImpl)

Example 4 with BlobLibraryCacheManager

use of org.apache.flink.runtime.execution.librarycache.BlobLibraryCacheManager in project flink by apache.

the class JobManagerServices method fromConfiguration.

// ------------------------------------------------------------------------
//  Creating the components from a configuration 
// ------------------------------------------------------------------------
public static JobManagerServices fromConfiguration(Configuration config, HighAvailabilityServices haServices) throws Exception {
    final BlobServer blobServer = new BlobServer(config, haServices);
    final long cleanupInterval = config.getLong(ConfigConstants.LIBRARY_CACHE_MANAGER_CLEANUP_INTERVAL, ConfigConstants.DEFAULT_LIBRARY_CACHE_MANAGER_CLEANUP_INTERVAL) * 1000;
    final BlobLibraryCacheManager libraryCacheManager = new BlobLibraryCacheManager(blobServer, cleanupInterval);
    final FiniteDuration timeout;
    try {
        timeout = AkkaUtils.getTimeout(config);
    } catch (NumberFormatException e) {
        throw new IllegalConfigurationException(AkkaUtils.formatDurationParingErrorMessage());
    }
    final ScheduledExecutorService futureExecutor = Executors.newScheduledThreadPool(Hardware.getNumberCPUCores(), new ExecutorThreadFactory("jobmanager-future"));
    return new JobManagerServices(futureExecutor, libraryCacheManager, RestartStrategyFactory.createRestartStrategyFactory(config), Time.of(timeout.length(), timeout.unit()));
}
Also used : ExecutorThreadFactory(org.apache.flink.runtime.util.ExecutorThreadFactory) BlobLibraryCacheManager(org.apache.flink.runtime.execution.librarycache.BlobLibraryCacheManager) ScheduledExecutorService(java.util.concurrent.ScheduledExecutorService) IllegalConfigurationException(org.apache.flink.configuration.IllegalConfigurationException) FiniteDuration(scala.concurrent.duration.FiniteDuration) BlobServer(org.apache.flink.runtime.blob.BlobServer)

Example 5 with BlobLibraryCacheManager

use of org.apache.flink.runtime.execution.librarycache.BlobLibraryCacheManager in project flink by apache.

the class TaskExecutor method associateWithJobManager.

private JobManagerConnection associateWithJobManager(JobID jobID, ResourceID resourceID, JobMasterGateway jobMasterGateway, UUID jobManagerLeaderId, int blobPort) {
    Preconditions.checkNotNull(jobID);
    Preconditions.checkNotNull(resourceID);
    Preconditions.checkNotNull(jobManagerLeaderId);
    Preconditions.checkNotNull(jobMasterGateway);
    Preconditions.checkArgument(blobPort > 0 || blobPort < MAX_BLOB_PORT, "Blob server port is out of range.");
    TaskManagerActions taskManagerActions = new TaskManagerActionsImpl(jobManagerLeaderId, jobMasterGateway);
    CheckpointResponder checkpointResponder = new RpcCheckpointResponder(jobMasterGateway);
    InetSocketAddress blobServerAddress = new InetSocketAddress(jobMasterGateway.getHostname(), blobPort);
    final LibraryCacheManager libraryCacheManager;
    try {
        final BlobCache blobCache = new BlobCache(blobServerAddress, taskManagerConfiguration.getConfiguration(), haServices);
        libraryCacheManager = new BlobLibraryCacheManager(blobCache, taskManagerConfiguration.getCleanupInterval());
    } catch (IOException e) {
        // Can't pass the IOException up - we need a RuntimeException anyway
        // two levels up where this is run asynchronously. Also, we don't
        // know whether this is caught in the thread running this method.
        final String message = "Could not create BLOB cache or library cache.";
        log.error(message, e);
        throw new RuntimeException(message, e);
    }
    ResultPartitionConsumableNotifier resultPartitionConsumableNotifier = new RpcResultPartitionConsumableNotifier(jobManagerLeaderId, jobMasterGateway, getRpcService().getExecutor(), taskManagerConfiguration.getTimeout());
    PartitionProducerStateChecker partitionStateChecker = new RpcPartitionStateChecker(jobManagerLeaderId, jobMasterGateway);
    return new JobManagerConnection(jobID, resourceID, jobMasterGateway, jobManagerLeaderId, taskManagerActions, checkpointResponder, libraryCacheManager, resultPartitionConsumableNotifier, partitionStateChecker);
}
Also used : BlobLibraryCacheManager(org.apache.flink.runtime.execution.librarycache.BlobLibraryCacheManager) CheckpointResponder(org.apache.flink.runtime.taskmanager.CheckpointResponder) RpcCheckpointResponder(org.apache.flink.runtime.taskexecutor.rpc.RpcCheckpointResponder) RpcCheckpointResponder(org.apache.flink.runtime.taskexecutor.rpc.RpcCheckpointResponder) InetSocketAddress(java.net.InetSocketAddress) BlobCache(org.apache.flink.runtime.blob.BlobCache) BlobLibraryCacheManager(org.apache.flink.runtime.execution.librarycache.BlobLibraryCacheManager) LibraryCacheManager(org.apache.flink.runtime.execution.librarycache.LibraryCacheManager) IOException(java.io.IOException) TaskManagerActions(org.apache.flink.runtime.taskmanager.TaskManagerActions) RpcPartitionStateChecker(org.apache.flink.runtime.taskexecutor.rpc.RpcPartitionStateChecker) RpcResultPartitionConsumableNotifier(org.apache.flink.runtime.taskexecutor.rpc.RpcResultPartitionConsumableNotifier) PartitionProducerStateChecker(org.apache.flink.runtime.io.network.netty.PartitionProducerStateChecker) ResultPartitionConsumableNotifier(org.apache.flink.runtime.io.network.partition.ResultPartitionConsumableNotifier) RpcResultPartitionConsumableNotifier(org.apache.flink.runtime.taskexecutor.rpc.RpcResultPartitionConsumableNotifier)

Aggregations

BlobLibraryCacheManager (org.apache.flink.runtime.execution.librarycache.BlobLibraryCacheManager)7 BlobServer (org.apache.flink.runtime.blob.BlobServer)3 FiniteDuration (scala.concurrent.duration.FiniteDuration)3 ActorRef (akka.actor.ActorRef)2 Props (akka.actor.Props)2 UUID (java.util.UUID)2 ScheduledExecutorService (java.util.concurrent.ScheduledExecutorService)2 Configuration (org.apache.flink.configuration.Configuration)2 CheckpointRecoveryFactory (org.apache.flink.runtime.checkpoint.CheckpointRecoveryFactory)2 LibraryCacheManager (org.apache.flink.runtime.execution.librarycache.LibraryCacheManager)2 InstanceManager (org.apache.flink.runtime.instance.InstanceManager)2 Scheduler (org.apache.flink.runtime.jobmanager.scheduler.Scheduler)2 TestingLeaderElectionService (org.apache.flink.runtime.leaderelection.TestingLeaderElectionService)2 Test (org.junit.Test)2 Deadline (scala.concurrent.duration.Deadline)2 Identify (akka.actor.Identify)1 IOException (java.io.IOException)1 InetSocketAddress (java.net.InetSocketAddress)1 ArrayList (java.util.ArrayList)1 ExecutorService (java.util.concurrent.ExecutorService)1