Search in sources :

Example 6 with BlobLibraryCacheManager

use of org.apache.flink.runtime.execution.librarycache.BlobLibraryCacheManager in project flink by apache.

In the class JobManagerHARecoveryTest, the method testFailingJobRecovery.

/**
 * Verifies that a job whose recovery fails does not prevent the remaining jobs from being
 * recovered.
 *
 * <p>The mocked job graph store reports two job ids: recovering the first one throws, while
 * recovering the second one returns a mocked job graph. Once the job manager is made the
 * leader, only the second job id is expected in the collection that was handed to the job
 * manager.
 */
@Test
public void testFailingJobRecovery() throws Exception {
    // Use ZooKeeper HA mode so that the job manager attempts to recover jobs.
    final Configuration haConfig = new Configuration();
    haConfig.setString(HighAvailabilityOptions.HA_MODE, "zookeeper");

    final JobID failingJobId = new JobID();
    final JobID recoverableJobId = new JobID();
    final UUID sessionId = UUID.randomUUID();

    final FiniteDuration askTimeout = new FiniteDuration(10, TimeUnit.SECONDS);
    final FiniteDuration recoveryTimeout = new FiniteDuration(0, TimeUnit.SECONDS);
    final Deadline testDeadline = new FiniteDuration(1, TimeUnit.MINUTES).fromNow();

    ActorRef jobManagerActor = null;
    try {
        final SubmittedJobGraph recoverableGraph = mock(SubmittedJobGraph.class);
        when(recoverableGraph.getJobId()).thenReturn(recoverableJobId);

        final SubmittedJobGraphStore graphStore = mock(SubmittedJobGraphStore.class);
        when(graphStore.getJobIds()).thenReturn(Arrays.asList(failingJobId, recoverableJobId));
        // Recovery of the first job throws ...
        when(graphStore.recoverJobGraph(eq(failingJobId)))
                .thenThrow(new Exception("Test exception"));
        // ... while recovery of the second job yields the mocked graph.
        when(graphStore.recoverJobGraph(eq(recoverableJobId))).thenReturn(recoverableGraph);

        final TestingLeaderElectionService leaderElection = new TestingLeaderElectionService();
        final Collection<JobID> recoveredJobs = new ArrayList<>(2);
        final FixedDelayRestartStrategy.FixedDelayRestartStrategyFactory restartStrategyFactory =
                new FixedDelayRestartStrategy.FixedDelayRestartStrategyFactory(Int.MaxValue(), 100);

        final Props props =
                Props.create(
                                TestingFailingHAJobManager.class,
                                haConfig,
                                TestingUtils.defaultExecutor(),
                                TestingUtils.defaultExecutor(),
                                mock(InstanceManager.class),
                                mock(Scheduler.class),
                                new BlobLibraryCacheManager(mock(BlobService.class), 1 << 20),
                                ActorRef.noSender(),
                                restartStrategyFactory,
                                askTimeout,
                                leaderElection,
                                graphStore,
                                mock(CheckpointRecoveryFactory.class),
                                recoveryTimeout,
                                Option.<MetricRegistry>apply(null),
                                recoveredJobs)
                        .withDispatcher(CallingThreadDispatcher.Id());
        jobManagerActor = system.actorOf(props);

        // Wait until the actor answers an Identify message before granting leadership.
        final Future<Object> identified =
                Patterns.ask(jobManagerActor, new Identify(42), testDeadline.timeLeft().toMillis());
        Await.ready(identified, testDeadline.timeLeft());

        // Granting leadership triggers the recovery of all stored jobs.
        leaderElection.isLeader(sessionId);

        // Only the second job must have been recovered.
        assertThat(recoveredJobs, containsInAnyOrder(recoverableJobId));
    } finally {
        TestingUtils.stopActor(jobManagerActor);
    }
}
Also used : BlobLibraryCacheManager(org.apache.flink.runtime.execution.librarycache.BlobLibraryCacheManager) TestingLeaderElectionService(org.apache.flink.runtime.leaderelection.TestingLeaderElectionService) Configuration(org.apache.flink.configuration.Configuration) ActorRef(akka.actor.ActorRef) Deadline(scala.concurrent.duration.Deadline) MetricRegistry(org.apache.flink.runtime.metrics.MetricRegistry) ArrayList(java.util.ArrayList) FiniteDuration(scala.concurrent.duration.FiniteDuration) Props(akka.actor.Props) Identify(akka.actor.Identify) BlobService(org.apache.flink.runtime.blob.BlobService) UUID(java.util.UUID) JobID(org.apache.flink.api.common.JobID) Test(org.junit.Test)

Example 7 with BlobLibraryCacheManager

use of org.apache.flink.runtime.execution.librarycache.BlobLibraryCacheManager in project flink by apache.

In the class TaskManagerServices, the method fromConfiguration.

// --------------------------------------------------------------------------------------------
// Static factory methods for task manager services
// --------------------------------------------------------------------------------------------
/**
 * Builds the {@link TaskManagerServices} from the supplied configuration.
 *
 * <p>The temp directories are checked first; then the I/O manager is created, the shuffle
 * environment and the KvState service are created and started, and finally the remaining
 * services are instantiated and passed to the {@code TaskManagerServices} constructor.
 *
 * @param taskManagerServicesConfiguration configuration the individual services are derived from
 * @param permanentBlobService blob service handed to the library cache manager
 * @param taskManagerMetricGroup metric group handed to the shuffle environment factory
 * @param ioExecutor executor for async IO operations; passed to the shuffle environment, the
 *     task slot table, the local state stores manager and the returned services
 * @param fatalErrorHandler handler given to the class loader factory, but only when failing on
 *     user class loading metaspace OOMs is enabled in the configuration
 * @param workingDirectory the working directory of the process; provides the slot allocation
 *     snapshot directory when local recovery is enabled
 * @return the assembled task manager services
 * @throws Exception if setting up any of the services fails
 */
public static TaskManagerServices fromConfiguration(
        TaskManagerServicesConfiguration taskManagerServicesConfiguration,
        PermanentBlobService permanentBlobService,
        MetricGroup taskManagerMetricGroup,
        ExecutorService ioExecutor,
        FatalErrorHandler fatalErrorHandler,
        WorkingDirectory workingDirectory)
        throws Exception {
    // Short alias to keep the wiring below readable.
    final TaskManagerServicesConfiguration config = taskManagerServicesConfiguration;

    // Pre-start checks on the configured temp directories.
    checkTempDirs(config.getTmpDirPaths());

    final TaskEventDispatcher eventDispatcher = new TaskEventDispatcher();

    // Start the I/O manager; it will create some temp directories.
    final IOManager asyncIoManager = new IOManagerAsync(config.getTmpDirPaths());

    final ShuffleEnvironment<?, ?> shuffleEnv =
            createShuffleEnvironment(config, eventDispatcher, taskManagerMetricGroup, ioExecutor);
    final int boundDataPort = shuffleEnv.start();

    final KvStateService kvState = KvStateService.fromConfiguration(config);
    kvState.start();

    // Use the port returned by the shuffle environment only if the external data port is not
    // explicitly defined (i.e. not greater than zero).
    final int dataPort;
    if (config.getExternalDataPort() > 0) {
        dataPort = config.getExternalDataPort();
    } else {
        dataPort = boundDataPort;
    }
    final UnresolvedTaskManagerLocation taskManagerLocation =
            new UnresolvedTaskManagerLocation(
                    config.getResourceID(), config.getExternalAddress(), dataPort);

    final BroadcastVariableManager broadcastVariables = new BroadcastVariableManager();

    final TaskSlotTable<Task> slotTable =
            createTaskSlotTable(
                    config.getNumberOfSlots(),
                    config.getTaskExecutorResourceSpec(),
                    config.getTimerServiceShutdownTimeout(),
                    config.getPageSize(),
                    ioExecutor);

    final JobTable jobs = DefaultJobTable.create();

    final JobLeaderService leaderService =
            new DefaultJobLeaderService(
                    taskManagerLocation, config.getRetryingRegistrationConfiguration());

    final TaskExecutorLocalStateStoresManager localStateStores =
            new TaskExecutorLocalStateStoresManager(
                    config.isLocalRecoveryEnabled(),
                    config.getLocalRecoveryStateDirectories(),
                    ioExecutor);

    final TaskExecutorStateChangelogStoragesManager changelogStorages =
            new TaskExecutorStateChangelogStoragesManager();

    final boolean failOnMetaspaceOom =
            config.getConfiguration()
                    .getBoolean(CoreOptions.FAIL_ON_USER_CLASS_LOADING_METASPACE_OOM);
    final boolean detectClassLoaderLeaks =
            config.getConfiguration().getBoolean(CoreOptions.CHECK_LEAKED_CLASSLOADER);

    // The fatal error handler is only wired in when metaspace OOMs should be treated as fatal.
    final FatalErrorHandler classLoadingErrorHandler =
            failOnMetaspaceOom ? fatalErrorHandler : null;
    final LibraryCacheManager libraryCache =
            new BlobLibraryCacheManager(
                    permanentBlobService,
                    BlobLibraryCacheManager.defaultClassLoaderFactory(
                            config.getClassLoaderResolveOrder(),
                            config.getAlwaysParentFirstLoaderPatterns(),
                            classLoadingErrorHandler,
                            detectClassLoaderLeaks));

    // Slot allocation snapshots are persisted to files only when local recovery is enabled.
    final SlotAllocationSnapshotPersistenceService snapshotPersistence =
            config.isLocalRecoveryEnabled()
                    ? new FileSlotAllocationSnapshotPersistenceService(
                            workingDirectory.getSlotAllocationSnapshotDirectory())
                    : NoOpSlotAllocationSnapshotPersistenceService.INSTANCE;

    return new TaskManagerServices(
            taskManagerLocation,
            config.getManagedMemorySize().getBytes(),
            asyncIoManager,
            shuffleEnv,
            kvState,
            broadcastVariables,
            slotTable,
            jobs,
            leaderService,
            localStateStores,
            changelogStorages,
            eventDispatcher,
            ioExecutor,
            libraryCache,
            snapshotPersistence);
}
Also used : BlobLibraryCacheManager(org.apache.flink.runtime.execution.librarycache.BlobLibraryCacheManager) Task(org.apache.flink.runtime.taskmanager.Task) IOManager(org.apache.flink.runtime.io.disk.iomanager.IOManager) UnresolvedTaskManagerLocation(org.apache.flink.runtime.taskmanager.UnresolvedTaskManagerLocation) NoOpSlotAllocationSnapshotPersistenceService(org.apache.flink.runtime.taskexecutor.slot.NoOpSlotAllocationSnapshotPersistenceService) SlotAllocationSnapshotPersistenceService(org.apache.flink.runtime.taskexecutor.slot.SlotAllocationSnapshotPersistenceService) FileSlotAllocationSnapshotPersistenceService(org.apache.flink.runtime.taskexecutor.slot.FileSlotAllocationSnapshotPersistenceService) FileSlotAllocationSnapshotPersistenceService(org.apache.flink.runtime.taskexecutor.slot.FileSlotAllocationSnapshotPersistenceService) TaskExecutorLocalStateStoresManager(org.apache.flink.runtime.state.TaskExecutorLocalStateStoresManager) BlobLibraryCacheManager(org.apache.flink.runtime.execution.librarycache.BlobLibraryCacheManager) LibraryCacheManager(org.apache.flink.runtime.execution.librarycache.LibraryCacheManager) IOManagerAsync(org.apache.flink.runtime.io.disk.iomanager.IOManagerAsync) BroadcastVariableManager(org.apache.flink.runtime.broadcast.BroadcastVariableManager) TaskEventDispatcher(org.apache.flink.runtime.io.network.TaskEventDispatcher) TaskExecutorStateChangelogStoragesManager(org.apache.flink.runtime.state.TaskExecutorStateChangelogStoragesManager)

Aggregations

BlobLibraryCacheManager (org.apache.flink.runtime.execution.librarycache.BlobLibraryCacheManager)7 BlobServer (org.apache.flink.runtime.blob.BlobServer)3 FiniteDuration (scala.concurrent.duration.FiniteDuration)3 ActorRef (akka.actor.ActorRef)2 Props (akka.actor.Props)2 UUID (java.util.UUID)2 ScheduledExecutorService (java.util.concurrent.ScheduledExecutorService)2 Configuration (org.apache.flink.configuration.Configuration)2 CheckpointRecoveryFactory (org.apache.flink.runtime.checkpoint.CheckpointRecoveryFactory)2 LibraryCacheManager (org.apache.flink.runtime.execution.librarycache.LibraryCacheManager)2 InstanceManager (org.apache.flink.runtime.instance.InstanceManager)2 Scheduler (org.apache.flink.runtime.jobmanager.scheduler.Scheduler)2 TestingLeaderElectionService (org.apache.flink.runtime.leaderelection.TestingLeaderElectionService)2 Test (org.junit.Test)2 Deadline (scala.concurrent.duration.Deadline)2 Identify (akka.actor.Identify)1 IOException (java.io.IOException)1 InetSocketAddress (java.net.InetSocketAddress)1 ArrayList (java.util.ArrayList)1 ExecutorService (java.util.concurrent.ExecutorService)1