Use of org.apache.flink.runtime.execution.librarycache.BlobLibraryCacheManager in the Apache Flink project.
From class JobManagerHARecoveryTest, method testFailingJobRecovery.
/**
 * Verifies that a job whose recovery fails does not prevent the remaining jobs from being
 * recovered.
 *
 * <p>The mocked job graph store reports two job ids: recovering the first one throws, recovering
 * the second one returns a mocked job graph. Once the job manager is granted leadership, the
 * collection of recovered jobs is expected to contain exactly the second job.
 */
@Test
public void testFailingJobRecovery() throws Exception {
    final FiniteDuration timeout = new FiniteDuration(10, TimeUnit.SECONDS);
    final FiniteDuration jobRecoveryTimeout = new FiniteDuration(0, TimeUnit.SECONDS);
    final Deadline deadline = new FiniteDuration(1, TimeUnit.MINUTES).fromNow();

    final Configuration flinkConfiguration = new Configuration();
    final UUID leaderSessionID = UUID.randomUUID();
    final JobID failingJobId = new JobID();
    final JobID recoverableJobId = new JobID();

    // switch HA mode to zookeeper so that job recovery is attempted
    flinkConfiguration.setString(HighAvailabilityOptions.HA_MODE, "zookeeper");

    // stays null if the setup below fails before the actor has been created
    ActorRef jobManager = null;

    try {
        final SubmittedJobGraphStore jobGraphStore = mock(SubmittedJobGraphStore.class);
        final SubmittedJobGraph recoverableJobGraph = mock(SubmittedJobGraph.class);

        when(recoverableJobGraph.getJobId()).thenReturn(recoverableJobId);
        when(jobGraphStore.getJobIds())
                .thenReturn(Arrays.asList(failingJobId, recoverableJobId));

        // recovery of the first job fails ...
        when(jobGraphStore.recoverJobGraph(eq(failingJobId)))
                .thenThrow(new Exception("Test exception"));
        // ... while recovery of the second job succeeds
        when(jobGraphStore.recoverJobGraph(eq(recoverableJobId)))
                .thenReturn(recoverableJobGraph);

        final TestingLeaderElectionService leaderElectionService =
                new TestingLeaderElectionService();

        // handed to the job manager under test and inspected by the final assertion
        final Collection<JobID> recoveredJobs = new ArrayList<>(2);

        final Props baseProps =
                Props.create(
                        TestingFailingHAJobManager.class,
                        flinkConfiguration,
                        TestingUtils.defaultExecutor(),
                        TestingUtils.defaultExecutor(),
                        mock(InstanceManager.class),
                        mock(Scheduler.class),
                        new BlobLibraryCacheManager(mock(BlobService.class), 1 << 20),
                        ActorRef.noSender(),
                        new FixedDelayRestartStrategy.FixedDelayRestartStrategyFactory(
                                Int.MaxValue(), 100),
                        timeout,
                        leaderElectionService,
                        jobGraphStore,
                        mock(CheckpointRecoveryFactory.class),
                        jobRecoveryTimeout,
                        Option.<MetricRegistry>apply(null),
                        recoveredJobs);
        final Props jobManagerProps = baseProps.withDispatcher(CallingThreadDispatcher.Id());

        jobManager = system.actorOf(jobManagerProps);

        // wait until the actor has answered an Identify request
        final Future<Object> identifyResponse =
                Patterns.ask(jobManager, new Identify(42), deadline.timeLeft().toMillis());
        Await.ready(identifyResponse, deadline.timeLeft());

        // granting leadership triggers the recovery of all jobs
        leaderElectionService.isLeader(leaderSessionID);

        // only the second job must have been recovered
        assertThat(recoveredJobs, containsInAnyOrder(recoverableJobId));
    } finally {
        TestingUtils.stopActor(jobManager);
    }
}
Use of org.apache.flink.runtime.execution.librarycache.BlobLibraryCacheManager in the Apache Flink project.
From class TaskManagerServices, method fromConfiguration.
// --------------------------------------------------------------------------------------------
// Static factory methods for task manager services
// --------------------------------------------------------------------------------------------
/**
 * Builds the {@link TaskManagerServices} bundle from the given configuration.
 *
 * <p>The temp directories are checked first. The shuffle environment and the KvState service
 * are started as part of this call; the port returned by the shuffle environment is used as
 * data port unless a positive external data port is configured.
 *
 * @param taskManagerServicesConfiguration task manager configuration
 * @param permanentBlobService blob service handed to the library cache manager
 * @param taskManagerMetricGroup metric group of the task manager
 * @param ioExecutor executor for async IO operations
 * @param fatalErrorHandler passed to the class loader factory only if
 *     {@link CoreOptions#FAIL_ON_USER_CLASS_LOADING_METASPACE_OOM} is enabled; otherwise
 *     {@code null} is passed instead
 * @param workingDirectory the working directory of the process; supplies the slot allocation
 *     snapshot directory when local recovery is enabled
 * @return task manager components
 * @throws Exception if a pre-start check or the creation/start of one of the services fails
 */
public static TaskManagerServices fromConfiguration(
        TaskManagerServicesConfiguration taskManagerServicesConfiguration,
        PermanentBlobService permanentBlobService,
        MetricGroup taskManagerMetricGroup,
        ExecutorService ioExecutor,
        FatalErrorHandler fatalErrorHandler,
        WorkingDirectory workingDirectory)
        throws Exception {

    // pre-start checks
    checkTempDirs(taskManagerServicesConfiguration.getTmpDirPaths());

    final TaskEventDispatcher taskEventDispatcher = new TaskEventDispatcher();

    // start the I/O manager, it will create some temp directories.
    final IOManager ioManager =
            new IOManagerAsync(taskManagerServicesConfiguration.getTmpDirPaths());

    final ShuffleEnvironment<?, ?> shuffleEnvironment =
            createShuffleEnvironment(
                    taskManagerServicesConfiguration,
                    taskEventDispatcher,
                    taskManagerMetricGroup,
                    ioExecutor);
    final int boundDataPort = shuffleEnvironment.start();

    final KvStateService kvStateService =
            KvStateService.fromConfiguration(taskManagerServicesConfiguration);
    kvStateService.start();

    // fall back to the port returned by the shuffle environment iff the external data port
    // is not explicitly defined
    final int dataPort;
    if (taskManagerServicesConfiguration.getExternalDataPort() > 0) {
        dataPort = taskManagerServicesConfiguration.getExternalDataPort();
    } else {
        dataPort = boundDataPort;
    }

    final UnresolvedTaskManagerLocation unresolvedTaskManagerLocation =
            new UnresolvedTaskManagerLocation(
                    taskManagerServicesConfiguration.getResourceID(),
                    taskManagerServicesConfiguration.getExternalAddress(),
                    dataPort);

    final BroadcastVariableManager broadcastVariableManager = new BroadcastVariableManager();

    final TaskSlotTable<Task> taskSlotTable =
            createTaskSlotTable(
                    taskManagerServicesConfiguration.getNumberOfSlots(),
                    taskManagerServicesConfiguration.getTaskExecutorResourceSpec(),
                    taskManagerServicesConfiguration.getTimerServiceShutdownTimeout(),
                    taskManagerServicesConfiguration.getPageSize(),
                    ioExecutor);

    final JobTable jobTable = DefaultJobTable.create();

    final JobLeaderService jobLeaderService =
            new DefaultJobLeaderService(
                    unresolvedTaskManagerLocation,
                    taskManagerServicesConfiguration.getRetryingRegistrationConfiguration());

    final TaskExecutorLocalStateStoresManager localStateStoresManager =
            new TaskExecutorLocalStateStoresManager(
                    taskManagerServicesConfiguration.isLocalRecoveryEnabled(),
                    taskManagerServicesConfiguration.getLocalRecoveryStateDirectories(),
                    ioExecutor);

    final TaskExecutorStateChangelogStoragesManager changelogStoragesManager =
            new TaskExecutorStateChangelogStoragesManager();

    final boolean failOnMetaspaceOom =
            taskManagerServicesConfiguration
                    .getConfiguration()
                    .getBoolean(CoreOptions.FAIL_ON_USER_CLASS_LOADING_METASPACE_OOM);
    final boolean checkClassLoaderLeak =
            taskManagerServicesConfiguration
                    .getConfiguration()
                    .getBoolean(CoreOptions.CHECK_LEAKED_CLASSLOADER);

    // the handler is only handed on when failing on metaspace OOMs is switched on
    final FatalErrorHandler metaspaceOomHandler = failOnMetaspaceOom ? fatalErrorHandler : null;

    final LibraryCacheManager libraryCacheManager =
            new BlobLibraryCacheManager(
                    permanentBlobService,
                    BlobLibraryCacheManager.defaultClassLoaderFactory(
                            taskManagerServicesConfiguration.getClassLoaderResolveOrder(),
                            taskManagerServicesConfiguration.getAlwaysParentFirstLoaderPatterns(),
                            metaspaceOomHandler,
                            checkClassLoaderLeak));

    // slot allocation snapshots are only persisted to the working directory when local
    // recovery is enabled; otherwise the no-op implementation is used
    final SlotAllocationSnapshotPersistenceService slotAllocationSnapshotPersistenceService =
            taskManagerServicesConfiguration.isLocalRecoveryEnabled()
                    ? new FileSlotAllocationSnapshotPersistenceService(
                            workingDirectory.getSlotAllocationSnapshotDirectory())
                    : NoOpSlotAllocationSnapshotPersistenceService.INSTANCE;

    return new TaskManagerServices(
            unresolvedTaskManagerLocation,
            taskManagerServicesConfiguration.getManagedMemorySize().getBytes(),
            ioManager,
            shuffleEnvironment,
            kvStateService,
            broadcastVariableManager,
            taskSlotTable,
            jobTable,
            jobLeaderService,
            localStateStoresManager,
            changelogStoragesManager,
            taskEventDispatcher,
            ioExecutor,
            libraryCacheManager,
            slotAllocationSnapshotPersistenceService);
}
Aggregations