use of org.apache.flink.runtime.execution.librarycache.BlobLibraryCacheManager in project flink by apache.
the class JobManagerHARecoveryTest method testJobRecoveryWhenLosingLeadership.
/**
* Tests that the persisted job is not removed from the SubmittedJobGraphStore if the JobManager
* loses its leadership. Furthermore, it tests that the job manager can recover the job from
* the SubmittedJobGraphStore and checkpoint state is recovered as well.
*/
@Test
public void testJobRecoveryWhenLosingLeadership() throws Exception {
FiniteDuration timeout = new FiniteDuration(30, TimeUnit.SECONDS);
FiniteDuration jobRecoveryTimeout = new FiniteDuration(3, TimeUnit.SECONDS);
Deadline deadline = new FiniteDuration(2, TimeUnit.MINUTES).fromNow();
Configuration flinkConfiguration = new Configuration();
UUID leaderSessionID = UUID.randomUUID();
UUID newLeaderSessionID = UUID.randomUUID();
int slots = 2;
ActorRef archive = null;
ActorRef jobManager = null;
ActorRef taskManager = null;
flinkConfiguration.setString(HighAvailabilityOptions.HA_MODE, "zookeeper");
flinkConfiguration.setString(HighAvailabilityOptions.HA_STORAGE_PATH, temporaryFolder.newFolder().toString());
flinkConfiguration.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, slots);
try {
Scheduler scheduler = new Scheduler(TestingUtils.defaultExecutionContext());
MySubmittedJobGraphStore mySubmittedJobGraphStore = new MySubmittedJobGraphStore();
MyCheckpointStore checkpointStore = new MyCheckpointStore();
CheckpointIDCounter checkpointCounter = new StandaloneCheckpointIDCounter();
CheckpointRecoveryFactory checkpointStateFactory = new MyCheckpointRecoveryFactory(checkpointStore, checkpointCounter);
TestingLeaderElectionService myLeaderElectionService = new TestingLeaderElectionService();
TestingLeaderRetrievalService myLeaderRetrievalService = new TestingLeaderRetrievalService();
InstanceManager instanceManager = new InstanceManager();
instanceManager.addInstanceListener(scheduler);
archive = system.actorOf(Props.create(MemoryArchivist.class, 10));
Props jobManagerProps = Props.create(TestingJobManager.class, flinkConfiguration, TestingUtils.defaultExecutor(), TestingUtils.defaultExecutor(), instanceManager, scheduler, new BlobLibraryCacheManager(new BlobServer(flinkConfiguration), 3600000), archive, new FixedDelayRestartStrategy.FixedDelayRestartStrategyFactory(Int.MaxValue(), 100), timeout, myLeaderElectionService, mySubmittedJobGraphStore, checkpointStateFactory, jobRecoveryTimeout, Option.apply(null));
jobManager = system.actorOf(jobManagerProps);
ActorGateway gateway = new AkkaActorGateway(jobManager, leaderSessionID);
taskManager = TaskManager.startTaskManagerComponentsAndActor(flinkConfiguration, ResourceID.generate(), system, "localhost", Option.apply("taskmanager"), Option.apply((LeaderRetrievalService) myLeaderRetrievalService), true, TestingTaskManager.class);
ActorGateway tmGateway = new AkkaActorGateway(taskManager, leaderSessionID);
Future<Object> tmAlive = tmGateway.ask(TestingMessages.getAlive(), deadline.timeLeft());
Await.ready(tmAlive, deadline.timeLeft());
JobVertex sourceJobVertex = new JobVertex("Source");
sourceJobVertex.setInvokableClass(BlockingStatefulInvokable.class);
sourceJobVertex.setParallelism(slots);
JobGraph jobGraph = new JobGraph("TestingJob", sourceJobVertex);
List<JobVertexID> vertexId = Collections.singletonList(sourceJobVertex.getID());
jobGraph.setSnapshotSettings(new JobSnapshottingSettings(vertexId, vertexId, vertexId, 100, 10 * 60 * 1000, 0, 1, ExternalizedCheckpointSettings.none(), null, true));
BlockingStatefulInvokable.initializeStaticHelpers(slots);
Future<Object> isLeader = gateway.ask(TestingJobManagerMessages.getNotifyWhenLeader(), deadline.timeLeft());
Future<Object> isConnectedToJobManager = tmGateway.ask(new TestingTaskManagerMessages.NotifyWhenRegisteredAtJobManager(jobManager), deadline.timeLeft());
// tell jobManager that he's the leader
myLeaderElectionService.isLeader(leaderSessionID);
// tell taskManager who's the leader
myLeaderRetrievalService.notifyListener(gateway.path(), leaderSessionID);
Await.ready(isLeader, deadline.timeLeft());
Await.ready(isConnectedToJobManager, deadline.timeLeft());
// submit blocking job
Future<Object> jobSubmitted = gateway.ask(new JobManagerMessages.SubmitJob(jobGraph, ListeningBehaviour.DETACHED), deadline.timeLeft());
Await.ready(jobSubmitted, deadline.timeLeft());
// Wait for some checkpoints to complete
BlockingStatefulInvokable.awaitCompletedCheckpoints();
Future<Object> jobRemoved = gateway.ask(new TestingJobManagerMessages.NotifyWhenJobRemoved(jobGraph.getJobID()), deadline.timeLeft());
// Revoke leadership
myLeaderElectionService.notLeader();
// check that the job gets removed from the JobManager
Await.ready(jobRemoved, deadline.timeLeft());
// but stays in the submitted job graph store
assertTrue(mySubmittedJobGraphStore.contains(jobGraph.getJobID()));
Future<Object> jobRunning = gateway.ask(new TestingJobManagerMessages.NotifyWhenJobStatus(jobGraph.getJobID(), JobStatus.RUNNING), deadline.timeLeft());
// Make JobManager again a leader
myLeaderElectionService.isLeader(newLeaderSessionID);
// tell the TaskManager about it
myLeaderRetrievalService.notifyListener(gateway.path(), newLeaderSessionID);
// wait that the job is recovered and reaches state RUNNING
Await.ready(jobRunning, deadline.timeLeft());
Future<Object> jobFinished = gateway.ask(new TestingJobManagerMessages.NotifyWhenJobRemoved(jobGraph.getJobID()), deadline.timeLeft());
BlockingInvokable.unblock();
// wait til the job has finished
Await.ready(jobFinished, deadline.timeLeft());
// check that the job has been removed from the submitted job graph store
assertFalse(mySubmittedJobGraphStore.contains(jobGraph.getJobID()));
// Check that state has been recovered
long[] recoveredStates = BlockingStatefulInvokable.getRecoveredStates();
for (long state : recoveredStates) {
boolean isExpected = state >= BlockingStatefulInvokable.NUM_CHECKPOINTS_TO_COMPLETE;
assertTrue("Did not recover checkpoint state correctly, expecting >= " + BlockingStatefulInvokable.NUM_CHECKPOINTS_TO_COMPLETE + ", but state was " + state, isExpected);
}
} finally {
if (archive != null) {
archive.tell(PoisonPill.getInstance(), ActorRef.noSender());
}
if (jobManager != null) {
jobManager.tell(PoisonPill.getInstance(), ActorRef.noSender());
}
if (taskManager != null) {
taskManager.tell(PoisonPill.getInstance(), ActorRef.noSender());
}
}
}
use of org.apache.flink.runtime.execution.librarycache.BlobLibraryCacheManager in project flink by apache.
the class JobManagerLeaderElectionTest method createJobManagerProps.
private Props createJobManagerProps(Configuration configuration) throws Exception {
LeaderElectionService leaderElectionService;
if (HighAvailabilityMode.fromConfig(configuration) == HighAvailabilityMode.NONE) {
leaderElectionService = new StandaloneLeaderElectionService();
} else {
CuratorFramework client = ZooKeeperUtils.startCuratorFramework(configuration);
leaderElectionService = ZooKeeperUtils.createLeaderElectionService(client, configuration);
}
// We don't need recovery in this test
SubmittedJobGraphStore submittedJobGraphStore = new StandaloneSubmittedJobGraphStore();
CheckpointRecoveryFactory checkpointRecoveryFactory = new StandaloneCheckpointRecoveryFactory();
return Props.create(TestingJobManager.class, configuration, TestingUtils.defaultExecutor(), TestingUtils.defaultExecutor(), new InstanceManager(), new Scheduler(TestingUtils.defaultExecutionContext()), new BlobLibraryCacheManager(new BlobServer(configuration), 10L), ActorRef.noSender(), new NoRestartStrategy.NoRestartStrategyFactory(), AkkaUtils.getDefaultTimeoutAsFiniteDuration(), leaderElectionService, submittedJobGraphStore, checkpointRecoveryFactory, AkkaUtils.getDefaultTimeoutAsFiniteDuration(), Option.apply(null));
}
use of org.apache.flink.runtime.execution.librarycache.BlobLibraryCacheManager in project flink by apache.
the class JobManagerSharedServices method fromConfiguration.
// ------------------------------------------------------------------------
// Creating the components from a configuration
// ------------------------------------------------------------------------
public static JobManagerSharedServices fromConfiguration(Configuration config, BlobServer blobServer, FatalErrorHandler fatalErrorHandler) throws Exception {
checkNotNull(config);
checkNotNull(blobServer);
final String classLoaderResolveOrder = config.getString(CoreOptions.CLASSLOADER_RESOLVE_ORDER);
final String[] alwaysParentFirstLoaderPatterns = CoreOptions.getParentFirstLoaderPatterns(config);
final boolean failOnJvmMetaspaceOomError = config.getBoolean(CoreOptions.FAIL_ON_USER_CLASS_LOADING_METASPACE_OOM);
final boolean checkClassLoaderLeak = config.getBoolean(CoreOptions.CHECK_LEAKED_CLASSLOADER);
final BlobLibraryCacheManager libraryCacheManager = new BlobLibraryCacheManager(blobServer, BlobLibraryCacheManager.defaultClassLoaderFactory(FlinkUserCodeClassLoaders.ResolveOrder.fromString(classLoaderResolveOrder), alwaysParentFirstLoaderPatterns, failOnJvmMetaspaceOomError ? fatalErrorHandler : null, checkClassLoaderLeak));
final int numberCPUCores = Hardware.getNumberCPUCores();
final int jobManagerFuturePoolSize = config.getInteger(JobManagerOptions.JOB_MANAGER_FUTURE_POOL_SIZE, numberCPUCores);
final ScheduledExecutorService futureExecutor = Executors.newScheduledThreadPool(jobManagerFuturePoolSize, new ExecutorThreadFactory("jobmanager-future"));
final int jobManagerIoPoolSize = config.getInteger(JobManagerOptions.JOB_MANAGER_IO_POOL_SIZE, numberCPUCores);
final ExecutorService ioExecutor = Executors.newFixedThreadPool(jobManagerIoPoolSize, new ExecutorThreadFactory("jobmanager-io"));
final ShuffleMasterContext shuffleMasterContext = new ShuffleMasterContextImpl(config, fatalErrorHandler);
final ShuffleMaster<?> shuffleMaster = ShuffleServiceLoader.loadShuffleServiceFactory(config).createShuffleMaster(shuffleMasterContext);
shuffleMaster.start();
return new JobManagerSharedServices(futureExecutor, ioExecutor, libraryCacheManager, shuffleMaster, blobServer);
}
use of org.apache.flink.runtime.execution.librarycache.BlobLibraryCacheManager in project flink by apache.
the class JobManagerServices method fromConfiguration.
// ------------------------------------------------------------------------
// Creating the components from a configuration
// ------------------------------------------------------------------------
public static JobManagerServices fromConfiguration(Configuration config, HighAvailabilityServices haServices) throws Exception {
final BlobServer blobServer = new BlobServer(config, haServices);
final long cleanupInterval = config.getLong(ConfigConstants.LIBRARY_CACHE_MANAGER_CLEANUP_INTERVAL, ConfigConstants.DEFAULT_LIBRARY_CACHE_MANAGER_CLEANUP_INTERVAL) * 1000;
final BlobLibraryCacheManager libraryCacheManager = new BlobLibraryCacheManager(blobServer, cleanupInterval);
final FiniteDuration timeout;
try {
timeout = AkkaUtils.getTimeout(config);
} catch (NumberFormatException e) {
throw new IllegalConfigurationException(AkkaUtils.formatDurationParingErrorMessage());
}
final ScheduledExecutorService futureExecutor = Executors.newScheduledThreadPool(Hardware.getNumberCPUCores(), new ExecutorThreadFactory("jobmanager-future"));
return new JobManagerServices(futureExecutor, libraryCacheManager, RestartStrategyFactory.createRestartStrategyFactory(config), Time.of(timeout.length(), timeout.unit()));
}
use of org.apache.flink.runtime.execution.librarycache.BlobLibraryCacheManager in project flink by apache.
the class TaskExecutor method associateWithJobManager.
private JobManagerConnection associateWithJobManager(JobID jobID, ResourceID resourceID, JobMasterGateway jobMasterGateway, UUID jobManagerLeaderId, int blobPort) {
Preconditions.checkNotNull(jobID);
Preconditions.checkNotNull(resourceID);
Preconditions.checkNotNull(jobManagerLeaderId);
Preconditions.checkNotNull(jobMasterGateway);
Preconditions.checkArgument(blobPort > 0 || blobPort < MAX_BLOB_PORT, "Blob server port is out of range.");
TaskManagerActions taskManagerActions = new TaskManagerActionsImpl(jobManagerLeaderId, jobMasterGateway);
CheckpointResponder checkpointResponder = new RpcCheckpointResponder(jobMasterGateway);
InetSocketAddress blobServerAddress = new InetSocketAddress(jobMasterGateway.getHostname(), blobPort);
final LibraryCacheManager libraryCacheManager;
try {
final BlobCache blobCache = new BlobCache(blobServerAddress, taskManagerConfiguration.getConfiguration(), haServices);
libraryCacheManager = new BlobLibraryCacheManager(blobCache, taskManagerConfiguration.getCleanupInterval());
} catch (IOException e) {
// Can't pass the IOException up - we need a RuntimeException anyway
// two levels up where this is run asynchronously. Also, we don't
// know whether this is caught in the thread running this method.
final String message = "Could not create BLOB cache or library cache.";
log.error(message, e);
throw new RuntimeException(message, e);
}
ResultPartitionConsumableNotifier resultPartitionConsumableNotifier = new RpcResultPartitionConsumableNotifier(jobManagerLeaderId, jobMasterGateway, getRpcService().getExecutor(), taskManagerConfiguration.getTimeout());
PartitionProducerStateChecker partitionStateChecker = new RpcPartitionStateChecker(jobManagerLeaderId, jobMasterGateway);
return new JobManagerConnection(jobID, resourceID, jobMasterGateway, jobManagerLeaderId, taskManagerActions, checkpointResponder, libraryCacheManager, resultPartitionConsumableNotifier, partitionStateChecker);
}
Aggregations