Use of org.apache.flink.runtime.blob.BlobServer in the Apache Flink project.
Class JobManagerHARecoveryTest, method testJobRecoveryWhenLosingLeadership.
/**
* Tests that the persisted job is not removed from the SubmittedJobGraphStore if the JobManager
* loses its leadership. Furthermore, it tests that the job manager can recover the job from
* the SubmittedJobGraphStore and checkpoint state is recovered as well.
*
* <p>Flow of the test: start a JobManager and a TaskManager actor, grant leadership, submit a
* blocking stateful job, wait for checkpoints, revoke leadership (job must leave the JobManager
* but stay in the store), grant leadership again under a new session id (job must reach
* RUNNING), unblock the job, and finally verify that the store is empty and that every
* recovered state value is at least {@code NUM_CHECKPOINTS_TO_COMPLETE}.
*
* @throws Exception if any step of the test fails or an await does not complete
*/
@Test
public void testJobRecoveryWhenLosingLeadership() throws Exception {
// 'timeout' and 'jobRecoveryTimeout' are handed to the JobManager props below;
// 'deadline' bounds every ask/await issued by this test.
FiniteDuration timeout = new FiniteDuration(30, TimeUnit.SECONDS);
FiniteDuration jobRecoveryTimeout = new FiniteDuration(3, TimeUnit.SECONDS);
Deadline deadline = new FiniteDuration(2, TimeUnit.MINUTES).fromNow();
Configuration flinkConfiguration = new Configuration();
// two distinct session ids: one for the first leadership, one for the re-grant
UUID leaderSessionID = UUID.randomUUID();
UUID newLeaderSessionID = UUID.randomUUID();
int slots = 2;
// actor refs are declared outside the try block so the finally block can stop them
ActorRef archive = null;
ActorRef jobManager = null;
ActorRef taskManager = null;
// HA mode "zookeeper" with a fresh temporary folder as HA storage path
flinkConfiguration.setString(HighAvailabilityOptions.HA_MODE, "zookeeper");
flinkConfiguration.setString(HighAvailabilityOptions.HA_STORAGE_PATH, temporaryFolder.newFolder().toString());
flinkConfiguration.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, slots);
try {
Scheduler scheduler = new Scheduler(TestingUtils.defaultExecutionContext());
// test-local store/factory/service implementations; the leader election and retrieval
// services are driven by hand further down (isLeader / notLeader / notifyListener)
MySubmittedJobGraphStore mySubmittedJobGraphStore = new MySubmittedJobGraphStore();
MyCheckpointStore checkpointStore = new MyCheckpointStore();
CheckpointIDCounter checkpointCounter = new StandaloneCheckpointIDCounter();
CheckpointRecoveryFactory checkpointStateFactory = new MyCheckpointRecoveryFactory(checkpointStore, checkpointCounter);
TestingLeaderElectionService myLeaderElectionService = new TestingLeaderElectionService();
TestingLeaderRetrievalService myLeaderRetrievalService = new TestingLeaderRetrievalService();
InstanceManager instanceManager = new InstanceManager();
instanceManager.addInstanceListener(scheduler);
archive = system.actorOf(Props.create(MemoryArchivist.class, 10));
// NOTE(review): the BlobServer constructed inline below is not referenced anywhere else in
// this method, so the test never shuts it down itself — confirm that the JobManager or the
// BlobLibraryCacheManager takes care of that.
Props jobManagerProps = Props.create(TestingJobManager.class, flinkConfiguration, TestingUtils.defaultExecutor(), TestingUtils.defaultExecutor(), instanceManager, scheduler, new BlobLibraryCacheManager(new BlobServer(flinkConfiguration), 3600000), archive, new FixedDelayRestartStrategy.FixedDelayRestartStrategyFactory(Int.MaxValue(), 100), timeout, myLeaderElectionService, mySubmittedJobGraphStore, checkpointStateFactory, jobRecoveryTimeout, Option.apply(null));
jobManager = system.actorOf(jobManagerProps);
ActorGateway gateway = new AkkaActorGateway(jobManager, leaderSessionID);
taskManager = TaskManager.startTaskManagerComponentsAndActor(flinkConfiguration, ResourceID.generate(), system, "localhost", Option.apply("taskmanager"), Option.apply((LeaderRetrievalService) myLeaderRetrievalService), true, TestingTaskManager.class);
ActorGateway tmGateway = new AkkaActorGateway(taskManager, leaderSessionID);
// wait until the TaskManager actor answers before continuing
Future<Object> tmAlive = tmGateway.ask(TestingMessages.getAlive(), deadline.timeLeft());
Await.ready(tmAlive, deadline.timeLeft());
// single-vertex job whose parallelism equals the number of configured slots
JobVertex sourceJobVertex = new JobVertex("Source");
sourceJobVertex.setInvokableClass(BlockingStatefulInvokable.class);
sourceJobVertex.setParallelism(slots);
JobGraph jobGraph = new JobGraph("TestingJob", sourceJobVertex);
// the single source vertex is passed for all three vertex-list arguments of the settings
List<JobVertexID> vertexId = Collections.singletonList(sourceJobVertex.getID());
jobGraph.setSnapshotSettings(new JobSnapshottingSettings(vertexId, vertexId, vertexId, 100, 10 * 60 * 1000, 0, 1, ExternalizedCheckpointSettings.none(), null, true));
// prepares the invokable's static helpers for 'slots' tasks — presumably the latches and
// state holders used by awaitCompletedCheckpoints()/getRecoveredStates(); see
// BlockingStatefulInvokable
BlockingStatefulInvokable.initializeStaticHelpers(slots);
// both notification requests are sent before leadership is granted below
Future<Object> isLeader = gateway.ask(TestingJobManagerMessages.getNotifyWhenLeader(), deadline.timeLeft());
Future<Object> isConnectedToJobManager = tmGateway.ask(new TestingTaskManagerMessages.NotifyWhenRegisteredAtJobManager(jobManager), deadline.timeLeft());
// tell the JobManager that it is the leader
myLeaderElectionService.isLeader(leaderSessionID);
// tell taskManager who's the leader
myLeaderRetrievalService.notifyListener(gateway.path(), leaderSessionID);
Await.ready(isLeader, deadline.timeLeft());
Await.ready(isConnectedToJobManager, deadline.timeLeft());
// submit blocking job
Future<Object> jobSubmitted = gateway.ask(new JobManagerMessages.SubmitJob(jobGraph, ListeningBehaviour.DETACHED), deadline.timeLeft());
Await.ready(jobSubmitted, deadline.timeLeft());
// Wait for some checkpoints to complete
BlockingStatefulInvokable.awaitCompletedCheckpoints();
// request the removal notification before leadership is revoked
Future<Object> jobRemoved = gateway.ask(new TestingJobManagerMessages.NotifyWhenJobRemoved(jobGraph.getJobID()), deadline.timeLeft());
// Revoke leadership
myLeaderElectionService.notLeader();
// check that the job gets removed from the JobManager
Await.ready(jobRemoved, deadline.timeLeft());
// but stays in the submitted job graph store
assertTrue(mySubmittedJobGraphStore.contains(jobGraph.getJobID()));
// request the RUNNING notification before leadership is granted again
Future<Object> jobRunning = gateway.ask(new TestingJobManagerMessages.NotifyWhenJobStatus(jobGraph.getJobID(), JobStatus.RUNNING), deadline.timeLeft());
// Make the JobManager the leader again, this time under the new session id
myLeaderElectionService.isLeader(newLeaderSessionID);
// tell the TaskManager about it
myLeaderRetrievalService.notifyListener(gateway.path(), newLeaderSessionID);
// wait until the job is recovered and reaches state RUNNING
Await.ready(jobRunning, deadline.timeLeft());
Future<Object> jobFinished = gateway.ask(new TestingJobManagerMessages.NotifyWhenJobRemoved(jobGraph.getJobID()), deadline.timeLeft());
// presumably releases the blocked tasks so the job can run to completion — see BlockingInvokable
BlockingInvokable.unblock();
// wait until the job has finished
Await.ready(jobFinished, deadline.timeLeft());
// check that the job has been removed from the submitted job graph store
assertFalse(mySubmittedJobGraphStore.contains(jobGraph.getJobID()));
// Check that state has been recovered: every recovered value must be at least
// NUM_CHECKPOINTS_TO_COMPLETE
long[] recoveredStates = BlockingStatefulInvokable.getRecoveredStates();
for (long state : recoveredStates) {
boolean isExpected = state >= BlockingStatefulInvokable.NUM_CHECKPOINTS_TO_COMPLETE;
assertTrue("Did not recover checkpoint state correctly, expecting >= " + BlockingStatefulInvokable.NUM_CHECKPOINTS_TO_COMPLETE + ", but state was " + state, isExpected);
}
} finally {
// stop every actor that was started, regardless of the test outcome
if (archive != null) {
archive.tell(PoisonPill.getInstance(), ActorRef.noSender());
}
if (jobManager != null) {
jobManager.tell(PoisonPill.getInstance(), ActorRef.noSender());
}
if (taskManager != null) {
taskManager.tell(PoisonPill.getInstance(), ActorRef.noSender());
}
}
}
Use of org.apache.flink.runtime.blob.BlobServer in the Apache Flink project.
Class JobManagerLeaderElectionTest, method createJobManagerProps.
/**
 * Builds the actor {@link Props} for a {@link TestingJobManager} from the given configuration.
 *
 * <p>When the configured high-availability mode is {@code NONE} a standalone leader election
 * service is used; otherwise a Curator client is started and a ZooKeeper-based leader election
 * service is created from it. Job graph store and checkpoint recovery factory are always the
 * standalone variants.
 *
 * @param configuration configuration used for the HA mode lookup, the blob server and the
 *     job manager itself
 * @return props from which a {@link TestingJobManager} actor can be created
 * @throws Exception if one of the services cannot be created
 */
private Props createJobManagerProps(Configuration configuration) throws Exception {
    final LeaderElectionService electionService;
    if (HighAvailabilityMode.fromConfig(configuration) != HighAvailabilityMode.NONE) {
        // HA enabled: leader election goes through ZooKeeper
        final CuratorFramework curator = ZooKeeperUtils.startCuratorFramework(configuration);
        electionService = ZooKeeperUtils.createLeaderElectionService(curator, configuration);
    } else {
        electionService = new StandaloneLeaderElectionService();
    }

    // We don't need recovery in this test
    final SubmittedJobGraphStore jobGraphStore = new StandaloneSubmittedJobGraphStore();
    final CheckpointRecoveryFactory recoveryFactory = new StandaloneCheckpointRecoveryFactory();

    // arguments are listed one per line, in the order the TestingJobManager constructor expects
    return Props.create(
            TestingJobManager.class,
            configuration,
            TestingUtils.defaultExecutor(),
            TestingUtils.defaultExecutor(),
            new InstanceManager(),
            new Scheduler(TestingUtils.defaultExecutionContext()),
            new BlobLibraryCacheManager(new BlobServer(configuration), 10L),
            ActorRef.noSender(),
            new NoRestartStrategy.NoRestartStrategyFactory(),
            AkkaUtils.getDefaultTimeoutAsFiniteDuration(),
            electionService,
            jobGraphStore,
            recoveryFactory,
            AkkaUtils.getDefaultTimeoutAsFiniteDuration(),
            Option.apply(null));
}
Use of org.apache.flink.runtime.blob.BlobServer in the Apache Flink project.
Class MiniDispatcherTest, method setupClass.
/**
 * Initializes the fixtures shared by all tests of this class: a no-op job graph, the
 * execution graph info of that job in state {@code FINISHED}, a testing RPC service, an empty
 * configuration and a blob server backed by a fresh temporary folder and a void blob store.
 *
 * @throws IOException if the temporary folder or the blob server cannot be created
 */
@BeforeClass
public static void setupClass() throws IOException {
    jobGraph = JobGraphTestUtils.singleNoOpJobGraph();

    // archived graph of the job above, already in its terminal FINISHED state
    final ArchivedExecutionGraphBuilder finishedGraphBuilder =
            new ArchivedExecutionGraphBuilder()
                    .setJobID(jobGraph.getJobID())
                    .setState(JobStatus.FINISHED);
    executionGraphInfo = new ExecutionGraphInfo(finishedGraphBuilder.build());

    rpcService = new TestingRpcService();
    configuration = new Configuration();
    blobServer =
            new BlobServer(
                    configuration,
                    temporaryFolder.newFolder(),
                    new VoidBlobStore());
}
Use of org.apache.flink.runtime.blob.BlobServer in the Apache Flink project.
Class BlobLibraryCacheManagerTest, method testLibraryCacheManagerCleanup.
/**
 * Tests that the {@link BlobLibraryCacheManager} cleans up after all class loader leases for a
 * single job are closed.
 *
 * <p>Two permanent blobs are uploaded to a blob server. Two leases are then registered for the
 * same job and released one after the other, while the number of managed jobs, the number of
 * reference holders and the blob file counts on server and cache are checked after each step.
 */
@Test
public void testLibraryCacheManagerCleanup() throws Exception {
    final JobID job = new JobID();
    final List<PermanentBlobKey> blobKeys = new ArrayList<>();
    final byte[] payload = new byte[128];

    BlobServer blobServer = null;
    PermanentBlobCache blobCache = null;
    BlobLibraryCacheManager libraryCache = null;

    try {
        final Configuration configuration = new Configuration();
        configuration.setLong(BlobServerOptions.CLEANUP_INTERVAL, 1L);

        blobServer =
                new BlobServer(configuration, temporaryFolder.newFolder(), new VoidBlobStore());
        blobServer.start();
        final InetSocketAddress blobServerAddress =
                new InetSocketAddress("localhost", blobServer.getPort());
        blobCache =
                new PermanentBlobCache(
                        configuration,
                        temporaryFolder.newFolder(),
                        new VoidBlobStore(),
                        blobServerAddress);

        // upload two blobs that differ in their first byte
        blobKeys.add(blobServer.putPermanent(job, payload));
        payload[0]++;
        blobKeys.add(blobServer.putPermanent(job, payload));

        libraryCache = createBlobLibraryCacheManager(blobCache);
        blobCache.registerJob(job);

        // nothing registered at the library cache yet, nothing downloaded to the cache
        assertEquals(0, libraryCache.getNumberOfManagedJobs());
        assertEquals(0, libraryCache.getNumberOfReferenceHolders(job));
        checkFileCountForJob(2, job, blobServer);
        checkFileCountForJob(0, job, blobCache);

        // first lease: resolving the class loader pulls both blobs into the cache
        final LibraryCacheManager.ClassLoaderLease firstLease =
                libraryCache.registerClassLoaderLease(job);
        final UserCodeClassLoader firstClassLoader =
                firstLease.getOrResolveClassLoader(blobKeys, Collections.emptyList());

        assertEquals(1, libraryCache.getNumberOfManagedJobs());
        assertEquals(1, libraryCache.getNumberOfReferenceHolders(job));
        assertEquals(2, checkFilesExist(job, blobKeys, blobCache, true));
        checkFileCountForJob(2, job, blobServer);
        checkFileCountForJob(2, job, blobCache);

        // second lease for the same job must hand out the very same class loader
        final LibraryCacheManager.ClassLoaderLease secondLease =
                libraryCache.registerClassLoaderLease(job);
        final UserCodeClassLoader secondClassLoader =
                secondLease.getOrResolveClassLoader(blobKeys, Collections.emptyList());
        assertThat(firstClassLoader, sameInstance(secondClassLoader));

        // resolving with a different set of blob keys must be rejected
        try {
            firstLease.getOrResolveClassLoader(
                    Collections.emptyList(), Collections.emptyList());
            fail("Should fail with an IllegalStateException");
        } catch (IllegalStateException expected) {
            // that's what we want
        }

        // resolving with a different set of class paths must be rejected as well
        try {
            firstLease.getOrResolveClassLoader(
                    blobKeys,
                    Collections.singletonList(new URL("file:///tmp/does-not-exist")));
            fail("Should fail with an IllegalStateException");
        } catch (IllegalStateException expected) {
            // that's what we want
        }

        // the failed attempts must not have changed any bookkeeping
        assertEquals(1, libraryCache.getNumberOfManagedJobs());
        assertEquals(2, libraryCache.getNumberOfReferenceHolders(job));
        assertEquals(2, checkFilesExist(job, blobKeys, blobCache, true));
        checkFileCountForJob(2, job, blobServer);
        checkFileCountForJob(2, job, blobCache);

        // releasing the first lease leaves one reference holder
        firstLease.release();

        assertEquals(1, libraryCache.getNumberOfManagedJobs());
        assertEquals(1, libraryCache.getNumberOfReferenceHolders(job));
        assertEquals(2, checkFilesExist(job, blobKeys, blobCache, true));
        checkFileCountForJob(2, job, blobServer);
        checkFileCountForJob(2, job, blobCache);

        // releasing the last lease removes the job from the library cache manager
        secondLease.release();

        assertEquals(0, libraryCache.getNumberOfManagedJobs());
        assertEquals(0, libraryCache.getNumberOfReferenceHolders(job));
        assertEquals(2, checkFilesExist(job, blobKeys, blobCache, true));
        checkFileCountForJob(2, job, blobServer);
        checkFileCountForJob(2, job, blobCache);

        // only PermanentBlobCache#releaseJob() calls clean up files (tested in
        // BlobCacheCleanupTest etc.)
    } finally {
        if (libraryCache != null) {
            libraryCache.shutdown();
        }

        // should have been closed by the libraryCacheManager, but just in case
        if (blobCache != null) {
            blobCache.close();
        }

        if (blobServer != null) {
            blobServer.close();
        }
    }
}
Use of org.apache.flink.runtime.blob.BlobServer in the Apache Flink project.
Class BlobLibraryCacheManagerTest, method testRegisterAndDownload.
/**
 * Tests registering class loader leases for a job, the download of the referenced blob into
 * the {@link PermanentBlobCache}, and the failure to download another blob once the cache
 * directory has been made read-only.
 *
 * <p>Skipped on Windows because the test relies on {@link File#setWritable(boolean, boolean)}.
 */
@Test
public void testRegisterAndDownload() throws IOException {
    // setWritable doesn't work on Windows.
    assumeTrue(!OperatingSystem.isWindows());

    final JobID job = new JobID();

    BlobServer blobServer = null;
    PermanentBlobCache blobCache = null;
    BlobLibraryCacheManager libraryCache = null;
    File jobCacheDir = null;

    try {
        // create the blob transfer services
        final Configuration configuration = new Configuration();
        configuration.setLong(BlobServerOptions.CLEANUP_INTERVAL, 1_000_000L);

        blobServer =
                new BlobServer(configuration, temporaryFolder.newFolder(), new VoidBlobStore());
        blobServer.start();
        final InetSocketAddress blobServerAddress =
                new InetSocketAddress("localhost", blobServer.getPort());
        blobCache =
                new PermanentBlobCache(
                        configuration,
                        temporaryFolder.newFolder(),
                        new VoidBlobStore(),
                        blobServerAddress);

        // upload some meaningless data to the server
        final PermanentBlobKey firstKey =
                blobServer.putPermanent(job, new byte[] { 1, 2, 3, 4, 5, 6, 7, 8 });
        final PermanentBlobKey secondKey =
                blobServer.putPermanent(job, new byte[] { 11, 12, 13, 14, 15, 16, 17, 18 });

        libraryCache = createBlobLibraryCacheManager(blobCache);
        assertEquals(0, libraryCache.getNumberOfManagedJobs());
        checkFileCountForJob(2, job, blobServer);
        checkFileCountForJob(0, job, blobCache);

        // first try to access a non-existing entry
        assertEquals(0, libraryCache.getNumberOfReferenceHolders(new JobID()));

        // register some BLOBs as libraries
        {
            final Collection<PermanentBlobKey> requiredKeys = Collections.singleton(firstKey);
            blobCache.registerJob(job);

            // first lease: only the requested blob ends up in the cache
            final LibraryCacheManager.ClassLoaderLease firstLease =
                    libraryCache.registerClassLoaderLease(job);
            final UserCodeClassLoader firstClassLoader =
                    firstLease.getOrResolveClassLoader(requiredKeys, Collections.emptyList());

            assertEquals(1, libraryCache.getNumberOfManagedJobs());
            assertEquals(1, libraryCache.getNumberOfReferenceHolders(job));
            assertEquals(1, checkFilesExist(job, requiredKeys, blobCache, true));
            checkFileCountForJob(2, job, blobServer);
            checkFileCountForJob(1, job, blobCache);

            // second lease for the same job shares the class loader
            final LibraryCacheManager.ClassLoaderLease secondLease =
                    libraryCache.registerClassLoaderLease(job);
            final UserCodeClassLoader secondClassLoader =
                    secondLease.getOrResolveClassLoader(requiredKeys, Collections.emptyList());
            assertThat(firstClassLoader, sameInstance(secondClassLoader));

            assertEquals(1, libraryCache.getNumberOfManagedJobs());
            assertEquals(2, libraryCache.getNumberOfReferenceHolders(job));
            assertEquals(1, checkFilesExist(job, requiredKeys, blobCache, true));
            checkFileCountForJob(2, job, blobServer);
            checkFileCountForJob(1, job, blobCache);

            // give back the first lease
            firstLease.release();

            // one reference holder is still left
            assertEquals(1, libraryCache.getNumberOfManagedJobs());
            assertEquals(1, libraryCache.getNumberOfReferenceHolders(job));
            assertEquals(1, checkFilesExist(job, requiredKeys, blobCache, true));
            checkFileCountForJob(2, job, blobServer);
            checkFileCountForJob(1, job, blobCache);

            // give back the remaining lease
            secondLease.release();

            assertEquals(0, libraryCache.getNumberOfManagedJobs());
            assertEquals(0, libraryCache.getNumberOfReferenceHolders(job));
            // changing the libCache registration does not influence the BLOB stores...
            checkFileCountForJob(2, job, blobServer);
            checkFileCountForJob(1, job, blobCache);

            blobCache.releaseJob(job);

            // library is still cached (but not associated with job any more)
            checkFileCountForJob(2, job, blobServer);
            checkFileCountForJob(1, job, blobCache);
        }

        // see BlobUtils for the directory layout
        jobCacheDir = blobCache.getStorageLocation(job, new PermanentBlobKey()).getParentFile();
        assertTrue(jobCacheDir.exists());

        // make sure no further blobs can be downloaded by removing the write
        // permissions from the directory
        assertTrue(
                "Could not remove write permissions from cache directory",
                jobCacheDir.setWritable(false, false));

        // since we cannot download this library any more, this call should fail
        try {
            blobCache.registerJob(job);
            final LibraryCacheManager.ClassLoaderLease failingLease =
                    libraryCache.registerClassLoaderLease(job);
            failingLease.getOrResolveClassLoader(
                    Collections.singleton(secondKey), Collections.emptyList());
            fail("This should fail with an IOException");
        } catch (IOException expected) {
            // splendid! undo the job registration made at the top of the try block
            blobCache.releaseJob(job);
        }
    } finally {
        // restore write permissions first so the temporary folder can be cleaned up
        if (jobCacheDir != null) {
            if (!jobCacheDir.setWritable(true, false)) {
                System.err.println("Could not re-add write permissions to cache directory.");
            }
        }

        if (blobCache != null) {
            blobCache.close();
        }

        if (libraryCache != null) {
            libraryCache.shutdown();
        }

        if (blobServer != null) {
            blobServer.close();
        }
    }
}
Aggregations