use of org.apache.flink.runtime.instance.ActorGateway in project flink by apache.
the class JobClientActorRecoveryITCase method testJobClientRecovery.
/**
* Tests wether the JobClientActor can connect to a newly elected leading job manager to obtain
* the JobExecutionResult. The submitted job blocks for the first execution attempt. The
* leading job manager will be killed so that the second job manager will be elected as the
* leader. The newly elected leader has to retrieve the checkpointed job from ZooKeeper
* and continue its execution. This time, the job does not block and, thus, can be finished.
* The execution result should be sent to the JobClientActor which originally submitted the
* job.
*
* @throws Exception
*/
@Test
public void testJobClientRecovery() throws Exception {
File rootFolder = tempFolder.getRoot();
Configuration config = ZooKeeperTestUtils.createZooKeeperHAConfig(zkServer.getConnectString(), rootFolder.getPath());
config.setInteger(ConfigConstants.LOCAL_NUMBER_JOB_MANAGER, 2);
config.setInteger(ConfigConstants.LOCAL_NUMBER_TASK_MANAGER, 1);
final TestingCluster cluster = new TestingCluster(config);
cluster.start();
JobVertex blockingVertex = new JobVertex("Blocking Vertex");
blockingVertex.setInvokableClass(BlockingTask.class);
blockingVertex.setParallelism(1);
final JobGraph jobGraph = new JobGraph("Blocking Test Job", blockingVertex);
final Promise<JobExecutionResult> promise = new scala.concurrent.impl.Promise.DefaultPromise<>();
Deadline deadline = new FiniteDuration(2, TimeUnit.MINUTES).fromNow();
try {
Thread submitter = new Thread(new Runnable() {
@Override
public void run() {
try {
JobExecutionResult result = cluster.submitJobAndWait(jobGraph, false);
promise.success(result);
} catch (Exception e) {
promise.failure(e);
}
}
});
submitter.start();
synchronized (BlockingTask.waitLock) {
while (BlockingTask.HasBlockedExecution < 1 && deadline.hasTimeLeft()) {
BlockingTask.waitLock.wait(deadline.timeLeft().toMillis());
}
}
if (deadline.isOverdue()) {
Assert.fail("The job has not blocked within the given deadline.");
}
ActorGateway gateway = cluster.getLeaderGateway(deadline.timeLeft());
gateway.tell(TestingJobManagerMessages.getDisablePostStop());
gateway.tell(PoisonPill.getInstance());
// if the job fails then an exception is thrown here
Await.result(promise.future(), deadline.timeLeft());
} finally {
cluster.shutdown();
}
}
use of org.apache.flink.runtime.instance.ActorGateway in project flink by apache.
the class ClusterShutdownITCase method testClusterShutdownWithoutResourceManager.
/**
* Tests a faked cluster shutdown procedure without the ResourceManager.
*/
@Test
public void testClusterShutdownWithoutResourceManager() {
new JavaTestKit(system) {
{
new Within(duration("30 seconds")) {
@Override
protected void run() {
ActorGateway me = TestingUtils.createForwardingActor(system, getTestActor(), Option.<String>empty());
// start job manager which doesn't shutdown the actor system
ActorGateway jobManager = TestingUtils.createJobManager(system, TestingUtils.defaultExecutor(), TestingUtils.defaultExecutor(), config, "jobmanager1");
// Tell the JobManager to inform us of shutdown actions
jobManager.tell(TestingMessages.getNotifyOfComponentShutdown(), me);
// Register a TaskManager
ActorGateway taskManager = TestingUtils.createTaskManager(system, jobManager, config, true, true);
// Tell the TaskManager to inform us of TaskManager shutdowns
taskManager.tell(TestingMessages.getNotifyOfComponentShutdown(), me);
// No resource manager connected
jobManager.tell(new StopCluster(ApplicationStatus.SUCCEEDED, "Shutting down."), me);
expectMsgAllOf(new TestingMessages.ComponentShutdown(taskManager.actor()), new TestingMessages.ComponentShutdown(jobManager.actor()), StopClusterSuccessful.getInstance());
}
};
}
};
}
use of org.apache.flink.runtime.instance.ActorGateway in project flink by apache.
the class ResourceManagerITCase method testResourceManagerTaskManagerRegistration.
/**
* Tests whether the resource manager gets informed upon TaskManager registration.
*/
@Test
public void testResourceManagerTaskManagerRegistration() {
new JavaTestKit(system) {
{
new Within(duration("30 seconds")) {
@Override
protected void run() {
ActorGateway jobManager = TestingUtils.createJobManager(system, TestingUtils.defaultExecutor(), TestingUtils.defaultExecutor(), config, "RegTest");
ActorGateway me = TestingUtils.createForwardingActor(system, getTestActor(), Option.<String>empty());
// start the resource manager
ActorGateway resourceManager = TestingUtils.createResourceManager(system, jobManager.actor(), config);
// notify about a resource manager registration at the job manager
resourceManager.tell(new TestingResourceManager.NotifyWhenResourceManagerConnected(), me);
// Wait for resource manager
expectMsgEquals(Acknowledge.get());
// start task manager and wait for registration
ActorGateway taskManager = TestingUtils.createTaskManager(system, jobManager.actor(), config, true, true);
// check if we registered the task manager resource
resourceManager.tell(new TestingResourceManager.GetRegisteredResources(), me);
TestingResourceManager.GetRegisteredResourcesReply reply = expectMsgClass(TestingResourceManager.GetRegisteredResourcesReply.class);
assertEquals(1, reply.resources.size());
}
};
}
};
}
use of org.apache.flink.runtime.instance.ActorGateway in project flink by apache.
the class BackPressureStatsTrackerITCase method testBackPressuredProducer.
/**
* Tests a simple fake-back pressured task. Back pressure is assumed when
* sampled stack traces are in blocking buffer requests.
*/
@Test
public void testBackPressuredProducer() throws Exception {
new JavaTestKit(testActorSystem) {
{
final FiniteDuration deadline = new FiniteDuration(60, TimeUnit.SECONDS);
// The JobGraph
final JobGraph jobGraph = new JobGraph();
final int parallelism = 4;
final JobVertex task = new JobVertex("Task");
task.setInvokableClass(BackPressuredTask.class);
task.setParallelism(parallelism);
jobGraph.addVertex(task);
ActorGateway jobManger = null;
ActorGateway taskManager = null;
//
// 1) Consume all buffers at first (no buffers for the test task)
//
testBufferPool = networkBufferPool.createBufferPool(1, Integer.MAX_VALUE);
final List<Buffer> buffers = new ArrayList<>();
while (true) {
Buffer buffer = testBufferPool.requestBuffer();
if (buffer != null) {
buffers.add(buffer);
} else {
break;
}
}
try {
jobManger = TestingUtils.createJobManager(testActorSystem, TestingUtils.defaultExecutor(), TestingUtils.defaultExecutor(), new Configuration());
final Configuration config = new Configuration();
config.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, parallelism);
taskManager = TestingUtils.createTaskManager(testActorSystem, jobManger, config, true, true);
final ActorGateway jm = jobManger;
new Within(deadline) {
@Override
protected void run() {
try {
ActorGateway testActor = new AkkaActorGateway(getTestActor(), null);
// Submit the job and wait until it is running
JobClient.submitJobDetached(jm, config, jobGraph, deadline, ClassLoader.getSystemClassLoader());
jm.tell(new WaitForAllVerticesToBeRunning(jobGraph.getJobID()), testActor);
expectMsgEquals(new AllVerticesRunning(jobGraph.getJobID()));
// Get the ExecutionGraph
jm.tell(new RequestExecutionGraph(jobGraph.getJobID()), testActor);
ExecutionGraphFound executionGraphResponse = expectMsgClass(ExecutionGraphFound.class);
ExecutionGraph executionGraph = (ExecutionGraph) executionGraphResponse.executionGraph();
ExecutionJobVertex vertex = executionGraph.getJobVertex(task.getID());
StackTraceSampleCoordinator coordinator = new StackTraceSampleCoordinator(testActorSystem.dispatcher(), 60000);
// Verify back pressure (clean up interval can be ignored)
BackPressureStatsTracker statsTracker = new BackPressureStatsTracker(coordinator, 100 * 1000, 20, Time.milliseconds(10L));
int numAttempts = 10;
int nextSampleId = 0;
// the buffer.
for (int attempt = 0; attempt < numAttempts; attempt++) {
try {
OperatorBackPressureStats stats = triggerStatsSample(statsTracker, vertex);
assertEquals(nextSampleId + attempt, stats.getSampleId());
assertEquals(parallelism, stats.getNumberOfSubTasks());
assertEquals(1.0, stats.getMaxBackPressureRatio(), 0.0);
for (int i = 0; i < parallelism; i++) {
assertEquals(1.0, stats.getBackPressureRatio(i), 0.0);
}
nextSampleId = stats.getSampleId() + 1;
break;
} catch (Throwable t) {
if (attempt == numAttempts - 1) {
throw t;
} else {
Thread.sleep(500);
}
}
}
//
for (Buffer buf : buffers) {
buf.recycle();
}
// grab them and then immediately release them.
while (testBufferPool.getNumberOfAvailableMemorySegments() < 100) {
Thread.sleep(100);
}
// Verify that no task is back pressured any more.
for (int attempt = 0; attempt < numAttempts; attempt++) {
try {
OperatorBackPressureStats stats = triggerStatsSample(statsTracker, vertex);
assertEquals(nextSampleId + attempt, stats.getSampleId());
assertEquals(parallelism, stats.getNumberOfSubTasks());
// Verify that no task is back pressured
for (int i = 0; i < parallelism; i++) {
assertEquals(0.0, stats.getBackPressureRatio(i), 0.0);
}
break;
} catch (Throwable t) {
if (attempt == numAttempts - 1) {
throw t;
} else {
Thread.sleep(500);
}
}
}
// Shut down
jm.tell(new TestingJobManagerMessages.NotifyWhenJobRemoved(jobGraph.getJobID()), testActor);
// Cancel job
jm.tell(new JobManagerMessages.CancelJob(jobGraph.getJobID()));
// Response to removal notification
expectMsgEquals(true);
//
// 3) Trigger stats for archived job
//
statsTracker.invalidateOperatorStatsCache();
assertFalse("Unexpected trigger", statsTracker.triggerStackTraceSample(vertex));
} catch (Exception e) {
e.printStackTrace();
fail(e.getMessage());
}
}
};
} finally {
TestingUtils.stopActor(jobManger);
TestingUtils.stopActor(taskManager);
for (Buffer buf : buffers) {
buf.recycle();
}
testBufferPool.lazyDestroy();
}
}
};
}
use of org.apache.flink.runtime.instance.ActorGateway in project flink by apache.
the class JobManagerHARecoveryTest method testJobRecoveryWhenLosingLeadership.
/**
* Tests that the persisted job is not removed from the SubmittedJobGraphStore if the JobManager
* loses its leadership. Furthermore, it tests that the job manager can recover the job from
* the SubmittedJobGraphStore and checkpoint state is recovered as well.
*/
@Test
public void testJobRecoveryWhenLosingLeadership() throws Exception {
FiniteDuration timeout = new FiniteDuration(30, TimeUnit.SECONDS);
FiniteDuration jobRecoveryTimeout = new FiniteDuration(3, TimeUnit.SECONDS);
Deadline deadline = new FiniteDuration(2, TimeUnit.MINUTES).fromNow();
Configuration flinkConfiguration = new Configuration();
UUID leaderSessionID = UUID.randomUUID();
UUID newLeaderSessionID = UUID.randomUUID();
int slots = 2;
ActorRef archive = null;
ActorRef jobManager = null;
ActorRef taskManager = null;
flinkConfiguration.setString(HighAvailabilityOptions.HA_MODE, "zookeeper");
flinkConfiguration.setString(HighAvailabilityOptions.HA_STORAGE_PATH, temporaryFolder.newFolder().toString());
flinkConfiguration.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, slots);
try {
Scheduler scheduler = new Scheduler(TestingUtils.defaultExecutionContext());
MySubmittedJobGraphStore mySubmittedJobGraphStore = new MySubmittedJobGraphStore();
MyCheckpointStore checkpointStore = new MyCheckpointStore();
CheckpointIDCounter checkpointCounter = new StandaloneCheckpointIDCounter();
CheckpointRecoveryFactory checkpointStateFactory = new MyCheckpointRecoveryFactory(checkpointStore, checkpointCounter);
TestingLeaderElectionService myLeaderElectionService = new TestingLeaderElectionService();
TestingLeaderRetrievalService myLeaderRetrievalService = new TestingLeaderRetrievalService();
InstanceManager instanceManager = new InstanceManager();
instanceManager.addInstanceListener(scheduler);
archive = system.actorOf(Props.create(MemoryArchivist.class, 10));
Props jobManagerProps = Props.create(TestingJobManager.class, flinkConfiguration, TestingUtils.defaultExecutor(), TestingUtils.defaultExecutor(), instanceManager, scheduler, new BlobLibraryCacheManager(new BlobServer(flinkConfiguration), 3600000), archive, new FixedDelayRestartStrategy.FixedDelayRestartStrategyFactory(Int.MaxValue(), 100), timeout, myLeaderElectionService, mySubmittedJobGraphStore, checkpointStateFactory, jobRecoveryTimeout, Option.apply(null));
jobManager = system.actorOf(jobManagerProps);
ActorGateway gateway = new AkkaActorGateway(jobManager, leaderSessionID);
taskManager = TaskManager.startTaskManagerComponentsAndActor(flinkConfiguration, ResourceID.generate(), system, "localhost", Option.apply("taskmanager"), Option.apply((LeaderRetrievalService) myLeaderRetrievalService), true, TestingTaskManager.class);
ActorGateway tmGateway = new AkkaActorGateway(taskManager, leaderSessionID);
Future<Object> tmAlive = tmGateway.ask(TestingMessages.getAlive(), deadline.timeLeft());
Await.ready(tmAlive, deadline.timeLeft());
JobVertex sourceJobVertex = new JobVertex("Source");
sourceJobVertex.setInvokableClass(BlockingStatefulInvokable.class);
sourceJobVertex.setParallelism(slots);
JobGraph jobGraph = new JobGraph("TestingJob", sourceJobVertex);
List<JobVertexID> vertexId = Collections.singletonList(sourceJobVertex.getID());
jobGraph.setSnapshotSettings(new JobSnapshottingSettings(vertexId, vertexId, vertexId, 100, 10 * 60 * 1000, 0, 1, ExternalizedCheckpointSettings.none(), null, true));
BlockingStatefulInvokable.initializeStaticHelpers(slots);
Future<Object> isLeader = gateway.ask(TestingJobManagerMessages.getNotifyWhenLeader(), deadline.timeLeft());
Future<Object> isConnectedToJobManager = tmGateway.ask(new TestingTaskManagerMessages.NotifyWhenRegisteredAtJobManager(jobManager), deadline.timeLeft());
// tell jobManager that he's the leader
myLeaderElectionService.isLeader(leaderSessionID);
// tell taskManager who's the leader
myLeaderRetrievalService.notifyListener(gateway.path(), leaderSessionID);
Await.ready(isLeader, deadline.timeLeft());
Await.ready(isConnectedToJobManager, deadline.timeLeft());
// submit blocking job
Future<Object> jobSubmitted = gateway.ask(new JobManagerMessages.SubmitJob(jobGraph, ListeningBehaviour.DETACHED), deadline.timeLeft());
Await.ready(jobSubmitted, deadline.timeLeft());
// Wait for some checkpoints to complete
BlockingStatefulInvokable.awaitCompletedCheckpoints();
Future<Object> jobRemoved = gateway.ask(new TestingJobManagerMessages.NotifyWhenJobRemoved(jobGraph.getJobID()), deadline.timeLeft());
// Revoke leadership
myLeaderElectionService.notLeader();
// check that the job gets removed from the JobManager
Await.ready(jobRemoved, deadline.timeLeft());
// but stays in the submitted job graph store
assertTrue(mySubmittedJobGraphStore.contains(jobGraph.getJobID()));
Future<Object> jobRunning = gateway.ask(new TestingJobManagerMessages.NotifyWhenJobStatus(jobGraph.getJobID(), JobStatus.RUNNING), deadline.timeLeft());
// Make JobManager again a leader
myLeaderElectionService.isLeader(newLeaderSessionID);
// tell the TaskManager about it
myLeaderRetrievalService.notifyListener(gateway.path(), newLeaderSessionID);
// wait that the job is recovered and reaches state RUNNING
Await.ready(jobRunning, deadline.timeLeft());
Future<Object> jobFinished = gateway.ask(new TestingJobManagerMessages.NotifyWhenJobRemoved(jobGraph.getJobID()), deadline.timeLeft());
BlockingInvokable.unblock();
// wait til the job has finished
Await.ready(jobFinished, deadline.timeLeft());
// check that the job has been removed from the submitted job graph store
assertFalse(mySubmittedJobGraphStore.contains(jobGraph.getJobID()));
// Check that state has been recovered
long[] recoveredStates = BlockingStatefulInvokable.getRecoveredStates();
for (long state : recoveredStates) {
boolean isExpected = state >= BlockingStatefulInvokable.NUM_CHECKPOINTS_TO_COMPLETE;
assertTrue("Did not recover checkpoint state correctly, expecting >= " + BlockingStatefulInvokable.NUM_CHECKPOINTS_TO_COMPLETE + ", but state was " + state, isExpected);
}
} finally {
if (archive != null) {
archive.tell(PoisonPill.getInstance(), ActorRef.noSender());
}
if (jobManager != null) {
jobManager.tell(PoisonPill.getInstance(), ActorRef.noSender());
}
if (taskManager != null) {
taskManager.tell(PoisonPill.getInstance(), ActorRef.noSender());
}
}
}
Aggregations