use of akka.actor.ActorRef in project flink by apache.
the class JobManagerHAJobGraphRecoveryITCase method testClientNonDetachedListeningBehaviour.
/**
* Tests that clients receive updates after recovery by a new leader.
*/
@Test
public void testClientNonDetachedListeningBehaviour() throws Exception {
Configuration config = ZooKeeperTestUtils.createZooKeeperHAConfig(ZooKeeper.getConnectString(), FileStateBackendBasePath.getPath());
// Test actor system
ActorSystem testSystem = null;
// JobManager setup. Start the job managers as separate processes in order to not run the
// actors postStop, which cleans up all running jobs.
JobManagerProcess[] jobManagerProcess = new JobManagerProcess[2];
LeaderRetrievalService leaderRetrievalService = null;
ActorSystem taskManagerSystem = null;
try {
final Deadline deadline = TestTimeOut.fromNow();
// Test actor system
testSystem = AkkaUtils.createActorSystem(new Configuration(), new Some<>(new Tuple2<String, Object>("localhost", 0)));
// The job managers
jobManagerProcess[0] = new JobManagerProcess(0, config);
jobManagerProcess[1] = new JobManagerProcess(1, config);
jobManagerProcess[0].startProcess();
jobManagerProcess[1].startProcess();
// Leader listener
TestingListener leaderListener = new TestingListener();
leaderRetrievalService = ZooKeeperUtils.createLeaderRetrievalService(config);
leaderRetrievalService.start(leaderListener);
// The task manager
taskManagerSystem = AkkaUtils.createActorSystem(AkkaUtils.getDefaultAkkaConfig());
TaskManager.startTaskManagerComponentsAndActor(config, ResourceID.generate(), taskManagerSystem, "localhost", Option.<String>empty(), Option.<LeaderRetrievalService>empty(), false, TaskManager.class);
// Client test actor
TestActorRef<RecordingTestClient> clientRef = TestActorRef.create(testSystem, Props.create(RecordingTestClient.class));
JobGraph jobGraph = createBlockingJobGraph();
{
// Initial submission
leaderListener.waitForNewLeader(deadline.timeLeft().toMillis());
String leaderAddress = leaderListener.getAddress();
UUID leaderId = leaderListener.getLeaderSessionID();
// The client
AkkaActorGateway client = new AkkaActorGateway(clientRef, leaderId);
// Get the leader ref
ActorRef leaderRef = AkkaUtils.getActorRef(leaderAddress, testSystem, deadline.timeLeft());
ActorGateway leader = new AkkaActorGateway(leaderRef, leaderId);
int numSlots = 0;
while (numSlots == 0) {
Future<?> slotsFuture = leader.ask(JobManagerMessages.getRequestTotalNumberOfSlots(), deadline.timeLeft());
numSlots = (Integer) Await.result(slotsFuture, deadline.timeLeft());
}
// Submit the job in non-detached mode
leader.tell(new SubmitJob(jobGraph, ListeningBehaviour.EXECUTION_RESULT_AND_STATE_CHANGES), client);
JobManagerActorTestUtils.waitForJobStatus(jobGraph.getJobID(), JobStatus.RUNNING, leader, deadline.timeLeft());
}
// Who's the boss?
JobManagerProcess leadingJobManagerProcess;
if (jobManagerProcess[0].getJobManagerAkkaURL(deadline.timeLeft()).equals(leaderListener.getAddress())) {
leadingJobManagerProcess = jobManagerProcess[0];
} else {
leadingJobManagerProcess = jobManagerProcess[1];
}
// Kill the leading job manager process
leadingJobManagerProcess.destroy();
{
// Recovery by the standby JobManager
leaderListener.waitForNewLeader(deadline.timeLeft().toMillis());
String leaderAddress = leaderListener.getAddress();
UUID leaderId = leaderListener.getLeaderSessionID();
ActorRef leaderRef = AkkaUtils.getActorRef(leaderAddress, testSystem, deadline.timeLeft());
ActorGateway leader = new AkkaActorGateway(leaderRef, leaderId);
JobManagerActorTestUtils.waitForJobStatus(jobGraph.getJobID(), JobStatus.RUNNING, leader, deadline.timeLeft());
// Cancel the job
leader.tell(new JobManagerMessages.CancelJob(jobGraph.getJobID()));
}
// Wait for the execution result
clientRef.underlyingActor().awaitJobResult(deadline.timeLeft().toMillis());
int jobSubmitSuccessMessages = 0;
for (Object msg : clientRef.underlyingActor().getMessages()) {
if (msg instanceof JobManagerMessages.JobSubmitSuccess) {
jobSubmitSuccessMessages++;
}
}
// At least two submissions should be ack-ed (initial and recovery). This is quite
// conservative, but it is still possible that these messages are overtaken by the
// final message.
assertEquals(2, jobSubmitSuccessMessages);
} catch (Throwable t) {
// Print early (in some situations the process logs get too big
// for Travis and the root problem is not shown)
t.printStackTrace();
// In case of an error, print the job manager process logs.
if (jobManagerProcess[0] != null) {
jobManagerProcess[0].printProcessLog();
}
if (jobManagerProcess[1] != null) {
jobManagerProcess[1].printProcessLog();
}
throw t;
} finally {
if (jobManagerProcess[0] != null) {
jobManagerProcess[0].destroy();
}
if (jobManagerProcess[1] != null) {
jobManagerProcess[1].destroy();
}
if (leaderRetrievalService != null) {
leaderRetrievalService.stop();
}
if (taskManagerSystem != null) {
taskManagerSystem.shutdown();
}
if (testSystem != null) {
testSystem.shutdown();
}
}
}
use of akka.actor.ActorRef in project flink by apache.
the class JobManagerHAProcessFailureBatchRecoveryITCase method testJobManagerProcessFailure.
@Test
public void testJobManagerProcessFailure() throws Exception {
// Config
final int numberOfJobManagers = 2;
final int numberOfTaskManagers = 2;
final int numberOfSlotsPerTaskManager = 2;
assertEquals(PARALLELISM, numberOfTaskManagers * numberOfSlotsPerTaskManager);
// Setup
// Test actor system
ActorSystem testActorSystem;
// Job managers
final JobManagerProcess[] jmProcess = new JobManagerProcess[numberOfJobManagers];
// Task managers
final ActorSystem[] tmActorSystem = new ActorSystem[numberOfTaskManagers];
// Leader election service
LeaderRetrievalService leaderRetrievalService = null;
// Coordination between the processes goes through a directory
File coordinateTempDir = null;
try {
final Deadline deadline = TestTimeOut.fromNow();
// Coordination directory
coordinateTempDir = createTempDirectory();
// Job Managers
Configuration config = ZooKeeperTestUtils.createZooKeeperHAConfig(ZooKeeper.getConnectString(), FileStateBackendBasePath.getPath());
// Start first process
jmProcess[0] = new JobManagerProcess(0, config);
jmProcess[0].startProcess();
// Task manager configuration
config.setInteger(ConfigConstants.TASK_MANAGER_MEMORY_SIZE_KEY, 4);
config.setInteger(ConfigConstants.TASK_MANAGER_NETWORK_NUM_BUFFERS_KEY, 100);
config.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, 2);
// Start the task manager process
for (int i = 0; i < numberOfTaskManagers; i++) {
tmActorSystem[i] = AkkaUtils.createActorSystem(AkkaUtils.getDefaultAkkaConfig());
TaskManager.startTaskManagerComponentsAndActor(config, ResourceID.generate(), tmActorSystem[i], "localhost", Option.<String>empty(), Option.<LeaderRetrievalService>empty(), false, TaskManager.class);
}
// Test actor system
testActorSystem = AkkaUtils.createActorSystem(AkkaUtils.getDefaultAkkaConfig());
jmProcess[0].getActorRef(testActorSystem, deadline.timeLeft());
// Leader listener
TestingListener leaderListener = new TestingListener();
leaderRetrievalService = ZooKeeperUtils.createLeaderRetrievalService(config);
leaderRetrievalService.start(leaderListener);
// Initial submission
leaderListener.waitForNewLeader(deadline.timeLeft().toMillis());
String leaderAddress = leaderListener.getAddress();
UUID leaderId = leaderListener.getLeaderSessionID();
// Get the leader ref
ActorRef leaderRef = AkkaUtils.getActorRef(leaderAddress, testActorSystem, deadline.timeLeft());
ActorGateway leaderGateway = new AkkaActorGateway(leaderRef, leaderId);
// Wait for all task managers to connect to the leading job manager
JobManagerActorTestUtils.waitForTaskManagers(numberOfTaskManagers, leaderGateway, deadline.timeLeft());
final File coordinateDirClosure = coordinateTempDir;
final Throwable[] errorRef = new Throwable[1];
// we trigger program execution in a separate thread
Thread programTrigger = new Thread("Program Trigger") {
@Override
public void run() {
try {
testJobManagerFailure(ZooKeeper.getConnectString(), coordinateDirClosure);
} catch (Throwable t) {
t.printStackTrace();
errorRef[0] = t;
}
}
};
//start the test program
programTrigger.start();
// wait until all marker files are in place, indicating that all tasks have started
AbstractTaskManagerProcessFailureRecoveryTest.waitForMarkerFiles(coordinateTempDir, READY_MARKER_FILE_PREFIX, PARALLELISM, deadline.timeLeft().toMillis());
// Kill one of the job managers and trigger recovery
jmProcess[0].destroy();
jmProcess[1] = new JobManagerProcess(1, config);
jmProcess[1].startProcess();
jmProcess[1].getActorRef(testActorSystem, deadline.timeLeft());
// we create the marker file which signals the program functions tasks that they can complete
AbstractTaskManagerProcessFailureRecoveryTest.touchFile(new File(coordinateTempDir, PROCEED_MARKER_FILE));
programTrigger.join(deadline.timeLeft().toMillis());
// We wait for the finish marker file. We don't wait for the program trigger, because
// we submit in detached mode.
AbstractTaskManagerProcessFailureRecoveryTest.waitForMarkerFiles(coordinateTempDir, FINISH_MARKER_FILE_PREFIX, 1, deadline.timeLeft().toMillis());
// check that the program really finished
assertFalse("The program did not finish in time", programTrigger.isAlive());
// check whether the program encountered an error
if (errorRef[0] != null) {
Throwable error = errorRef[0];
error.printStackTrace();
fail("The program encountered a " + error.getClass().getSimpleName() + " : " + error.getMessage());
}
} catch (Throwable t) {
// Print early (in some situations the process logs get too big
// for Travis and the root problem is not shown)
t.printStackTrace();
for (JobManagerProcess p : jmProcess) {
if (p != null) {
p.printProcessLog();
}
}
throw t;
} finally {
for (int i = 0; i < numberOfTaskManagers; i++) {
if (tmActorSystem[i] != null) {
tmActorSystem[i].shutdown();
}
}
if (leaderRetrievalService != null) {
leaderRetrievalService.stop();
}
for (JobManagerProcess jmProces : jmProcess) {
if (jmProces != null) {
jmProces.destroy();
}
}
// Delete coordination directory
if (coordinateTempDir != null) {
try {
FileUtils.deleteDirectory(coordinateTempDir);
} catch (Throwable ignored) {
}
}
}
}
use of akka.actor.ActorRef in project flink by apache.
the class ProcessFailureCancelingITCase method testCancelingOnProcessFailure.
@Test
public void testCancelingOnProcessFailure() {
final StringWriter processOutput = new StringWriter();
ActorSystem jmActorSystem = null;
Process taskManagerProcess = null;
try {
// check that we run this test only if the java command
// is available on this machine
String javaCommand = getJavaCommandPath();
if (javaCommand == null) {
System.out.println("---- Skipping Process Failure test : Could not find java executable ----");
return;
}
// create a logging file for the process
File tempLogFile = File.createTempFile(getClass().getSimpleName() + "-", "-log4j.properties");
tempLogFile.deleteOnExit();
CommonTestUtils.printLog4jDebugConfig(tempLogFile);
// find a free port to start the JobManager
final int jobManagerPort = NetUtils.getAvailablePort();
// start a JobManager
Tuple2<String, Object> localAddress = new Tuple2<String, Object>("localhost", jobManagerPort);
Configuration jmConfig = new Configuration();
jmConfig.setString(ConfigConstants.AKKA_WATCH_HEARTBEAT_INTERVAL, "5 s");
jmConfig.setString(ConfigConstants.AKKA_WATCH_HEARTBEAT_PAUSE, "2000 s");
jmConfig.setInteger(ConfigConstants.AKKA_WATCH_THRESHOLD, 10);
jmConfig.setString(ConfigConstants.AKKA_ASK_TIMEOUT, "100 s");
jmActorSystem = AkkaUtils.createActorSystem(jmConfig, new Some<>(localAddress));
ActorRef jmActor = JobManager.startJobManagerActors(jmConfig, jmActorSystem, TestingUtils.defaultExecutor(), TestingUtils.defaultExecutor(), JobManager.class, MemoryArchivist.class)._1();
// the TaskManager java command
String[] command = new String[] { javaCommand, "-Dlog.level=DEBUG", "-Dlog4j.configuration=file:" + tempLogFile.getAbsolutePath(), "-Xms80m", "-Xmx80m", "-classpath", getCurrentClasspath(), AbstractTaskManagerProcessFailureRecoveryTest.TaskManagerProcessEntryPoint.class.getName(), String.valueOf(jobManagerPort) };
// start the first two TaskManager processes
taskManagerProcess = new ProcessBuilder(command).start();
new CommonTestUtils.PipeForwarder(taskManagerProcess.getErrorStream(), processOutput);
// we wait for the JobManager to have the two TaskManagers available
// since some of the CI environments are very hostile, we need to give this a lot of time (2 minutes)
waitUntilNumTaskManagersAreRegistered(jmActor, 1, 120000);
final Throwable[] errorRef = new Throwable[1];
// start the test program, which infinitely blocks
Runnable programRunner = new Runnable() {
@Override
public void run() {
try {
ExecutionEnvironment env = ExecutionEnvironment.createRemoteEnvironment("localhost", jobManagerPort);
env.setParallelism(2);
env.setRestartStrategy(RestartStrategies.noRestart());
env.getConfig().disableSysoutLogging();
env.generateSequence(0, Long.MAX_VALUE).map(new MapFunction<Long, Long>() {
@Override
public Long map(Long value) throws Exception {
synchronized (this) {
wait();
}
return 0L;
}
}).output(new DiscardingOutputFormat<Long>());
env.execute();
} catch (Throwable t) {
errorRef[0] = t;
}
}
};
Thread programThread = new Thread(programRunner);
// kill the TaskManager
taskManagerProcess.destroy();
taskManagerProcess = null;
// immediately submit the job. this should hit the case
// where the JobManager still thinks it has the TaskManager and tries to send it tasks
programThread.start();
// try to cancel the job
cancelRunningJob(jmActor);
// we should see a failure within reasonable time (10s is the ask timeout).
// since the CI environment is often slow, we conservatively give it up to 2 minutes,
// to fail, which is much lower than the failure time given by the heartbeats ( > 2000s)
programThread.join(120000);
assertFalse("The program did not cancel in time (2 minutes)", programThread.isAlive());
Throwable error = errorRef[0];
assertNotNull("The program did not fail properly", error);
assertTrue(error instanceof ProgramInvocationException);
// all seems well :-)
} catch (Exception e) {
e.printStackTrace();
printProcessLog("TaskManager", processOutput.toString());
fail(e.getMessage());
} catch (Error e) {
e.printStackTrace();
printProcessLog("TaskManager 1", processOutput.toString());
throw e;
} finally {
if (taskManagerProcess != null) {
taskManagerProcess.destroy();
}
if (jmActorSystem != null) {
jmActorSystem.shutdown();
}
}
}
use of akka.actor.ActorRef in project flink by apache.
the class ChaosMonkeyITCase method submitJobGraph.
// - Utilities ---------------------------------------------------------------------------------
private void submitJobGraph(JobGraph jobGraph, JobManagerProcess jobManager, TestingListener leaderListener, ActorSystem actorSystem, FiniteDuration timeout) throws Exception {
ActorRef jobManagerRef = jobManager.getActorRef(actorSystem, timeout);
UUID jobManagerLeaderId = leaderListener.getLeaderSessionID();
AkkaActorGateway jobManagerGateway = new AkkaActorGateway(jobManagerRef, jobManagerLeaderId);
jobManagerGateway.tell(new JobManagerMessages.SubmitJob(jobGraph, ListeningBehaviour.DETACHED));
}
use of akka.actor.ActorRef in project flink by apache.
the class ChaosMonkeyITCase method waitForJobRemoved.
private void waitForJobRemoved(JobID jobId, JobManagerProcess jobManager, ActorSystem actorSystem, FiniteDuration timeout) throws Exception {
ActorRef jobManagerRef = jobManager.getActorRef(actorSystem, timeout);
AkkaActorGateway jobManagerGateway = new AkkaActorGateway(jobManagerRef, null);
Future<Object> archiveFuture = jobManagerGateway.ask(JobManagerMessages.getRequestArchive(), timeout);
ActorRef archive = ((JobManagerMessages.ResponseArchive) Await.result(archiveFuture, timeout)).actor();
AkkaActorGateway archiveGateway = new AkkaActorGateway(archive, null);
Deadline deadline = timeout.fromNow();
while (deadline.hasTimeLeft()) {
JobManagerMessages.JobStatusResponse resp = JobManagerActorTestUtils.requestJobStatus(jobId, archiveGateway, deadline.timeLeft());
if (resp instanceof JobManagerMessages.JobNotFound) {
Thread.sleep(100);
} else {
return;
}
}
}
Aggregations