Use of akka.actor.ActorRef in project flink by apache.
The class JobManagerTest, method testCancelWithSavepointNoDirectoriesConfigured.
/**
* Tests that a meaningful exception is returned if no savepoint directory is
* configured.
*/
@Test
public void testCancelWithSavepointNoDirectoriesConfigured() throws Exception {
    FiniteDuration timeout = new FiniteDuration(30, TimeUnit.SECONDS);
    Configuration config = new Configuration();
    ActorSystem actorSystem = null;
    ActorGateway jobManager = null;
    ActorGateway archiver = null;
    ActorGateway taskManager = null;
    try {
        actorSystem = AkkaUtils.createLocalActorSystem(new Configuration());
        Tuple2<ActorRef, ActorRef> master = JobManager.startJobManagerActors(
            config,
            actorSystem,
            TestingUtils.defaultExecutor(),
            TestingUtils.defaultExecutor(),
            Option.apply("jm"),
            Option.apply("arch"),
            TestingJobManager.class,
            TestingMemoryArchivist.class);
        jobManager = new AkkaActorGateway(master._1(), null);
        archiver = new AkkaActorGateway(master._2(), null);

        ActorRef taskManagerRef = TaskManager.startTaskManagerComponentsAndActor(
            config,
            ResourceID.generate(),
            actorSystem,
            "localhost",
            Option.apply("tm"),
            Option.<LeaderRetrievalService>apply(new StandaloneLeaderRetrievalService(jobManager.path())),
            true,
            TestingTaskManager.class);
        taskManager = new AkkaActorGateway(taskManagerRef, null);

        // Wait until connected
        Object msg = new TestingTaskManagerMessages.NotifyWhenRegisteredAtJobManager(jobManager.actor());
        Await.ready(taskManager.ask(msg, timeout), timeout);

        // Create job graph
        JobVertex sourceVertex = new JobVertex("Source");
        sourceVertex.setInvokableClass(BlockingStatefulInvokable.class);
        sourceVertex.setParallelism(1);
        JobGraph jobGraph = new JobGraph("TestingJob", sourceVertex);
        JobSnapshottingSettings snapshottingSettings = new JobSnapshottingSettings(
            Collections.singletonList(sourceVertex.getID()),
            Collections.singletonList(sourceVertex.getID()),
            Collections.singletonList(sourceVertex.getID()),
            3600000,
            3600000,
            0,
            Integer.MAX_VALUE,
            ExternalizedCheckpointSettings.none(),
            null,
            true);
        jobGraph.setSnapshotSettings(snapshottingSettings);

        // Submit job graph
        msg = new JobManagerMessages.SubmitJob(jobGraph, ListeningBehaviour.DETACHED);
        Await.result(jobManager.ask(msg, timeout), timeout);

        // Wait for all tasks to be running
        msg = new TestingJobManagerMessages.WaitForAllVerticesToBeRunning(jobGraph.getJobID());
        Await.result(jobManager.ask(msg, timeout), timeout);

        // Cancel with savepoint
        msg = new JobManagerMessages.CancelJobWithSavepoint(jobGraph.getJobID(), null);
        CancellationResponse cancelResp = (CancellationResponse) Await.result(jobManager.ask(msg, timeout), timeout);
        if (cancelResp instanceof CancellationFailure) {
            CancellationFailure failure = (CancellationFailure) cancelResp;
            assertTrue(failure.cause() instanceof IllegalStateException);
            assertTrue(failure.cause().getMessage().contains("savepoint directory"));
        } else {
            fail("Unexpected cancellation response from JobManager: " + cancelResp);
        }
    } finally {
        if (actorSystem != null) {
            actorSystem.shutdown();
        }
        if (archiver != null) {
            archiver.actor().tell(PoisonPill.getInstance(), ActorRef.noSender());
        }
        if (jobManager != null) {
            jobManager.actor().tell(PoisonPill.getInstance(), ActorRef.noSender());
        }
        if (taskManager != null) {
            taskManager.actor().tell(PoisonPill.getInstance(), ActorRef.noSender());
        }
    }
}
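The failure asserted above comes from cancelling with a null target directory while no default savepoint directory is configured. For contrast, a minimal sketch of the positive setup, assuming the default directory is set through the "state.savepoints.dir" key; the key name and the expected CancellationSuccess reply are assumptions about this Flink version, not taken from the test above:

// Sketch: configure a cluster-wide default savepoint directory up front,
// so CancelJobWithSavepoint(jobId, null) has somewhere to write to.
Configuration config = new Configuration();
config.setString("state.savepoints.dir", "file:///tmp/flink-savepoints"); // assumed key name

// ... start the JobManager/TaskManager actors and submit the job as in the test above ...

// With a default directory in place, the JobManager is expected to reply with a
// success message carrying the savepoint path instead of a CancellationFailure.
Object response = Await.result(
    jobManager.ask(new JobManagerMessages.CancelJobWithSavepoint(jobGraph.getJobID(), null), timeout),
    timeout);
assertTrue(response instanceof JobManagerMessages.CancellationSuccess);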
Use of akka.actor.ActorRef in project flink by apache.
The class JobManagerHARecoveryTest, method testFailingJobRecovery.
/**
* Tests that a failing job recovery won't cause other job recoveries to fail.
*/
@Test
public void testFailingJobRecovery() throws Exception {
    final FiniteDuration timeout = new FiniteDuration(10, TimeUnit.SECONDS);
    final FiniteDuration jobRecoveryTimeout = new FiniteDuration(0, TimeUnit.SECONDS);
    Deadline deadline = new FiniteDuration(1, TimeUnit.MINUTES).fromNow();
    final Configuration flinkConfiguration = new Configuration();
    UUID leaderSessionID = UUID.randomUUID();
    ActorRef jobManager = null;
    JobID jobId1 = new JobID();
    JobID jobId2 = new JobID();

    // set HA mode to zookeeper so that we try to recover jobs
    flinkConfiguration.setString(HighAvailabilityOptions.HA_MODE, "zookeeper");

    try {
        final SubmittedJobGraphStore submittedJobGraphStore = mock(SubmittedJobGraphStore.class);
        SubmittedJobGraph submittedJobGraph = mock(SubmittedJobGraph.class);
        when(submittedJobGraph.getJobId()).thenReturn(jobId2);
        when(submittedJobGraphStore.getJobIds()).thenReturn(Arrays.asList(jobId1, jobId2));

        // fail the first job recovery
        when(submittedJobGraphStore.recoverJobGraph(eq(jobId1))).thenThrow(new Exception("Test exception"));
        // succeed the second job recovery
        when(submittedJobGraphStore.recoverJobGraph(eq(jobId2))).thenReturn(submittedJobGraph);

        final TestingLeaderElectionService myLeaderElectionService = new TestingLeaderElectionService();
        final Collection<JobID> recoveredJobs = new ArrayList<>(2);

        Props jobManagerProps = Props.create(
            TestingFailingHAJobManager.class,
            flinkConfiguration,
            TestingUtils.defaultExecutor(),
            TestingUtils.defaultExecutor(),
            mock(InstanceManager.class),
            mock(Scheduler.class),
            new BlobLibraryCacheManager(mock(BlobService.class), 1 << 20),
            ActorRef.noSender(),
            new FixedDelayRestartStrategy.FixedDelayRestartStrategyFactory(Int.MaxValue(), 100),
            timeout,
            myLeaderElectionService,
            submittedJobGraphStore,
            mock(CheckpointRecoveryFactory.class),
            jobRecoveryTimeout,
            Option.<MetricRegistry>apply(null),
            recoveredJobs).withDispatcher(CallingThreadDispatcher.Id());

        jobManager = system.actorOf(jobManagerProps);

        Future<Object> started = Patterns.ask(jobManager, new Identify(42), deadline.timeLeft().toMillis());
        Await.ready(started, deadline.timeLeft());

        // make the job manager the leader --> this triggers the recovery of all jobs
        myLeaderElectionService.isLeader(leaderSessionID);

        // check that we have successfully recovered the second job
        assertThat(recoveredJobs, containsInAnyOrder(jobId2));
    } finally {
        TestingUtils.stopActor(jobManager);
    }
}
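The Patterns.ask(jobManager, new Identify(42), ...) call above is a readiness check: Akka answers an Identify message with an ActorIdentity once the target actor exists, so awaiting the future guarantees the JobManager actor has started before leadership is granted. A minimal sketch of that idiom in isolation; someActor and the timeout value are illustrative placeholders, not names from the test above:

// Sketch of the Identify-based readiness check.
FiniteDuration askTimeout = new FiniteDuration(10, TimeUnit.SECONDS);
Future<Object> identified = Patterns.ask(someActor, new Identify(42), askTimeout.toMillis());
// The reply is an akka.actor.ActorIdentity; once it arrives, the actor is up.
Object reply = Await.result(identified, askTimeout);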
Use of akka.actor.ActorRef in project flink by apache.
The class JobManagerProcessReapingTest, method testReapProcessOnFailure.
@Test
public void testReapProcessOnFailure() {
    Process jmProcess = null;
    ActorSystem localSystem = null;
    final StringWriter processOutput = new StringWriter();

    try {
        String javaCommand = getJavaCommandPath();
        // check that we run this test only if the java command is available on this machine
        if (javaCommand == null) {
            System.out.println("---- Skipping JobManagerProcessReapingTest : Could not find java executable ----");
            return;
        }

        // create a logging file for the process
        File tempLogFile = File.createTempFile("testlogconfig", "properties");
        tempLogFile.deleteOnExit();
        CommonTestUtils.printLog4jDebugConfig(tempLogFile);

        // start a JobManager process
        // the log level must be at least INFO, otherwise the bound port cannot be retrieved
        String[] command = new String[] {
            javaCommand,
            "-Dlog.level=DEBUG",
            "-Dlog4j.configuration=file:" + tempLogFile.getAbsolutePath(),
            "-Xms256m",
            "-Xmx256m",
            "-classpath", getCurrentClasspath(),
            JobManagerTestEntryPoint.class.getName() };

        // spawn the process and collect its output
        ProcessBuilder bld = new ProcessBuilder(command);
        jmProcess = bld.start();
        new PipeForwarder(jmProcess.getErrorStream(), processOutput);
        // start another actor system so we can send something to the JobManager
        Tuple2<String, Object> localAddress = new Tuple2<String, Object>("localhost", 0);
        localSystem = AkkaUtils.createActorSystem(new Configuration(), new Some<Tuple2<String, Object>>(localAddress));

        // grab the reference to the JobManager. try multiple times, until the process
        // is started and the JobManager is up
        ActorRef jobManagerRef = null;
        Throwable lastError = null;

        // Log message on JobManager must be: Starting JobManager at ...://flink@...:port/..."
        // otherwise, the pattern does not match and, thus, cannot retrieve the bound port
        String pattern = "Starting JobManager at [^:]*://flink@[^:]*:(\\d*)/";
        Pattern r = Pattern.compile(pattern);
        int jobManagerPort = -1;
        for (int i = 0; i < 40; i++) {
            Matcher m = r.matcher(processOutput.toString());
            if (m.find()) {
                jobManagerPort = Integer.parseInt(m.group(1));
                break;
            }
            Thread.sleep(500);
        }

        if (jobManagerPort != -1) {
            try {
                jobManagerRef = JobManager.getJobManagerActorRef(
                    "akka.tcp",
                    NetUtils.unresolvedHostAndPortToNormalizedString("localhost", jobManagerPort),
                    localSystem,
                    new FiniteDuration(25, TimeUnit.SECONDS));
            } catch (Throwable t) {
                // job manager probably not ready yet
                lastError = t;
            }
        } else {
            fail("Could not determine port of started JobManager.");
        }

        assertTrue("JobManager process died", isProcessAlive(jmProcess));

        if (jobManagerRef == null) {
            if (lastError != null) {
                lastError.printStackTrace();
            }
            fail("JobManager process did not launch the JobManager properly. Failed to look up JobManager actor at"
                + " localhost:" + jobManagerPort);
        }

        // kill the JobManager actor
        jobManagerRef.tell(PoisonPill.getInstance(), ActorRef.noSender());

        // wait for max 5 seconds for the process to terminate
        {
            long now = System.currentTimeMillis();
            long deadline = now + 5000;
            while (now < deadline && isProcessAlive(jmProcess)) {
                Thread.sleep(100);
                now = System.currentTimeMillis();
            }
        }

        assertFalse("JobManager process did not terminate upon actor death", isProcessAlive(jmProcess));

        int returnCode = jmProcess.exitValue();
        assertEquals("JobManager died, but not because of the process reaper",
            JobManager.RUNTIME_FAILURE_RETURN_CODE(), returnCode);
    } catch (Exception e) {
        e.printStackTrace();
        printProcessLog(processOutput.toString());
        fail(e.getMessage());
    } catch (Error e) {
        e.printStackTrace();
        printProcessLog(processOutput.toString());
        throw e;
    } finally {
        if (jmProcess != null) {
            jmProcess.destroy();
        }
        if (localSystem != null) {
            localSystem.shutdown();
        }
    }
}
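The isProcessAlive helper referenced above is not part of this snippet. On Java 7, before Process.isAlive() existed, a common way to implement it is to probe exitValue(), which throws while the process is still running; a sketch under that assumption:

private static boolean isProcessAlive(Process process) {
    if (process == null) {
        return false;
    }
    try {
        // exitValue() throws IllegalThreadStateException while the process is still running
        process.exitValue();
        return false;
    } catch (IllegalThreadStateException ignored) {
        return true;
    }
}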
Use of akka.actor.ActorRef in project flink by apache.
The class AkkaKvStateLocationLookupServiceTest, method testRetryOnUnknownJobManager.
/**
* Tests that lookups are retried when no leader notification is available.
*/
@Test
public void testRetryOnUnknownJobManager() throws Exception {
    final Queue<LookupRetryStrategy> retryStrategies = new LinkedBlockingQueue<>();

    LookupRetryStrategyFactory retryStrategy = new LookupRetryStrategyFactory() {

        @Override
        public LookupRetryStrategy createRetryStrategy() {
            return retryStrategies.poll();
        }
    };

    final TestingLeaderRetrievalService leaderRetrievalService = new TestingLeaderRetrievalService();

    AkkaKvStateLocationLookupService lookupService = new AkkaKvStateLocationLookupService(
        leaderRetrievalService,
        testActorSystem,
        TIMEOUT,
        retryStrategy);

    lookupService.start();

    //
    // Test call to retry
    //
    final AtomicBoolean hasRetried = new AtomicBoolean();
    retryStrategies.add(new LookupRetryStrategy() {

        @Override
        public FiniteDuration getRetryDelay() {
            return FiniteDuration.Zero();
        }

        @Override
        public boolean tryRetry() {
            if (hasRetried.compareAndSet(false, true)) {
                return true;
            }
            return false;
        }
    });

    Future<KvStateLocation> locationFuture = lookupService.getKvStateLookupInfo(new JobID(), "yessir");
    Await.ready(locationFuture, TIMEOUT);
    assertTrue("Did not retry ", hasRetried.get());

    //
    // Test leader notification after retry
    //
    Queue<LookupKvStateLocation> received = new LinkedBlockingQueue<>();
    KvStateLocation expected = new KvStateLocation(new JobID(), new JobVertexID(), 12122, "garlic");
    ActorRef testActor = LookupResponseActor.create(received, null, expected);
    final String testActorAddress = AkkaUtils.getAkkaURL(testActorSystem, testActor);

    retryStrategies.add(new LookupRetryStrategy() {

        @Override
        public FiniteDuration getRetryDelay() {
            return FiniteDuration.apply(100, TimeUnit.MILLISECONDS);
        }

        @Override
        public boolean tryRetry() {
            leaderRetrievalService.notifyListener(testActorAddress, null);
            return true;
        }
    });

    KvStateLocation location = Await.result(lookupService.getKvStateLookupInfo(new JobID(), "yessir"), TIMEOUT);
    assertEquals(expected, location);
}
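The anonymous classes above show the shape of LookupRetryStrategy as the lookup service uses it: getRetryDelay() supplies the backoff and tryRetry() decides whether another attempt is allowed. A reusable bounded variant could look roughly like this; the class and field names are illustrative and based only on the interface usage shown above:

// Sketch: retry with a fixed delay until a retry budget is exhausted.
class FixedDelayLookupRetryStrategy implements LookupRetryStrategy {

    private final FiniteDuration delay;
    private final AtomicInteger remainingRetries; // java.util.concurrent.atomic.AtomicInteger

    FixedDelayLookupRetryStrategy(FiniteDuration delay, int maxRetries) {
        this.delay = delay;
        this.remainingRetries = new AtomicInteger(maxRetries);
    }

    @Override
    public FiniteDuration getRetryDelay() {
        return delay;
    }

    @Override
    public boolean tryRetry() {
        // Allow another lookup attempt until the budget runs out.
        return remainingRetries.getAndDecrement() > 0;
    }
}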
Use of akka.actor.ActorRef in project flink by apache.
The class JobManagerHACheckpointRecoveryITCase, method testCheckpointRecoveryFailure.
/**
* Tests that the JobManager logs failures during recovery properly.
*
* @see <a href="https://issues.apache.org/jira/browse/FLINK-3185">FLINK-3185</a>
*/
@Test
@RetryOnFailure(times = 1)
public void testCheckpointRecoveryFailure() throws Exception {
    final Deadline testDeadline = TestTimeOut.fromNow();
    final String zooKeeperQuorum = ZooKeeper.getConnectString();
    final String fileStateBackendPath = FileStateBackendBasePath.getAbsoluteFile().toString();

    Configuration config = ZooKeeperTestUtils.createZooKeeperHAConfig(zooKeeperQuorum, fileStateBackendPath);
    config.setInteger(ConfigConstants.LOCAL_NUMBER_JOB_MANAGER, 2);

    JobManagerProcess[] jobManagerProcess = new JobManagerProcess[2];
    LeaderRetrievalService leaderRetrievalService = null;
    ActorSystem taskManagerSystem = null;
    ActorSystem testActorSystem = null;

    try {
        // Test actor system
        testActorSystem = AkkaUtils.createActorSystem(
            new Configuration(),
            new Some<>(new Tuple2<String, Object>("localhost", 0)));

        // The job managers
        jobManagerProcess[0] = new JobManagerProcess(0, config);
        jobManagerProcess[1] = new JobManagerProcess(1, config);
        jobManagerProcess[0].startProcess();
        jobManagerProcess[1].startProcess();

        // Leader listener
        TestingListener leaderListener = new TestingListener();
        leaderRetrievalService = ZooKeeperUtils.createLeaderRetrievalService(config);
        leaderRetrievalService.start(leaderListener);

        // The task manager
        taskManagerSystem = AkkaUtils.createActorSystem(config, Option.apply(new Tuple2<String, Object>("localhost", 0)));
        TaskManager.startTaskManagerComponentsAndActor(
            config,
            ResourceID.generate(),
            taskManagerSystem,
            "localhost",
            Option.<String>empty(),
            Option.<LeaderRetrievalService>empty(),
            false,
            TaskManager.class);

        // Get the leader
        leaderListener.waitForNewLeader(testDeadline.timeLeft().toMillis());
        String leaderAddress = leaderListener.getAddress();
        UUID leaderId = leaderListener.getLeaderSessionID();

        // Get the leader ref
        ActorRef leaderRef = AkkaUtils.getActorRef(leaderAddress, testActorSystem, testDeadline.timeLeft());
        ActorGateway leader = new AkkaActorGateway(leaderRef, leaderId);

        // Who's the boss?
        JobManagerProcess leadingJobManagerProcess;
        JobManagerProcess nonLeadingJobManagerProcess;
        if (jobManagerProcess[0].getJobManagerAkkaURL(testDeadline.timeLeft()).equals(leaderListener.getAddress())) {
            leadingJobManagerProcess = jobManagerProcess[0];
            nonLeadingJobManagerProcess = jobManagerProcess[1];
        } else {
            leadingJobManagerProcess = jobManagerProcess[1];
            nonLeadingJobManagerProcess = jobManagerProcess[0];
        }

        // Blocking JobGraph
        JobVertex blockingVertex = new JobVertex("Blocking vertex");
        blockingVertex.setInvokableClass(BlockingNoOpInvokable.class);
        JobGraph jobGraph = new JobGraph(blockingVertex);

        // Submit the job in detached mode
        leader.tell(new SubmitJob(jobGraph, ListeningBehaviour.DETACHED));

        // Wait for the job to be running
        JobManagerActorTestUtils.waitForJobStatus(jobGraph.getJobID(), JobStatus.RUNNING, leader, testDeadline.timeLeft());

        // Remove all files
        FileUtils.deleteDirectory(FileStateBackendBasePath);

        // Kill the leader
        leadingJobManagerProcess.destroy();

        // Verify that the job manager logs the failed recovery. We can not
        // do more at this point. :(
        boolean success = false;
        while (testDeadline.hasTimeLeft()) {
            String output = nonLeadingJobManagerProcess.getProcessOutput();
            if (output != null) {
                if (output.contains("Failed to recover job") && output.contains("java.io.FileNotFoundException")) {
                    success = true;
                    break;
                }
            } else {
                log.warn("No process output available.");
            }
            Thread.sleep(500);
        }

        assertTrue("Did not find expected output in logs.", success);
    } catch (Throwable t) {
        // Print early (in some situations the process logs get too big
        // for Travis and the root problem is not shown)
        t.printStackTrace();

        // In case of an error, print the job manager process logs.
        if (jobManagerProcess[0] != null) {
            jobManagerProcess[0].printProcessLog();
        }
        if (jobManagerProcess[1] != null) {
            jobManagerProcess[1].printProcessLog();
        }

        throw t;
    } finally {
        if (jobManagerProcess[0] != null) {
            jobManagerProcess[0].destroy();
        }
        if (jobManagerProcess[1] != null) {
            jobManagerProcess[1].destroy();
        }
        if (leaderRetrievalService != null) {
            leaderRetrievalService.stop();
        }
        if (taskManagerSystem != null) {
            taskManagerSystem.shutdown();
        }
        if (testActorSystem != null) {
            testActorSystem.shutdown();
        }
    }
}
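ZooKeeperTestUtils.createZooKeeperHAConfig(zooKeeperQuorum, fileStateBackendPath) is not shown above. Roughly, it assembles a ZooKeeper HA configuration along the following lines; the string keys are assumptions about this Flink version, and the helper also configures a filesystem state backend, which is omitted from this sketch:

// Sketch of a ZooKeeper HA configuration (keys assumed, not taken from the test above).
Configuration haConfig = new Configuration();
haConfig.setString("high-availability", "zookeeper");
haConfig.setString("high-availability.zookeeper.quorum", zooKeeperQuorum);
// Directory where job graphs and checkpoint metadata are persisted for recovery
haConfig.setString("high-availability.storageDir", fileStateBackendPath);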