use of org.apache.flink.runtime.instance.AkkaActorGateway in project flink by apache.
the class AccumulatorLiveITCase method verifyResults.
/**
 * Submits the job to the JobManager and verifies every accumulator update that is reported back.
 *
 * <p>The test kit actor registers for accumulator change notifications, submits the job and then
 * expects, in order: one initial update with value 0, {@code NUM_ITERATIONS} updates while the
 * mapper runs (the expected value grows by the round number each round), one update when the
 * sink starts and {@code NUM_ITERATIONS} further updates during which the expected value stays
 * constant. Finally a {@code JobResultSuccess} message must arrive.
 */
private static void verifyResults() {
    new JavaTestKit(system) {
        {
            ActorGateway selfGateway = new AkkaActorGateway(getRef(), jobManagerGateway.leaderSessionID());

            // register for accumulator changes
            jobManagerGateway.tell(new TestingJobManagerMessages.NotifyWhenAccumulatorChange(jobID), selfGateway);
            expectMsgEquals(TIMEOUT, true);

            // submit job
            jobManagerGateway.tell(new JobManagerMessages.SubmitJob(jobGraph, ListeningBehaviour.EXECUTION_RESULT), selfGateway);
            expectMsgClass(TIMEOUT, JobManagerMessages.JobSubmitSuccess.class);

            TestingJobManagerMessages.UpdatedAccumulators msg = (TestingJobManagerMessages.UpdatedAccumulators) receiveOne(TIMEOUT);
            Map<String, Accumulator<?, ?>> userAccumulators = msg.userAccumulators();

            /* Check for accumulator values */
            if (checkUserAccumulators(0, userAccumulators)) {
                LOG.info("Passed initial check for map task.");
            } else {
                fail("Wrong accumulator results when map task begins execution.");
            }

            int expectedAccVal = 0;

            /* for mapper task */
            for (int i = 1; i <= NUM_ITERATIONS; i++) {
                // after round i the accumulator is expected to hold 1 + 2 + ... + i
                expectedAccVal += i;

                // receive message
                msg = (TestingJobManagerMessages.UpdatedAccumulators) receiveOne(TIMEOUT);
                userAccumulators = msg.userAccumulators();
                LOG.info("{}", userAccumulators);

                // A second branch used to repeat this exact condition in order to swap two
                // task ids; it could never be reached and the ids were never assigned, so
                // both the branch and the ids have been removed.
                if (checkUserAccumulators(expectedAccVal, userAccumulators)) {
                    LOG.info("Passed round #" + i);
                } else {
                    fail("Failed in round #" + i);
                }
            }

            msg = (TestingJobManagerMessages.UpdatedAccumulators) receiveOne(TIMEOUT);
            userAccumulators = msg.userAccumulators();

            if (checkUserAccumulators(expectedAccVal, userAccumulators)) {
                LOG.info("Passed initial check for sink task.");
            } else {
                fail("Wrong accumulator results when sink task begins execution.");
            }

            /* for sink task */
            for (int i = 1; i <= NUM_ITERATIONS; i++) {
                // receive message
                msg = (TestingJobManagerMessages.UpdatedAccumulators) receiveOne(TIMEOUT);
                userAccumulators = msg.userAccumulators();
                LOG.info("{}", userAccumulators);

                // the expected value is not changed in this loop
                if (checkUserAccumulators(expectedAccVal, userAccumulators)) {
                    LOG.info("Passed round #" + i);
                } else {
                    fail("Failed in round #" + i);
                }
            }

            expectMsgClass(TIMEOUT, JobManagerMessages.JobResultSuccess.class);
        }
    };
}
use of org.apache.flink.runtime.instance.AkkaActorGateway in project flink by apache.
the class UtilsTest method testYarnFlinkResourceManagerJobManagerLostLeadership.
/**
 * Checks that the YARN resource manager registers again and keeps track of all started
 * containers after the JobManager leader is revoked and then announced again.
 *
 * <p>The test kit actor sits behind a forwarding actor that plays the leading JobManager. The
 * AMRM and NM clients are mocks: each container request is answered with one allocated
 * container, and each container start is answered with a {@code NotifyResourceStarted}
 * message sent to the resource manager.
 */
@Test
public void testYarnFlinkResourceManagerJobManagerLostLeadership() throws Exception {
    new JavaTestKit(system) {
        {
            final Deadline deadline = new FiniteDuration(3, TimeUnit.MINUTES).fromNow();

            Configuration flinkConfig = new Configuration();
            YarnConfiguration yarnConfig = new YarnConfiguration();
            TestingLeaderRetrievalService leaderRetrievalService = new TestingLeaderRetrievalService();
            String applicationMasterHostName = "localhost";
            String webInterfaceURL = "foobar";
            ContaineredTaskManagerParameters taskManagerParameters = new ContaineredTaskManagerParameters(1L, 1L, 1L, 1, new HashMap<String, String>());
            ContainerLaunchContext taskManagerLaunchContext = mock(ContainerLaunchContext.class);
            int yarnHeartbeatIntervalMillis = 1000;
            int maxFailedContainers = 10;
            int numInitialTaskManagers = 5;
            final YarnResourceManagerCallbackHandler callbackHandler = new YarnResourceManagerCallbackHandler();
            AMRMClientAsync<AMRMClient.ContainerRequest> resourceManagerClient = mock(AMRMClientAsync.class);
            NMClient nodeManagerClient = mock(NMClient.class);
            UUID leaderSessionID = UUID.randomUUID();

            final List<Container> containerList = new ArrayList<>();
            for (int i = 0; i < numInitialTaskManagers; i++) {
                containerList.add(new TestingContainer("container_" + i, "localhost"));
            }

            // Every container request hands out the next prepared container through the
            // callback handler until the list is exhausted.
            doAnswer(new Answer<Object>() {
                int counter = 0;

                @Override
                public Object answer(InvocationOnMock invocation) throws Throwable {
                    if (counter < containerList.size()) {
                        callbackHandler.onContainersAllocated(Collections.singletonList(containerList.get(counter++)));
                    }
                    return null;
                }
            }).when(resourceManagerClient).addContainerRequest(Matchers.any(AMRMClient.ContainerRequest.class));

            ActorRef resourceManager = null;
            ActorRef leader1 = null;
            try {
                leader1 = system.actorOf(Props.create(TestingUtils.ForwardingActor.class, getRef(), Option.apply(leaderSessionID)));
                resourceManager = system.actorOf(Props.create(TestingYarnFlinkResourceManager.class, flinkConfig, yarnConfig, leaderRetrievalService, applicationMasterHostName, webInterfaceURL, taskManagerParameters, taskManagerLaunchContext, yarnHeartbeatIntervalMillis, maxFailedContainers, numInitialTaskManagers, callbackHandler, resourceManagerClient, nodeManagerClient));

                leaderRetrievalService.notifyListener(leader1.path().toString(), leaderSessionID);

                final AkkaActorGateway leader1Gateway = new AkkaActorGateway(leader1, leaderSessionID);
                final AkkaActorGateway resourceManagerGateway = new AkkaActorGateway(resourceManager, leaderSessionID);

                // Starting a container reports the resource as started to the resource manager,
                // with the leader gateway as sender.
                doAnswer(new Answer<Object>() {
                    @Override
                    public Object answer(InvocationOnMock invocation) throws Throwable {
                        Container container = (Container) invocation.getArguments()[0];
                        resourceManagerGateway.tell(new NotifyResourceStarted(YarnFlinkResourceManager.extractResourceID(container)), leader1Gateway);
                        return null;
                    }
                }).when(nodeManagerClient).startContainer(Matchers.any(Container.class), Matchers.any(ContainerLaunchContext.class));

                // first registration with the leader
                expectMsgClass(deadline.timeLeft(), RegisterResourceManager.class);
                resourceManagerGateway.tell(new RegisterResourceManagerSuccessful(leader1, Collections.EMPTY_LIST));

                // one acknowledgement per started container
                for (int i = 0; i < containerList.size(); i++) {
                    expectMsgClass(deadline.timeLeft(), Acknowledge.class);
                }

                Future<Object> taskManagerRegisteredFuture = resourceManagerGateway.ask(new NotifyWhenResourcesRegistered(numInitialTaskManagers), deadline.timeLeft());
                Await.ready(taskManagerRegisteredFuture, deadline.timeLeft());

                // revoke the leader and announce the same leader again
                leaderRetrievalService.notifyListener(null, null);
                leaderRetrievalService.notifyListener(leader1.path().toString(), leaderSessionID);

                // the resource manager has to register a second time
                expectMsgClass(deadline.timeLeft(), RegisterResourceManager.class);
                resourceManagerGateway.tell(new RegisterResourceManagerSuccessful(leader1, Collections.EMPTY_LIST));

                // report every container as started once more
                for (Container container : containerList) {
                    resourceManagerGateway.tell(new NotifyResourceStarted(YarnFlinkResourceManager.extractResourceID(container)), leader1Gateway);
                }
                for (int i = 0; i < containerList.size(); i++) {
                    expectMsgClass(deadline.timeLeft(), Acknowledge.class);
                }

                Future<Object> numberOfRegisteredResourcesFuture = resourceManagerGateway.ask(RequestNumberOfRegisteredResources.Instance, deadline.timeLeft());
                int numberOfRegisteredResources = (Integer) Await.result(numberOfRegisteredResourcesFuture, deadline.timeLeft());

                assertEquals(numInitialTaskManagers, numberOfRegisteredResources);
            } finally {
                if (resourceManager != null) {
                    resourceManager.tell(PoisonPill.getInstance(), ActorRef.noSender());
                }
                // The forwarding actor lives on the shared actor system as well; stop it so
                // it does not outlive this test.
                if (leader1 != null) {
                    leader1.tell(PoisonPill.getInstance(), ActorRef.noSender());
                }
            }
        }
    };
}
use of org.apache.flink.runtime.instance.AkkaActorGateway in project flink by apache.
the class JobManagerHACheckpointRecoveryITCase method testCheckpointRecoveryFailure.
/**
 * Tests that the JobManager logs failures during recovery properly.
 *
 * <p>A blocking job is submitted to the leading JobManager, the state backend directory is
 * deleted and the leading process is destroyed. The output of the other JobManager process is
 * then polled until it contains both "Failed to recover job" and
 * "java.io.FileNotFoundException", or until the test deadline has passed.
 *
 * @see <a href="https://issues.apache.org/jira/browse/FLINK-3185">FLINK-3185</a>
 */
@Test
@RetryOnFailure(times = 1)
public void testCheckpointRecoveryFailure() throws Exception {
    final Deadline testDeadline = TestTimeOut.fromNow();
    final String zooKeeperQuorum = ZooKeeper.getConnectString();
    final String fileStateBackendPath = FileStateBackendBasePath.getAbsoluteFile().toString();

    Configuration haConfig = ZooKeeperTestUtils.createZooKeeperHAConfig(zooKeeperQuorum, fileStateBackendPath);
    haConfig.setInteger(ConfigConstants.LOCAL_NUMBER_JOB_MANAGER, 2);

    JobManagerProcess[] jobManagers = new JobManagerProcess[2];
    LeaderRetrievalService leaderRetrieval = null;
    ActorSystem taskManagerSystem = null;
    ActorSystem testActorSystem = null;

    try {
        // Actor system used by the test itself
        testActorSystem = AkkaUtils.createActorSystem(new Configuration(), new Some<>(new Tuple2<String, Object>("localhost", 0)));

        // Two JobManager processes: create both, then start both
        jobManagers[0] = new JobManagerProcess(0, haConfig);
        jobManagers[1] = new JobManagerProcess(1, haConfig);
        jobManagers[0].startProcess();
        jobManagers[1].startProcess();

        // Listen for leader announcements
        TestingListener leaderListener = new TestingListener();
        leaderRetrieval = ZooKeeperUtils.createLeaderRetrievalService(haConfig);
        leaderRetrieval.start(leaderListener);

        // A single TaskManager in its own actor system
        taskManagerSystem = AkkaUtils.createActorSystem(haConfig, Option.apply(new Tuple2<String, Object>("localhost", 0)));
        TaskManager.startTaskManagerComponentsAndActor(haConfig, ResourceID.generate(), taskManagerSystem, "localhost", Option.<String>empty(), Option.<LeaderRetrievalService>empty(), false, TaskManager.class);

        // Resolve the current leader and wrap it in a gateway
        leaderListener.waitForNewLeader(testDeadline.timeLeft().toMillis());
        String leaderAddress = leaderListener.getAddress();
        UUID leaderId = leaderListener.getLeaderSessionID();
        ActorRef leaderRef = AkkaUtils.getActorRef(leaderAddress, testActorSystem, testDeadline.timeLeft());
        ActorGateway leader = new AkkaActorGateway(leaderRef, leaderId);

        // Work out which of the two processes is the leader and which is the standby
        final boolean firstIsLeader = jobManagers[0].getJobManagerAkkaURL(testDeadline.timeLeft()).equals(leaderListener.getAddress());
        final JobManagerProcess leadingJobManager = firstIsLeader ? jobManagers[0] : jobManagers[1];
        final JobManagerProcess standbyJobManager = firstIsLeader ? jobManagers[1] : jobManagers[0];

        // Job graph with a single blocking vertex
        JobVertex blockingVertex = new JobVertex("Blocking vertex");
        blockingVertex.setInvokableClass(BlockingNoOpInvokable.class);
        JobGraph blockingJob = new JobGraph(blockingVertex);

        // Submit detached and wait until the job is running
        leader.tell(new SubmitJob(blockingJob, ListeningBehaviour.DETACHED));
        JobManagerActorTestUtils.waitForJobStatus(blockingJob.getJobID(), JobStatus.RUNNING, leader, testDeadline.timeLeft());

        // Delete the state backend directory, then take the leader down
        FileUtils.deleteDirectory(FileStateBackendBasePath);
        leadingJobManager.destroy();

        // All that can be verified here is that the standby logs the failed recovery.
        boolean foundExpectedOutput = false;
        while (testDeadline.hasTimeLeft()) {
            String processOutput = standbyJobManager.getProcessOutput();
            if (processOutput == null) {
                log.warn("No process output available.");
            } else if (processOutput.contains("Failed to recover job") && processOutput.contains("java.io.FileNotFoundException")) {
                foundExpectedOutput = true;
                break;
            }
            Thread.sleep(500);
        }

        assertTrue("Did not find expected output in logs.", foundExpectedOutput);
    } catch (Throwable t) {
        // Print the failure first: the process logs can get very large and would
        // otherwise hide the root problem.
        t.printStackTrace();

        // Then dump the logs of every JobManager process that was created.
        for (JobManagerProcess process : jobManagers) {
            if (process != null) {
                process.printProcessLog();
            }
        }

        throw t;
    } finally {
        for (JobManagerProcess process : jobManagers) {
            if (process != null) {
                process.destroy();
            }
        }

        if (leaderRetrieval != null) {
            leaderRetrieval.stop();
        }

        if (taskManagerSystem != null) {
            taskManagerSystem.shutdown();
        }

        if (testActorSystem != null) {
            testActorSystem.shutdown();
        }
    }
}
use of org.apache.flink.runtime.instance.AkkaActorGateway in project flink by apache.
the class JobManagerHAJobGraphRecoveryITCase method testClientNonDetachedListeningBehaviour.
/**
 * Tests that clients receive updates after recovery by a new leader.
 *
 * <p>A recording client submits a blocking job in non-detached mode to the first leader. The
 * leading JobManager process is then destroyed, the job is awaited on the new leader and
 * cancelled there. Afterwards the messages recorded by the client are inspected.
 */
@Test
public void testClientNonDetachedListeningBehaviour() throws Exception {
    Configuration haConfig = ZooKeeperTestUtils.createZooKeeperHAConfig(ZooKeeper.getConnectString(), FileStateBackendBasePath.getPath());

    // Actor system used by the test itself
    ActorSystem testSystem = null;

    // The JobManagers run as separate processes so that the actors' postStop, which cleans
    // up all running jobs, is not executed.
    JobManagerProcess[] jobManagers = new JobManagerProcess[2];
    LeaderRetrievalService leaderRetrieval = null;
    ActorSystem taskManagerSystem = null;

    try {
        final Deadline deadline = TestTimeOut.fromNow();

        testSystem = AkkaUtils.createActorSystem(new Configuration(), new Some<>(new Tuple2<String, Object>("localhost", 0)));

        // Create both JobManager processes, then start both
        jobManagers[0] = new JobManagerProcess(0, haConfig);
        jobManagers[1] = new JobManagerProcess(1, haConfig);
        jobManagers[0].startProcess();
        jobManagers[1].startProcess();

        // Listen for leader announcements
        TestingListener leaderListener = new TestingListener();
        leaderRetrieval = ZooKeeperUtils.createLeaderRetrievalService(haConfig);
        leaderRetrieval.start(leaderListener);

        // A single TaskManager in its own actor system
        taskManagerSystem = AkkaUtils.createActorSystem(AkkaUtils.getDefaultAkkaConfig());
        TaskManager.startTaskManagerComponentsAndActor(haConfig, ResourceID.generate(), taskManagerSystem, "localhost", Option.<String>empty(), Option.<LeaderRetrievalService>empty(), false, TaskManager.class);

        // Test actor that records the messages sent to the client
        TestActorRef<RecordingTestClient> clientRef = TestActorRef.create(testSystem, Props.create(RecordingTestClient.class));

        JobGraph jobGraph = createBlockingJobGraph();

        // ---- Initial submission to the first leader ----
        leaderListener.waitForNewLeader(deadline.timeLeft().toMillis());
        String initialLeaderAddress = leaderListener.getAddress();
        UUID initialLeaderId = leaderListener.getLeaderSessionID();

        AkkaActorGateway client = new AkkaActorGateway(clientRef, initialLeaderId);

        ActorRef initialLeaderRef = AkkaUtils.getActorRef(initialLeaderAddress, testSystem, deadline.timeLeft());
        ActorGateway initialLeader = new AkkaActorGateway(initialLeaderRef, initialLeaderId);

        // Keep asking until the leader reports a non-zero number of slots
        int totalSlots;
        do {
            Future<?> slotsFuture = initialLeader.ask(JobManagerMessages.getRequestTotalNumberOfSlots(), deadline.timeLeft());
            totalSlots = (Integer) Await.result(slotsFuture, deadline.timeLeft());
        } while (totalSlots == 0);

        // Non-detached submission: the client is the sender of the message
        initialLeader.tell(new SubmitJob(jobGraph, ListeningBehaviour.EXECUTION_RESULT_AND_STATE_CHANGES), client);
        JobManagerActorTestUtils.waitForJobStatus(jobGraph.getJobID(), JobStatus.RUNNING, initialLeader, deadline.timeLeft());

        // ---- Take down whichever process is currently leading ----
        final boolean firstIsLeader = jobManagers[0].getJobManagerAkkaURL(deadline.timeLeft()).equals(leaderListener.getAddress());
        final JobManagerProcess leadingJobManager = firstIsLeader ? jobManagers[0] : jobManagers[1];
        leadingJobManager.destroy();

        // ---- Recovery by the standby JobManager ----
        leaderListener.waitForNewLeader(deadline.timeLeft().toMillis());
        String recoveredLeaderAddress = leaderListener.getAddress();
        UUID recoveredLeaderId = leaderListener.getLeaderSessionID();

        ActorRef recoveredLeaderRef = AkkaUtils.getActorRef(recoveredLeaderAddress, testSystem, deadline.timeLeft());
        ActorGateway recoveredLeader = new AkkaActorGateway(recoveredLeaderRef, recoveredLeaderId);

        JobManagerActorTestUtils.waitForJobStatus(jobGraph.getJobID(), JobStatus.RUNNING, recoveredLeader, deadline.timeLeft());

        // Cancel the job on the new leader
        recoveredLeader.tell(new JobManagerMessages.CancelJob(jobGraph.getJobID()));

        // Block until the client has received the execution result
        clientRef.underlyingActor().awaitJobResult(deadline.timeLeft().toMillis());

        int submitAcks = 0;
        for (Object recorded : clientRef.underlyingActor().getMessages()) {
            if (recorded instanceof JobManagerMessages.JobSubmitSuccess) {
                submitAcks++;
            }
        }

        // The initial submission and the recovery should each be ack-ed.
        // NOTE(review): the assertion demands exactly two acks; confirm whether
        // "at least two" is the intended contract, since these messages could be
        // overtaken by the final message.
        assertEquals(2, submitAcks);
    } catch (Throwable t) {
        // Print the failure first: the process logs can get very large and would
        // otherwise hide the root problem.
        t.printStackTrace();

        // Then dump the logs of every JobManager process that was created.
        for (JobManagerProcess process : jobManagers) {
            if (process != null) {
                process.printProcessLog();
            }
        }

        throw t;
    } finally {
        for (JobManagerProcess process : jobManagers) {
            if (process != null) {
                process.destroy();
            }
        }

        if (leaderRetrieval != null) {
            leaderRetrieval.stop();
        }

        if (taskManagerSystem != null) {
            taskManagerSystem.shutdown();
        }

        if (testSystem != null) {
            testSystem.shutdown();
        }
    }
}
use of org.apache.flink.runtime.instance.AkkaActorGateway in project flink by apache.
the class JobManagerHAProcessFailureBatchRecoveryITCase method testJobManagerProcessFailure.
/**
 * Runs a program while the leading JobManager process is destroyed and replaced by a second
 * one, and verifies that the program still finishes without an error.
 *
 * <p>Coordination with the program's tasks goes through marker files in a temporary
 * directory: the test waits for the "ready" markers, destroys the first JobManager, starts
 * the second, creates the "proceed" marker and finally waits for the "finish" marker.
 */
@Test
public void testJobManagerProcessFailure() throws Exception {
    // Config
    final int numberOfJobManagers = 2;
    final int numberOfTaskManagers = 2;
    final int numberOfSlotsPerTaskManager = 2;

    assertEquals(PARALLELISM, numberOfTaskManagers * numberOfSlotsPerTaskManager);

    // Setup
    // Test actor system; starts as null so the finally block can tell whether it was created
    ActorSystem testActorSystem = null;

    // Job managers
    final JobManagerProcess[] jmProcess = new JobManagerProcess[numberOfJobManagers];

    // Task managers
    final ActorSystem[] tmActorSystem = new ActorSystem[numberOfTaskManagers];

    // Leader election service
    LeaderRetrievalService leaderRetrievalService = null;

    // Coordination between the processes goes through a directory
    File coordinateTempDir = null;

    try {
        final Deadline deadline = TestTimeOut.fromNow();

        // Coordination directory
        coordinateTempDir = createTempDirectory();

        // Job Managers
        Configuration config = ZooKeeperTestUtils.createZooKeeperHAConfig(ZooKeeper.getConnectString(), FileStateBackendBasePath.getPath());

        // Start first process
        jmProcess[0] = new JobManagerProcess(0, config);
        jmProcess[0].startProcess();

        // Task manager configuration
        config.setInteger(ConfigConstants.TASK_MANAGER_MEMORY_SIZE_KEY, 4);
        config.setInteger(ConfigConstants.TASK_MANAGER_NETWORK_NUM_BUFFERS_KEY, 100);
        config.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, 2);

        // Start the task managers, each in its own actor system
        for (int i = 0; i < numberOfTaskManagers; i++) {
            tmActorSystem[i] = AkkaUtils.createActorSystem(AkkaUtils.getDefaultAkkaConfig());
            TaskManager.startTaskManagerComponentsAndActor(config, ResourceID.generate(), tmActorSystem[i], "localhost", Option.<String>empty(), Option.<LeaderRetrievalService>empty(), false, TaskManager.class);
        }

        // Test actor system
        testActorSystem = AkkaUtils.createActorSystem(AkkaUtils.getDefaultAkkaConfig());

        // The returned reference is not needed; the call is made for its effect only
        jmProcess[0].getActorRef(testActorSystem, deadline.timeLeft());

        // Leader listener
        TestingListener leaderListener = new TestingListener();
        leaderRetrievalService = ZooKeeperUtils.createLeaderRetrievalService(config);
        leaderRetrievalService.start(leaderListener);

        // Initial submission
        leaderListener.waitForNewLeader(deadline.timeLeft().toMillis());

        String leaderAddress = leaderListener.getAddress();
        UUID leaderId = leaderListener.getLeaderSessionID();

        // Get the leader ref
        ActorRef leaderRef = AkkaUtils.getActorRef(leaderAddress, testActorSystem, deadline.timeLeft());
        ActorGateway leaderGateway = new AkkaActorGateway(leaderRef, leaderId);

        // Wait for all task managers to connect to the leading job manager
        JobManagerActorTestUtils.waitForTaskManagers(numberOfTaskManagers, leaderGateway, deadline.timeLeft());

        final File coordinateDirClosure = coordinateTempDir;
        final Throwable[] errorRef = new Throwable[1];

        // we trigger program execution in a separate thread
        Thread programTrigger = new Thread("Program Trigger") {
            @Override
            public void run() {
                try {
                    testJobManagerFailure(ZooKeeper.getConnectString(), coordinateDirClosure);
                } catch (Throwable t) {
                    t.printStackTrace();
                    // hand the failure over to the test thread
                    errorRef[0] = t;
                }
            }
        };

        // start the test program
        programTrigger.start();

        // wait until all marker files are in place, indicating that all tasks have started
        AbstractTaskManagerProcessFailureRecoveryTest.waitForMarkerFiles(coordinateTempDir, READY_MARKER_FILE_PREFIX, PARALLELISM, deadline.timeLeft().toMillis());

        // Kill one of the job managers and trigger recovery
        jmProcess[0].destroy();

        jmProcess[1] = new JobManagerProcess(1, config);
        jmProcess[1].startProcess();

        jmProcess[1].getActorRef(testActorSystem, deadline.timeLeft());

        // we create the marker file which signals the program functions tasks that they can complete
        AbstractTaskManagerProcessFailureRecoveryTest.touchFile(new File(coordinateTempDir, PROCEED_MARKER_FILE));

        programTrigger.join(deadline.timeLeft().toMillis());

        // We wait for the finish marker file. We don't wait for the program trigger, because
        // we submit in detached mode.
        AbstractTaskManagerProcessFailureRecoveryTest.waitForMarkerFiles(coordinateTempDir, FINISH_MARKER_FILE_PREFIX, 1, deadline.timeLeft().toMillis());

        // check that the program really finished
        assertFalse("The program did not finish in time", programTrigger.isAlive());

        // check whether the program encountered an error
        if (errorRef[0] != null) {
            Throwable error = errorRef[0];
            error.printStackTrace();
            fail("The program encountered a " + error.getClass().getSimpleName() + " : " + error.getMessage());
        }
    } catch (Throwable t) {
        // Print early (in some situations the process logs get too big
        // for Travis and the root problem is not shown)
        t.printStackTrace();

        for (JobManagerProcess p : jmProcess) {
            if (p != null) {
                p.printProcessLog();
            }
        }

        throw t;
    } finally {
        for (int i = 0; i < numberOfTaskManagers; i++) {
            if (tmActorSystem[i] != null) {
                tmActorSystem[i].shutdown();
            }
        }

        if (leaderRetrievalService != null) {
            leaderRetrievalService.stop();
        }

        for (JobManagerProcess jmProces : jmProcess) {
            if (jmProces != null) {
                jmProces.destroy();
            }
        }

        // The test's own actor system must be released as well, otherwise it
        // outlives the test.
        if (testActorSystem != null) {
            testActorSystem.shutdown();
        }

        // Delete coordination directory
        if (coordinateTempDir != null) {
            try {
                FileUtils.deleteDirectory(coordinateTempDir);
            } catch (Throwable ignored) {
                // best-effort cleanup: a failure to delete must not mask the test outcome
            }
        }
    }
}
Aggregations