use of akka.actor.ActorRef in project flink by apache.
the class TaskManagerProcessReapingTestBase method testReapProcessOnFailure.
/**
 * Starts a JobManager and a ResourceManager in this JVM, launches a TaskManager in a child JVM,
 * passes the TaskManager's actor reference to {@code onTaskManagerProcessRunning}, and then
 * verifies that the child JVM terminates with {@code TaskManager.RUNTIME_FAILURE_RETURN_CODE()}.
 *
 * <p>The test returns without asserting anything when no java executable can be found.
 */
@Test
public void testReapProcessOnFailure() {
    Process taskManagerProcess = null;
    ActorSystem jmActorSystem = null;
    final StringWriter processOutput = new StringWriter();
    try {
        String javaCommand = getJavaCommandPath();
        // run this test only if the java command is available on this machine
        if (javaCommand == null) {
            System.out.println("---- Skipping TaskManagerProcessReapingTest : Could not find java executable ----");
            return;
        }

        // create a logging file for the process
        File tempLogFile = File.createTempFile("testlogconfig", "properties");
        tempLogFile.deleteOnExit();
        CommonTestUtils.printLog4jDebugConfig(tempLogFile);

        final int jobManagerPort = NetUtils.getAvailablePort();

        // start a JobManager
        Tuple2<String, Object> localAddress = new Tuple2<String, Object>("localhost", jobManagerPort);
        jmActorSystem = AkkaUtils.createActorSystem(new Configuration(), new Some<Tuple2<String, Object>>(localAddress));
        ActorRef jmActor = JobManager.startJobManagerActors(new Configuration(), jmActorSystem, TestingUtils.defaultExecutor(), TestingUtils.defaultExecutor(), JobManager.class, MemoryArchivist.class)._1;

        // start a ResourceManager
        StandaloneLeaderRetrievalService standaloneLeaderRetrievalService = new StandaloneLeaderRetrievalService(AkkaUtils.getAkkaURL(jmActorSystem, jmActor));
        FlinkResourceManager.startResourceManagerActors(new Configuration(), jmActorSystem, standaloneLeaderRetrievalService, StandaloneResourceManager.class);

        final int taskManagerPort = NetUtils.getAvailablePort();

        // start the task manager process
        String[] command = new String[] { javaCommand, "-Dlog.level=DEBUG", "-Dlog4j.configuration=file:" + tempLogFile.getAbsolutePath(), "-Xms256m", "-Xmx256m", "-classpath", getCurrentClasspath(), TaskManagerTestEntryPoint.class.getName(), String.valueOf(jobManagerPort), String.valueOf(taskManagerPort) };
        ProcessBuilder bld = new ProcessBuilder(command);
        taskManagerProcess = bld.start();

        // the forwarder is handed the child's stderr and the buffer that is printed on failure
        new PipeForwarder(taskManagerProcess.getErrorStream(), processOutput);

        // grab the reference to the TaskManager. try multiple times, until the process
        // is started and the TaskManager is up
        String taskManagerActorName = String.format("akka.tcp://flink@%s/user/%s", "localhost:" + taskManagerPort, TaskManager.TASK_MANAGER_NAME());
        ActorRef taskManagerRef = null;
        Throwable lastError = null;
        for (int i = 0; i < 40; i++) {
            try {
                taskManagerRef = TaskManager.getTaskManagerRemoteReference(taskManagerActorName, jmActorSystem, new FiniteDuration(25, TimeUnit.SECONDS));
                break;
            } catch (Throwable t) {
                // TaskManager probably not ready yet
                lastError = t;
            }
            // a child JVM that has already exited will never answer the lookup; stop
            // retrying so that the liveness assertion below reports the failure right away
            if (!isProcessAlive(taskManagerProcess)) {
                break;
            }
            Thread.sleep(500);
        }

        assertTrue("TaskManager process died", isProcessAlive(taskManagerProcess));

        if (taskManagerRef == null) {
            if (lastError != null) {
                lastError.printStackTrace();
            }
            fail("TaskManager process did not launch the TaskManager properly. Failed to look up " + taskManagerActorName);
        }

        // kill the TaskManager actor
        onTaskManagerProcessRunning(taskManagerRef);

        // wait for max 10 seconds for the process to terminate
        {
            long now = System.currentTimeMillis();
            long deadline = now + 10000;
            while (now < deadline && isProcessAlive(taskManagerProcess)) {
                Thread.sleep(100);
                now = System.currentTimeMillis();
            }
        }

        assertFalse("TaskManager process did not terminate upon actor death", isProcessAlive(taskManagerProcess));

        int returnCode = taskManagerProcess.exitValue();
        assertEquals("TaskManager died, but not because of the process reaper", TaskManager.RUNTIME_FAILURE_RETURN_CODE(), returnCode);

        onTaskManagerProcessTerminated(processOutput.toString());
    } catch (Exception e) {
        e.printStackTrace();
        printProcessLog(processOutput.toString());
        fail(e.getMessage());
    } catch (Error e) {
        // assertion failures are Errors: dump the child's output, then let them propagate
        e.printStackTrace();
        printProcessLog(processOutput.toString());
        throw e;
    } finally {
        if (taskManagerProcess != null) {
            taskManagerProcess.destroy();
        }
        if (jmActorSystem != null) {
            jmActorSystem.shutdown();
        }
    }
}
use of akka.actor.ActorRef in project flink by apache.
the class TaskManagerTest method testTriggerStackTraceSampleMessage.
// ------------------------------------------------------------------------
// Stack trace sample
// ------------------------------------------------------------------------
/**
 * Tests sampling of task stack traces.
 *
 * <p>A single blocking task is submitted to a TaskManager. The test then sends
 * {@code TriggerStackTraceSample} messages for (1) a task ID that was never submitted,
 * (2) the running task, (3) the running task with a maximum stack depth, and (4) the running
 * task while it is being cancelled. Both the TaskManager and the JobManager actor are stopped
 * when the test finishes.
 */
@Test
@SuppressWarnings("unchecked")
public void testTriggerStackTraceSampleMessage() throws Exception {
    new JavaTestKit(system) {
        {
            // Assigned inside the try block so that the finally block can stop the TaskManager
            ActorGateway taskManagerActorGateway = null;

            // We need this to be a JM that answers to update messages for
            // robustness on Travis (if jobs need to be resubmitted in (4)).
            ActorRef jm = system.actorOf(Props.create(new SimpleLookupJobManagerCreator(null)));
            ActorGateway jobManagerActorGateway = new AkkaActorGateway(jm, null);
            final ActorGateway testActorGateway = new AkkaActorGateway(getTestActor(), leaderSessionID);

            try {
                final ActorGateway jobManager = jobManagerActorGateway;
                taskManagerActorGateway = TestingUtils.createTaskManager(system, jobManager, new Configuration(), true, false);
                // final alias for use inside the anonymous Within blocks
                final ActorGateway taskManager = taskManagerActorGateway;
                final JobID jobId = new JobID();

                // Single blocking task
                final TaskDeploymentDescriptor tdd = createTaskDeploymentDescriptor(jobId, "Job", new JobVertexID(), new ExecutionAttemptID(), new SerializedValue<>(new ExecutionConfig()), "Task", 1, 0, 1, 0, new Configuration(), new Configuration(), BlockingNoOpInvokable.class.getName(), Collections.<ResultPartitionDeploymentDescriptor>emptyList(), Collections.<InputGateDeploymentDescriptor>emptyList(), Collections.<BlobKey>emptyList(), Collections.<URL>emptyList(), 0);

                // Submit the task
                new Within(d) {
                    @Override
                    protected void run() {
                        try {
                            // Make sure to register
                            Future<?> connectFuture = taskManager.ask(new TestingTaskManagerMessages.NotifyWhenRegisteredAtJobManager(jobManager.actor()), remaining());
                            Await.ready(connectFuture, remaining());

                            Future<Object> taskRunningFuture = taskManager.ask(new TestingTaskManagerMessages.NotifyWhenTaskIsRunning(tdd.getExecutionAttemptId()), timeout);
                            taskManager.tell(new SubmitTask(tdd));
                            Await.ready(taskRunningFuture, d);
                        } catch (Exception e) {
                            e.printStackTrace();
                            fail(e.getMessage());
                        }
                    }
                };

                //
                // 1) Trigger sample for non-existing task
                //
                new Within(d) {
                    @Override
                    protected void run() {
                        try {
                            ExecutionAttemptID taskId = new ExecutionAttemptID();
                            taskManager.tell(new TriggerStackTraceSample(112223, taskId, 100, timeD, 0), testActorGateway);

                            // Receive the expected message (heartbeat races possible)
                            Object[] msg = receiveN(1);
                            while (!(msg[0] instanceof Status.Failure)) {
                                msg = receiveN(1);
                            }

                            Status.Failure response = (Status.Failure) msg[0];
                            assertEquals(IllegalStateException.class, response.cause().getClass());
                        } catch (Exception e) {
                            e.printStackTrace();
                            fail(e.getMessage());
                        }
                    }
                };

                //
                // 2) Trigger sample for the blocking task
                //
                new Within(d) {
                    @Override
                    protected void run() {
                        boolean success = false;
                        Throwable lastError = null;
                        for (int i = 0; i < 100 && !success; i++) {
                            try {
                                int numSamples = 5;
                                taskManager.tell(new TriggerStackTraceSample(19230, tdd.getExecutionAttemptId(), numSamples, Time.milliseconds(100L), 0), testActorGateway);

                                // Receive the expected message (heartbeat races possible)
                                Object[] msg = receiveN(1);
                                while (!(msg[0] instanceof StackTraceSampleResponse)) {
                                    msg = receiveN(1);
                                }

                                StackTraceSampleResponse response = (StackTraceSampleResponse) msg[0];

                                // ---- Verify response ----
                                assertEquals(19230, response.getSampleId());
                                assertEquals(tdd.getExecutionAttemptId(), response.getExecutionAttemptID());

                                List<StackTraceElement[]> traces = response.getSamples();
                                assertEquals("Number of samples", numSamples, traces.size());

                                // NOTE(review): success is never reset, so once one trace has
                                // matched, the assertTrue below passes for every later trace
                                for (StackTraceElement[] trace : traces) {
                                    // Look for BlockingNoOpInvokable#invoke
                                    for (StackTraceElement elem : trace) {
                                        if (elem.getClassName().equals(BlockingNoOpInvokable.class.getName())) {
                                            assertEquals("invoke", elem.getMethodName());
                                            success = true;
                                            break;
                                        }
                                    }
                                    assertTrue("Unexpected stack trace: " + Arrays.toString(trace), success);
                                }
                            } catch (Throwable t) {
                                // remembered for the final failure message; the sample is retried
                                lastError = t;
                                LOG.warn("Failed to find invokable.", t);
                            }

                            try {
                                Thread.sleep(100);
                            } catch (InterruptedException e) {
                                LOG.error("Interrupted while sleeping before retry.", e);
                                break;
                            }
                        }

                        if (!success) {
                            if (lastError == null) {
                                fail("Failed to find invokable");
                            } else {
                                fail(lastError.getMessage());
                            }
                        }
                    }
                };

                //
                // 3) Trigger sample for the blocking task with max depth
                //
                new Within(d) {
                    @Override
                    protected void run() {
                        try {
                            int numSamples = 5;
                            int maxDepth = 2;
                            taskManager.tell(new TriggerStackTraceSample(1337, tdd.getExecutionAttemptId(), numSamples, Time.milliseconds(100L), maxDepth), testActorGateway);

                            // Receive the expected message (heartbeat races possible)
                            Object[] msg = receiveN(1);
                            while (!(msg[0] instanceof StackTraceSampleResponse)) {
                                msg = receiveN(1);
                            }

                            StackTraceSampleResponse response = (StackTraceSampleResponse) msg[0];

                            // ---- Verify response ----
                            assertEquals(1337, response.getSampleId());
                            assertEquals(tdd.getExecutionAttemptId(), response.getExecutionAttemptID());

                            List<StackTraceElement[]> traces = response.getSamples();
                            assertEquals("Number of samples", numSamples, traces.size());

                            for (StackTraceElement[] trace : traces) {
                                assertEquals("Max depth", maxDepth, trace.length);
                            }
                        } catch (Exception e) {
                            e.printStackTrace();
                            fail(e.getMessage());
                        }
                    }
                };

                //
                // 4) Trigger sample for the blocking task, but cancel it during sampling
                //
                new Within(d) {
                    @Override
                    protected void run() {
                        try {
                            int maxAttempts = 10;
                            int sleepTime = 100;
                            // the sleep before cancelling doubles with every attempt
                            for (int i = 0; i < maxAttempts; i++, sleepTime *= 2) {
                                // Trigger many samples in order to cancel the task
                                // during a sample
                                taskManager.tell(new TriggerStackTraceSample(44, tdd.getExecutionAttemptId(), Integer.MAX_VALUE, Time.milliseconds(10L), 0), testActorGateway);

                                Thread.sleep(sleepTime);

                                Future<?> removeFuture = taskManager.ask(new TestingJobManagerMessages.NotifyWhenJobRemoved(jobId), remaining());

                                // Cancel the task
                                taskManager.tell(new CancelTask(tdd.getExecutionAttemptId()));

                                // Receive the expected message (heartbeat races possible)
                                while (true) {
                                    Object[] msg = receiveN(1);
                                    if (msg[0] instanceof StackTraceSampleResponse) {
                                        StackTraceSampleResponse response = (StackTraceSampleResponse) msg[0];
                                        assertEquals(tdd.getExecutionAttemptId(), response.getExecutionAttemptID());
                                        assertEquals(44, response.getSampleId());
                                        // Done
                                        return;
                                    } else if (msg[0] instanceof Failure) {
                                        // Wait for removal before resubmitting
                                        Await.ready(removeFuture, remaining());

                                        Future<?> taskRunningFuture = taskManager.ask(new TestingTaskManagerMessages.NotifyWhenTaskIsRunning(tdd.getExecutionAttemptId()), timeout);

                                        // Resubmit
                                        taskManager.tell(new SubmitTask(tdd));
                                        Await.ready(taskRunningFuture, remaining());

                                        // Retry the sample message
                                        break;
                                    } else {
                                        // Different message
                                        continue;
                                    }
                                }
                            }
                            // NOTE(review): if every attempt ends in a Failure, the loop falls
                            // through here without any assertion having been made
                        } catch (Exception e) {
                            e.printStackTrace();
                            fail(e.getMessage());
                        }
                    }
                };
            } finally {
                TestingUtils.stopActor(taskManagerActorGateway);
                TestingUtils.stopActor(jobManagerActorGateway);
            }
        }
    };
}
use of akka.actor.ActorRef in project flink by apache.
the class TaskManagerTest method testCancellingDependentAndStateUpdateFails.
/**
 * Submits a "Receiver" task (tdd2) and a "Sender" task (tdd1) to a TaskManager whose JobManager
 * actor is built by {@code SimpleLookupFailingUpdateJobManagerCreator(leaderSessionID, eid2)},
 * cancels the receiver (and the sender if it is still running), waits for both to be removed,
 * and asserts that the TaskManager then reports zero running tasks.
 */
@Test
public void testCancellingDependentAndStateUpdateFails() {
// this test creates two tasks. the sender sends data, and fails to send the
// state update back to the job manager
// the second one blocks to be canceled
new JavaTestKit(system) {
{
// kept outside the try block so that the finally block can stop both actors
ActorGateway jobManager = null;
ActorGateway taskManager = null;
final ActorGateway testActorGateway = new AkkaActorGateway(getTestActor(), leaderSessionID);
try {
final JobID jid = new JobID();
JobVertexID vid1 = new JobVertexID();
JobVertexID vid2 = new JobVertexID();
final ExecutionAttemptID eid1 = new ExecutionAttemptID();
final ExecutionAttemptID eid2 = new ExecutionAttemptID();
ActorRef jm = system.actorOf(Props.create(new SimpleLookupFailingUpdateJobManagerCreator(leaderSessionID, eid2)));
jobManager = new AkkaActorGateway(jm, leaderSessionID);
taskManager = TestingUtils.createTaskManager(system, jobManager, new Configuration(), true, true);
// final alias for use inside the anonymous Within block
final ActorGateway tm = taskManager;
// the sender's result partition and the receiver's input channel share partitionId
IntermediateResultPartitionID partitionId = new IntermediateResultPartitionID();
List<ResultPartitionDeploymentDescriptor> irpdd = new ArrayList<ResultPartitionDeploymentDescriptor>();
irpdd.add(new ResultPartitionDeploymentDescriptor(new IntermediateDataSetID(), partitionId, ResultPartitionType.PIPELINED, 1, 1, true));
InputGateDeploymentDescriptor ircdd = new InputGateDeploymentDescriptor(new IntermediateDataSetID(), ResultPartitionType.PIPELINED, 0, new InputChannelDeploymentDescriptor[] { new InputChannelDeploymentDescriptor(new ResultPartitionID(partitionId, eid1), ResultPartitionLocation.createLocal()) });
// tdd1: sender (eid1) producing irpdd; tdd2: receiver (eid2) consuming ircdd
final TaskDeploymentDescriptor tdd1 = createTaskDeploymentDescriptor(jid, "TestJob", vid1, eid1, new SerializedValue<>(new ExecutionConfig()), "Sender", 1, 0, 1, 0, new Configuration(), new Configuration(), Tasks.Sender.class.getName(), irpdd, Collections.<InputGateDeploymentDescriptor>emptyList(), new ArrayList<BlobKey>(), Collections.<URL>emptyList(), 0);
final TaskDeploymentDescriptor tdd2 = createTaskDeploymentDescriptor(jid, "TestJob", vid2, eid2, new SerializedValue<>(new ExecutionConfig()), "Receiver", 7, 2, 7, 0, new Configuration(), new Configuration(), Tasks.BlockingReceiver.class.getName(), Collections.<ResultPartitionDeploymentDescriptor>emptyList(), Collections.singletonList(ircdd), new ArrayList<BlobKey>(), Collections.<URL>emptyList(), 0);
new Within(d) {
@Override
protected void run() {
try {
// the running-notifications are requested before the tasks are submitted,
// presumably so that the notification cannot be missed
Future<Object> t1Running = tm.ask(new TestingTaskManagerMessages.NotifyWhenTaskIsRunning(eid1), timeout);
Future<Object> t2Running = tm.ask(new TestingTaskManagerMessages.NotifyWhenTaskIsRunning(eid2), timeout);
// the receiver is submitted first, then the sender
tm.tell(new SubmitTask(tdd2), testActorGateway);
tm.tell(new SubmitTask(tdd1), testActorGateway);
// one Acknowledge is expected per SubmitTask
expectMsgEquals(Acknowledge.get());
expectMsgEquals(Acknowledge.get());
Await.ready(t1Running, d);
Await.ready(t2Running, d);
tm.tell(TestingTaskManagerMessages.getRequestRunningTasksMessage(), testActorGateway);
Map<ExecutionAttemptID, Task> tasks = expectMsgClass(TestingTaskManagerMessages.ResponseRunningTasks.class).asJava();
// either task may be missing from the snapshot; the null checks below cover that
Task t1 = tasks.get(eid1);
Task t2 = tasks.get(eid2);
tm.tell(new CancelTask(eid2), testActorGateway);
expectMsgEquals(Acknowledge.get());
// only wait for the receiver's removal if it was in the running-task snapshot
if (t2 != null) {
Future<Object> response = tm.ask(new TestingTaskManagerMessages.NotifyWhenTaskRemoved(eid2), timeout);
Await.ready(response, d);
}
if (t1 != null) {
// the sender is cancelled explicitly only if it is still RUNNING
if (t1.getExecutionState() == ExecutionState.RUNNING) {
tm.tell(new CancelTask(eid1), testActorGateway);
expectMsgEquals(Acknowledge.get());
}
Future<Object> response = tm.ask(new TestingTaskManagerMessages.NotifyWhenTaskRemoved(eid1), timeout);
Await.ready(response, d);
}
// after both removals the TaskManager must report no running tasks
tm.tell(TestingTaskManagerMessages.getRequestRunningTasksMessage(), testActorGateway);
tasks = expectMsgClass(TestingTaskManagerMessages.ResponseRunningTasks.class).asJava();
assertEquals(0, tasks.size());
} catch (Exception e) {
e.printStackTrace();
fail(e.getMessage());
}
}
};
} catch (Exception e) {
e.printStackTrace();
fail(e.getMessage());
} finally {
// shut down the actors
TestingUtils.stopActor(taskManager);
TestingUtils.stopActor(jobManager);
}
}
};
}
use of akka.actor.ActorRef in project flink by apache.
the class TaskManagerTest method testSubmitTaskFailure.
/**
 * Sends a {@code SubmitTask} message with an invalid deployment descriptor to the TaskManager
 * and expects waiting on the reply to throw an {@link IllegalArgumentException}.
 */
@Test
public void testSubmitTaskFailure() throws Exception {
    ActorGateway jmGateway = null;
    ActorGateway tmGateway = null;

    try {
        jmGateway = new AkkaActorGateway(
                system.actorOf(Props.create(SimpleJobManager.class, leaderSessionID)),
                leaderSessionID);
        tmGateway = TestingUtils.createTaskManager(system, jmGateway, new Configuration(), true, true);

        // this will make the submission fail because the number of key groups must be >= 1
        final int invalidNumberOfKeyGroups = 0;

        TaskDeploymentDescriptor brokenDescriptor = createTaskDeploymentDescriptor(
                new JobID(),
                "test job",
                new JobVertexID(),
                new ExecutionAttemptID(),
                new SerializedValue<>(new ExecutionConfig()),
                "test task",
                invalidNumberOfKeyGroups,
                0,
                1,
                0,
                new Configuration(),
                new Configuration(),
                "Foobar",
                Collections.<ResultPartitionDeploymentDescriptor>emptyList(),
                Collections.<InputGateDeploymentDescriptor>emptyList(),
                Collections.<BlobKey>emptyList(),
                Collections.<URL>emptyList(),
                0);

        Future<Object> submission = tmGateway.ask(new SubmitTask(brokenDescriptor), timeout);

        boolean rejected = false;
        try {
            Await.result(submission, timeout);
        } catch (IllegalArgumentException expected) {
            // expected
            rejected = true;
        }

        if (!rejected) {
            fail("The submit task message should have failed.");
        }
    } finally {
        TestingUtils.stopActor(jmGateway);
        TestingUtils.stopActor(tmGateway);
    }
}
use of akka.actor.ActorRef in project flink by apache.
the class TaskManagerTest method testRemotePartitionNotFound.
/**
 * Checks that a receiver whose remote input partition keeps answering with a
 * {@link PartitionNotFoundException} is eventually reported as {@code FAILED}, with that
 * exception type as the error.
 */
@Test
public void testRemotePartitionNotFound() throws Exception {
    new JavaTestKit(system) {
        {
            ActorGateway jmGateway = null;
            ActorGateway tmGateway = null;
            final ActorGateway probeGateway = new AkkaActorGateway(getTestActor(), leaderSessionID);

            try {
                final IntermediateDataSetID dataSetId = new IntermediateDataSetID();

                // The JM is created with the test actor as an additional argument
                jmGateway = new AkkaActorGateway(
                        system.actorOf(Props.create(new SimplePartitionStateLookupJobManagerCreator(leaderSessionID, getTestActor()))),
                        leaderSessionID);

                final int dataPort = NetUtils.getAvailablePort();

                // TM on a known data port, request backoff set to 100 (initial) and 200 (max)
                Configuration tmConfig = new Configuration();
                tmConfig.setInteger(ConfigConstants.TASK_MANAGER_DATA_PORT_KEY, dataPort);
                tmConfig.setInteger(TaskManagerOptions.NETWORK_REQUEST_BACKOFF_INITIAL, 100);
                tmConfig.setInteger(TaskManagerOptions.NETWORK_REQUEST_BACKOFF_MAX, 200);

                tmGateway = TestingUtils.createTaskManager(system, jmGateway, tmConfig, false, true);

                // ---------------------------------------------------------------------------------

                // final alias for use inside the anonymous Within block
                final ActorGateway receiverHost = tmGateway;

                final JobID jobId = new JobID();
                final JobVertexID vertexId = new JobVertexID();
                final ExecutionAttemptID attemptId = new ExecutionAttemptID();
                final ResultPartitionID missingPartition = new ResultPartitionID();

                // Remote location (on the same TM though) for the partition
                final InetSocketAddress dataAddress = new InetSocketAddress("localhost", dataPort);
                final ResultPartitionLocation remoteLocation =
                        ResultPartitionLocation.createRemote(new ConnectionID(dataAddress, 0));

                final InputChannelDeploymentDescriptor[] channels = new InputChannelDeploymentDescriptor[] {
                        new InputChannelDeploymentDescriptor(missingPartition, remoteLocation)
                };
                final InputGateDeploymentDescriptor inputGate =
                        new InputGateDeploymentDescriptor(dataSetId, ResultPartitionType.PIPELINED, 0, channels);

                final TaskDeploymentDescriptor receiverDescriptor = createTaskDeploymentDescriptor(
                        jobId,
                        "TestJob",
                        vertexId,
                        attemptId,
                        new SerializedValue<>(new ExecutionConfig()),
                        "Receiver",
                        1,
                        0,
                        1,
                        0,
                        new Configuration(),
                        new Configuration(),
                        Tasks.AgnosticReceiver.class.getName(),
                        Collections.<ResultPartitionDeploymentDescriptor>emptyList(),
                        Collections.singletonList(inputGate),
                        Collections.<BlobKey>emptyList(),
                        Collections.<URL>emptyList(),
                        0);

                new Within(d) {
                    @Override
                    protected void run() {
                        // Hand the task to the TM and wait for the acknowledgement
                        receiverHost.tell(new SubmitTask(receiverDescriptor), probeGateway);
                        expectMsgClass(Acknowledge.get().getClass());

                        // The mock JM notifies the test actor about the final execution state
                        TaskExecutionState finalState = expectMsgClass(TaskExecutionState.class);

                        // After repeated requests the task has to end up FAILED
                        assertEquals(ExecutionState.FAILED, finalState.getExecutionState());

                        Throwable failureCause = finalState.getError(ClassLoader.getSystemClassLoader());
                        String wrongTypeMessage =
                                "Thrown exception was not a PartitionNotFoundException: " + failureCause.getMessage();
                        assertEquals(wrongTypeMessage, PartitionNotFoundException.class, failureCause.getClass());
                    }
                };
            } catch (Exception e) {
                e.printStackTrace();
                fail(e.getMessage());
            } finally {
                TestingUtils.stopActor(tmGateway);
                TestingUtils.stopActor(jmGateway);
            }
        }
    };
}
Aggregations