use of akka.actor.ActorSystem in project flink by apache.
the class LocalFlinkMiniClusterITCase method testLocalFlinkMiniClusterWithMultipleTaskManagers.
/**
 * Starts a {@link LocalFlinkMiniCluster} with several TaskManagers, verifies that the
 * JobManager reports the expected number of TaskManagers and slots, and finally checks that
 * stopping the cluster leaves no threads behind other than those whose names start with one
 * of the {@code ALLOWED_THREAD_PREFIXES}.
 */
@Test
public void testLocalFlinkMiniClusterWithMultipleTaskManagers() {
    final ActorSystem system = ActorSystem.create("Testkit", AkkaUtils.getDefaultAkkaConfig());
    LocalFlinkMiniCluster miniCluster = null;
    final int numTMs = 3;
    final int numSlots = 14;

    // gather the threads that already exist
    final Set<Thread> threadsBefore = new HashSet<>();
    {
        final Thread[] allThreads = new Thread[Thread.activeCount()];
        // activeCount() is only an estimate: enumerate() may fill fewer slots than the array
        // has, so only the slots it reports as filled are taken into account
        final int numThreads = Thread.enumerate(allThreads);
        threadsBefore.addAll(Arrays.asList(allThreads).subList(0, numThreads));
    }

    try {
        Configuration config = new Configuration();
        config.setInteger(ConfigConstants.LOCAL_NUMBER_TASK_MANAGER, numTMs);
        config.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, numSlots);
        miniCluster = new LocalFlinkMiniCluster(config, true);
        miniCluster.start();

        final ActorGateway jmGateway = miniCluster.getLeaderGateway(TestingUtils.TESTING_DURATION());

        new JavaTestKit(system) {
            {
                final ActorGateway selfGateway = new AkkaActorGateway(getRef(), null);

                new Within(TestingUtils.TESTING_DURATION()) {
                    @Override
                    protected void run() {
                        jmGateway.tell(JobManagerMessages.getRequestNumberRegisteredTaskManager(), selfGateway);
                        expectMsgEquals(TestingUtils.TESTING_DURATION(), numTMs);

                        jmGateway.tell(JobManagerMessages.getRequestTotalNumberOfSlots(), selfGateway);
                        expectMsgEquals(TestingUtils.TESTING_DURATION(), numTMs * numSlots);
                    }
                };
            }
        };
    } finally {
        if (miniCluster != null) {
            miniCluster.stop();
            miniCluster.awaitTermination();
        }
        JavaTestKit.shutdownActorSystem(system);
        system.awaitTermination();
    }

    // shut down the global execution context, to make sure it does not affect this testing
    try {
        Field f = ExecutionContextImpl.class.getDeclaredField("executor");
        f.setAccessible(true);
        Object exec = ExecutionContext$.MODULE$.global();
        ForkJoinPool executor = (ForkJoinPool) f.get(exec);
        executor.shutdownNow();
    } catch (Exception e) {
        // without access to the global executor the thread check below is not meaningful,
        // so the test ends here without verifying the thread shutdown
        System.err.println("Cannot test proper thread shutdown for local execution.");
        return;
    }

    // check for remaining threads
    // we need to check repeatedly for a while, because some threads shut down slowly
    long deadline = System.currentTimeMillis() + 30000;
    boolean foundThreads = true;
    String threadName = "";

    while (System.currentTimeMillis() < deadline) {
        // check that no additional threads remain
        final Thread[] threadsAfter = new Thread[Thread.activeCount()];
        // threads may terminate between activeCount() and enumerate(), which leaves trailing
        // null slots; iterate only over the slots that enumerate() actually filled
        final int numThreadsAfter = Thread.enumerate(threadsAfter);

        foundThreads = false;
        for (int i = 0; i < numThreadsAfter; i++) {
            final Thread t = threadsAfter[i];
            if (t.isAlive() && !threadsBefore.contains(t)) {
                // this thread was not there before. check if it is allowed
                boolean allowed = false;
                for (String prefix : ALLOWED_THREAD_PREFIXES) {
                    if (t.getName().startsWith(prefix)) {
                        allowed = true;
                        break;
                    }
                }

                if (!allowed) {
                    foundThreads = true;
                    threadName = t.toString();
                    break;
                }
            }
        }

        if (foundThreads) {
            try {
                Thread.sleep(500);
            } catch (InterruptedException e) {
                // keep the interrupt status and stop polling; the thread that is still
                // around gets reported by the check below
                Thread.currentThread().interrupt();
                break;
            }
        } else {
            break;
        }
    }

    if (foundThreads) {
        fail("Thread " + threadName + " was started by the mini cluster, but not shut down");
    }
}
use of akka.actor.ActorSystem in project flink by apache.
the class TaskManagerFailureRecoveryITCase method testRestartWithFailingTaskManager.
/**
 * Runs a job on a two-TaskManager mini cluster, starts a third TaskManager while the mappers
 * are blocked, kills the two original TaskManagers and checks that the job still finishes
 * without an exception after being restarted.
 */
@Test
public void testRestartWithFailingTaskManager() {
    final int PARALLELISM = 4;

    LocalFlinkMiniCluster cluster = null;
    ActorSystem additionalSystem = null;

    try {
        Configuration config = new Configuration();
        config.setInteger(ConfigConstants.LOCAL_NUMBER_TASK_MANAGER, 2);
        config.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, PARALLELISM);
        config.setInteger(ConfigConstants.TASK_MANAGER_MEMORY_SIZE_KEY, 16);
        config.setString(ConfigConstants.AKKA_WATCH_HEARTBEAT_INTERVAL, "500 ms");
        config.setString(ConfigConstants.AKKA_WATCH_HEARTBEAT_PAUSE, "20 s");
        config.setInteger(ConfigConstants.AKKA_WATCH_THRESHOLD, 20);

        cluster = new LocalFlinkMiniCluster(config, false);
        cluster.start();

        // for the result
        List<Long> resultCollection = new ArrayList<Long>();

        final ExecutionEnvironment env = ExecutionEnvironment.createRemoteEnvironment("localhost", cluster.getLeaderRPCPort());
        env.setParallelism(PARALLELISM);
        env.setRestartStrategy(RestartStrategies.fixedDelayRestart(1, 1000));
        env.getConfig().disableSysoutLogging();

        env.generateSequence(1, 10).map(new FailingMapper<Long>()).reduce(new ReduceFunction<Long>() {
            @Override
            public Long reduce(Long value1, Long value2) {
                return value1 + value2;
            }
        }).output(new LocalCollectionOutputFormat<Long>(resultCollection));

        // simple reference (atomic does not matter) to pass back an exception from the trigger thread
        final AtomicReference<Throwable> ref = new AtomicReference<Throwable>();

        // trigger the execution from a separate thread, so we are available to tamper with the
        // cluster during the execution
        Thread trigger = new Thread("program trigger") {
            @Override
            public void run() {
                try {
                    env.execute();
                } catch (Throwable t) {
                    ref.set(t);
                }
            }
        };
        trigger.setDaemon(true);
        trigger.start();

        // the mappers in turn are waiting
        for (int i = 0; i < PARALLELISM; i++) {
            FailingMapper.TASK_TO_COORD_QUEUE.take();
        }

        // bring up one more task manager and wait for it to appear
        {
            additionalSystem = cluster.startTaskManagerActorSystem(2);
            ActorRef additionalTaskManager = cluster.startTaskManager(2, additionalSystem);
            Object message = TaskManagerMessages.getNotifyWhenRegisteredAtJobManagerMessage();
            Future<Object> future = Patterns.ask(additionalTaskManager, message, 30000);

            try {
                Await.result(future, new FiniteDuration(30000, TimeUnit.MILLISECONDS));
            } catch (TimeoutException e) {
                fail("The additional TaskManager did not come up within 30 seconds");
            }
        }

        // kill the two other TaskManagers
        for (ActorRef tm : cluster.getTaskManagersAsJava()) {
            tm.tell(PoisonPill.getInstance(), ActorRef.noSender());
        }

        // wait for the next set of mappers (the recovery ones) to come online
        for (int i = 0; i < PARALLELISM; i++) {
            FailingMapper.TASK_TO_COORD_QUEUE.take();
        }

        // tell the mappers that they may continue this time
        for (int i = 0; i < PARALLELISM; i++) {
            FailingMapper.COORD_TO_TASK_QUEUE.add(new Object());
        }

        // wait for the program to finish
        trigger.join();
        if (ref.get() != null) {
            Throwable t = ref.get();
            t.printStackTrace();
            fail("Program execution caused an exception: " + t.getMessage());
        }
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
    } finally {
        // nested try/finally so that the mini cluster is stopped even if shutting down the
        // additional actor system throws
        try {
            if (additionalSystem != null) {
                additionalSystem.shutdown();
            }
        } finally {
            if (cluster != null) {
                cluster.stop();
            }
        }
    }
}
use of akka.actor.ActorSystem in project flink by apache.
the class JobManagerTest method testSavepointRestoreSettings.
/**
 * Tests that configured {@link SavepointRestoreSettings} are respected.
 *
 * <p>A savepoint is taken from a first job. A second job with a different vertex is then
 * submitted with that savepoint: the submission is expected to fail while non-restored state
 * is disallowed, and to succeed once it is allowed.
 */
@Test
public void testSavepointRestoreSettings() throws Exception {
    FiniteDuration timeout = new FiniteDuration(30, TimeUnit.SECONDS);

    ActorSystem actorSystem = null;
    ActorGateway jobManager = null;
    ActorGateway archiver = null;
    ActorGateway taskManager = null;

    try {
        actorSystem = AkkaUtils.createLocalActorSystem(new Configuration());

        Tuple2<ActorRef, ActorRef> master = JobManager.startJobManagerActors(new Configuration(), actorSystem, TestingUtils.defaultExecutor(), TestingUtils.defaultExecutor(), Option.apply("jm"), Option.apply("arch"), TestingJobManager.class, TestingMemoryArchivist.class);
        jobManager = new AkkaActorGateway(master._1(), null);
        archiver = new AkkaActorGateway(master._2(), null);

        Configuration tmConfig = new Configuration();
        tmConfig.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, 4);

        ActorRef taskManagerRef = TaskManager.startTaskManagerComponentsAndActor(tmConfig, ResourceID.generate(), actorSystem, "localhost", Option.apply("tm"), Option.<LeaderRetrievalService>apply(new StandaloneLeaderRetrievalService(jobManager.path())), true, TestingTaskManager.class);
        taskManager = new AkkaActorGateway(taskManagerRef, null);

        // Wait until connected
        Object msg = new TestingTaskManagerMessages.NotifyWhenRegisteredAtJobManager(jobManager.actor());
        Await.ready(taskManager.ask(msg, timeout), timeout);

        // Create job graph
        JobVertex sourceVertex = new JobVertex("Source");
        sourceVertex.setInvokableClass(BlockingStatefulInvokable.class);
        sourceVertex.setParallelism(1);

        JobGraph jobGraph = new JobGraph("TestingJob", sourceVertex);

        JobSnapshottingSettings snapshottingSettings = new JobSnapshottingSettings(
            Collections.singletonList(sourceVertex.getID()),
            Collections.singletonList(sourceVertex.getID()),
            Collections.singletonList(sourceVertex.getID()),
            // deactivated checkpointing
            Long.MAX_VALUE, 360000, 0, Integer.MAX_VALUE, ExternalizedCheckpointSettings.none(), null, true);
        jobGraph.setSnapshotSettings(snapshottingSettings);

        // Submit job graph
        msg = new JobManagerMessages.SubmitJob(jobGraph, ListeningBehaviour.DETACHED);
        Await.result(jobManager.ask(msg, timeout), timeout);

        // Wait for all tasks to be running
        msg = new TestingJobManagerMessages.WaitForAllVerticesToBeRunning(jobGraph.getJobID());
        Await.result(jobManager.ask(msg, timeout), timeout);

        // Trigger savepoint
        File targetDirectory = tmpFolder.newFolder();
        msg = new TriggerSavepoint(jobGraph.getJobID(), Option.apply(targetDirectory.getAbsolutePath()));
        Future<Object> future = jobManager.ask(msg, timeout);
        Object result = Await.result(future, timeout);

        // fail with a readable message instead of a ClassCastException if the savepoint failed
        assertTrue("Did not trigger savepoint: " + result, result instanceof TriggerSavepointSuccess);
        String savepointPath = ((TriggerSavepointSuccess) result).savepointPath();

        // Cancel because of restarts
        msg = new TestingJobManagerMessages.NotifyWhenJobRemoved(jobGraph.getJobID());
        Future<?> removedFuture = jobManager.ask(msg, timeout);

        Future<?> cancelFuture = jobManager.ask(new CancelJob(jobGraph.getJobID()), timeout);
        Object response = Await.result(cancelFuture, timeout);
        assertTrue("Unexpected response: " + response, response instanceof CancellationSuccess);

        Await.ready(removedFuture, timeout);

        // Adjust the job (we need a new operator ID)
        JobVertex newSourceVertex = new JobVertex("NewSource");
        newSourceVertex.setInvokableClass(BlockingStatefulInvokable.class);
        newSourceVertex.setParallelism(1);

        JobGraph newJobGraph = new JobGraph("NewTestingJob", newSourceVertex);

        JobSnapshottingSettings newSnapshottingSettings = new JobSnapshottingSettings(
            Collections.singletonList(newSourceVertex.getID()),
            Collections.singletonList(newSourceVertex.getID()),
            Collections.singletonList(newSourceVertex.getID()),
            // deactivated checkpointing
            Long.MAX_VALUE, 360000, 0, Integer.MAX_VALUE, ExternalizedCheckpointSettings.none(), null, true);
        newJobGraph.setSnapshotSettings(newSnapshottingSettings);

        // First attempt: non-restored state is not allowed, so the submission must fail
        SavepointRestoreSettings restoreSettings = SavepointRestoreSettings.forPath(savepointPath, false);
        newJobGraph.setSavepointRestoreSettings(restoreSettings);

        msg = new JobManagerMessages.SubmitJob(newJobGraph, ListeningBehaviour.DETACHED);
        response = Await.result(jobManager.ask(msg, timeout), timeout);
        assertTrue("Unexpected response: " + response, response instanceof JobManagerMessages.JobResultFailure);

        JobManagerMessages.JobResultFailure failure = (JobManagerMessages.JobResultFailure) response;
        Throwable cause = failure.cause().deserializeError(ClassLoader.getSystemClassLoader());

        assertTrue("Unexpected failure cause: " + cause, cause instanceof IllegalStateException);
        assertTrue("Unexpected failure message: " + cause.getMessage(), cause.getMessage().contains("allowNonRestoredState"));

        // Wait until removed
        msg = new TestingJobManagerMessages.NotifyWhenJobRemoved(newJobGraph.getJobID());
        Await.ready(jobManager.ask(msg, timeout), timeout);

        // Resubmit, but allow non restored state now
        restoreSettings = SavepointRestoreSettings.forPath(savepointPath, true);
        newJobGraph.setSavepointRestoreSettings(restoreSettings);

        msg = new JobManagerMessages.SubmitJob(newJobGraph, ListeningBehaviour.DETACHED);
        response = Await.result(jobManager.ask(msg, timeout), timeout);
        assertTrue("Unexpected response: " + response, response instanceof JobManagerMessages.JobSubmitSuccess);
    } finally {
        // Stop the actors first and the actor system that hosts them last, so that the
        // PoisonPills are sent before the system's shutdown has been initiated
        if (archiver != null) {
            archiver.actor().tell(PoisonPill.getInstance(), ActorRef.noSender());
        }
        if (jobManager != null) {
            jobManager.actor().tell(PoisonPill.getInstance(), ActorRef.noSender());
        }
        if (taskManager != null) {
            taskManager.actor().tell(PoisonPill.getInstance(), ActorRef.noSender());
        }
        if (actorSystem != null) {
            actorSystem.shutdown();
        }
    }
}
use of akka.actor.ActorSystem in project flink by apache.
the class JobManagerTest method testCancelWithSavepoint.
/**
 * Tests cancelling a running job with a savepoint: the cancellation is retried while the
 * JobManager reports that not all required tasks are running yet, and the savepoint path
 * returned on success must exist on disk once the job has reached the CANCELED status.
 */
@Test
public void testCancelWithSavepoint() throws Exception {
    File defaultSavepointDir = tmpFolder.newFolder();

    FiniteDuration timeout = new FiniteDuration(30, TimeUnit.SECONDS);
    Configuration config = new Configuration();
    config.setString(ConfigConstants.SAVEPOINT_DIRECTORY_KEY, defaultSavepointDir.getAbsolutePath());

    ActorSystem actorSystem = null;
    ActorGateway jobManager = null;
    ActorGateway archiver = null;
    ActorGateway taskManager = null;

    try {
        actorSystem = AkkaUtils.createLocalActorSystem(new Configuration());

        Tuple2<ActorRef, ActorRef> master = JobManager.startJobManagerActors(config, actorSystem, TestingUtils.defaultExecutor(), TestingUtils.defaultExecutor(), Option.apply("jm"), Option.apply("arch"), TestingJobManager.class, TestingMemoryArchivist.class);
        jobManager = new AkkaActorGateway(master._1(), null);
        archiver = new AkkaActorGateway(master._2(), null);

        ActorRef taskManagerRef = TaskManager.startTaskManagerComponentsAndActor(config, ResourceID.generate(), actorSystem, "localhost", Option.apply("tm"), Option.<LeaderRetrievalService>apply(new StandaloneLeaderRetrievalService(jobManager.path())), true, TestingTaskManager.class);
        taskManager = new AkkaActorGateway(taskManagerRef, null);

        // Wait until connected
        Object msg = new TestingTaskManagerMessages.NotifyWhenRegisteredAtJobManager(jobManager.actor());
        Await.ready(taskManager.ask(msg, timeout), timeout);

        // Create job graph
        JobVertex sourceVertex = new JobVertex("Source");
        sourceVertex.setInvokableClass(BlockingStatefulInvokable.class);
        sourceVertex.setParallelism(1);

        JobGraph jobGraph = new JobGraph("TestingJob", sourceVertex);

        JobSnapshottingSettings snapshottingSettings = new JobSnapshottingSettings(
            Collections.singletonList(sourceVertex.getID()),
            Collections.singletonList(sourceVertex.getID()),
            Collections.singletonList(sourceVertex.getID()),
            3600000, 3600000, 0, Integer.MAX_VALUE, ExternalizedCheckpointSettings.none(), null, true);
        jobGraph.setSnapshotSettings(snapshottingSettings);

        // Submit job graph
        msg = new JobManagerMessages.SubmitJob(jobGraph, ListeningBehaviour.DETACHED);
        Await.result(jobManager.ask(msg, timeout), timeout);

        // Wait for all tasks to be running
        msg = new TestingJobManagerMessages.WaitForAllVerticesToBeRunning(jobGraph.getJobID());
        Await.result(jobManager.ask(msg, timeout), timeout);

        // Notify when cancelled
        msg = new NotifyWhenJobStatus(jobGraph.getJobID(), JobStatus.CANCELED);
        Future<Object> cancelled = jobManager.ask(msg, timeout);

        // Cancel with savepoint
        String savepointPath = null;

        for (int i = 0; i < 10; i++) {
            msg = new JobManagerMessages.CancelJobWithSavepoint(jobGraph.getJobID(), null);
            CancellationResponse cancelResp = (CancellationResponse) Await.result(jobManager.ask(msg, timeout), timeout);

            if (cancelResp instanceof CancellationFailure) {
                CancellationFailure failure = (CancellationFailure) cancelResp;
                // the cause may carry no message at all; treat that like any other
                // unexpected failure instead of tripping over a NullPointerException
                String failureMessage = failure.cause().getMessage();
                if (failureMessage != null && failureMessage.contains(CheckpointDeclineReason.NOT_ALL_REQUIRED_TASKS_RUNNING.message())) {
                    // wait and retry
                    Thread.sleep(200);
                } else {
                    failure.cause().printStackTrace();
                    fail("Failed to cancel job: " + failureMessage);
                }
            } else {
                savepointPath = ((CancellationSuccess) cancelResp).savepointPath();
                break;
            }
        }

        // Verify savepoint path
        assertTrue("Savepoint not triggered", savepointPath != null);

        // Wait for job status change
        Await.ready(cancelled, timeout);

        File savepointFile = new File(savepointPath);
        assertTrue("Savepoint file does not exist: " + savepointPath, savepointFile.exists());
    } finally {
        // Stop the actors first and the actor system that hosts them last, so that the
        // PoisonPills are sent before the system's shutdown has been initiated
        if (archiver != null) {
            archiver.actor().tell(PoisonPill.getInstance(), ActorRef.noSender());
        }
        if (jobManager != null) {
            jobManager.actor().tell(PoisonPill.getInstance(), ActorRef.noSender());
        }
        if (taskManager != null) {
            taskManager.actor().tell(PoisonPill.getInstance(), ActorRef.noSender());
        }
        if (actorSystem != null) {
            actorSystem.shutdown();
        }
    }
}
use of akka.actor.ActorSystem in project flink by apache.
the class JobManagerTest method testSavepointWithDeactivatedPeriodicCheckpointing.
/**
 * Tests that we can trigger a savepoint when periodic checkpoints are disabled.
 *
 * <p>The job is configured with a checkpoint interval of {@code Long.MAX_VALUE}; the
 * explicitly triggered savepoint is expected to leave exactly one entry in the target
 * directory.
 */
@Test
public void testSavepointWithDeactivatedPeriodicCheckpointing() throws Exception {
    File defaultSavepointDir = tmpFolder.newFolder();

    FiniteDuration timeout = new FiniteDuration(30, TimeUnit.SECONDS);
    Configuration config = new Configuration();
    config.setString(ConfigConstants.SAVEPOINT_DIRECTORY_KEY, defaultSavepointDir.getAbsolutePath());

    ActorSystem actorSystem = null;
    ActorGateway jobManager = null;
    ActorGateway archiver = null;
    ActorGateway taskManager = null;

    try {
        actorSystem = AkkaUtils.createLocalActorSystem(new Configuration());

        Tuple2<ActorRef, ActorRef> master = JobManager.startJobManagerActors(config, actorSystem, TestingUtils.defaultExecutor(), TestingUtils.defaultExecutor(), Option.apply("jm"), Option.apply("arch"), TestingJobManager.class, TestingMemoryArchivist.class);
        jobManager = new AkkaActorGateway(master._1(), null);
        archiver = new AkkaActorGateway(master._2(), null);

        ActorRef taskManagerRef = TaskManager.startTaskManagerComponentsAndActor(config, ResourceID.generate(), actorSystem, "localhost", Option.apply("tm"), Option.<LeaderRetrievalService>apply(new StandaloneLeaderRetrievalService(jobManager.path())), true, TestingTaskManager.class);
        taskManager = new AkkaActorGateway(taskManagerRef, null);

        // Wait until connected
        Object msg = new TestingTaskManagerMessages.NotifyWhenRegisteredAtJobManager(jobManager.actor());
        Await.ready(taskManager.ask(msg, timeout), timeout);

        // Create job graph
        JobVertex sourceVertex = new JobVertex("Source");
        sourceVertex.setInvokableClass(BlockingStatefulInvokable.class);
        sourceVertex.setParallelism(1);

        JobGraph jobGraph = new JobGraph("TestingJob", sourceVertex);

        JobSnapshottingSettings snapshottingSettings = new JobSnapshottingSettings(
            Collections.singletonList(sourceVertex.getID()),
            Collections.singletonList(sourceVertex.getID()),
            Collections.singletonList(sourceVertex.getID()),
            // deactivated checkpointing
            Long.MAX_VALUE, 360000, 0, Integer.MAX_VALUE, ExternalizedCheckpointSettings.none(), null, true);
        jobGraph.setSnapshotSettings(snapshottingSettings);

        // Submit job graph
        msg = new JobManagerMessages.SubmitJob(jobGraph, ListeningBehaviour.DETACHED);
        Await.result(jobManager.ask(msg, timeout), timeout);

        // Wait for all tasks to be running
        msg = new TestingJobManagerMessages.WaitForAllVerticesToBeRunning(jobGraph.getJobID());
        Await.result(jobManager.ask(msg, timeout), timeout);

        // Trigger savepoint (the job keeps running; it is not cancelled here)
        File targetDirectory = tmpFolder.newFolder();
        msg = new TriggerSavepoint(jobGraph.getJobID(), Option.apply(targetDirectory.getAbsolutePath()));
        Future<Object> future = jobManager.ask(msg, timeout);
        Object result = Await.result(future, timeout);
        assertTrue("Did not trigger savepoint", result instanceof TriggerSavepointSuccess);

        // listFiles() returns null if the directory cannot be listed; report that as an
        // assertion failure rather than a NullPointerException
        File[] savepointFiles = targetDirectory.listFiles();
        assertTrue("Could not list savepoint target directory " + targetDirectory, savepointFiles != null);
        assertEquals(1, savepointFiles.length);
    } finally {
        // Stop the actors first and the actor system that hosts them last, so that the
        // PoisonPills are sent before the system's shutdown has been initiated
        if (archiver != null) {
            archiver.actor().tell(PoisonPill.getInstance(), ActorRef.noSender());
        }
        if (jobManager != null) {
            jobManager.actor().tell(PoisonPill.getInstance(), ActorRef.noSender());
        }
        if (taskManager != null) {
            taskManager.actor().tell(PoisonPill.getInstance(), ActorRef.noSender());
        }
        if (actorSystem != null) {
            actorSystem.shutdown();
        }
    }
}
Aggregations