use of scala.concurrent.duration.FiniteDuration in project flink by apache.
the class YARNHighAvailabilityITCase method testMultipleAMKill.
/**
 * Tests that the application master can be killed multiple times and that the surviving
 * TaskManager successfully reconnects to the newly started JobManager.
 *
 * <p>The current leader is killed {@code numberApplicationAttempts - 1} times. Before every
 * kill, and once more after the last one, the test waits until at least one TaskManager is
 * registered at the leader that is currently retrievable.
 *
 * @throws Exception if configuring or deploying the YARN cluster fails
 */
@Test
public void testMultipleAMKill() throws Exception {
    // one kill fewer than the configured application attempts; no kill happens in the last round
    final int numberKillingAttempts = numberApplicationAttempts - 1;

    TestingYarnClusterDescriptor flinkYarnClient = new TestingYarnClusterDescriptor();
    Assert.assertNotNull("unable to get yarn client", flinkYarnClient);
    flinkYarnClient.setTaskManagerCount(1);
    flinkYarnClient.setJobManagerMemory(768);
    flinkYarnClient.setTaskManagerMemory(1024);
    flinkYarnClient.setLocalJarPath(new Path(flinkUberjar.getAbsolutePath()));
    flinkYarnClient.addShipFiles(Arrays.asList(flinkLibFolder.listFiles()));

    String confDirPath = System.getenv(ConfigConstants.ENV_FLINK_CONF_DIR);
    flinkYarnClient.setConfigurationDirectory(confDirPath);

    String fsStateHandlePath = temp.getRoot().getPath();

    // load the configuration
    File configDirectory = new File(confDirPath);
    // NOTE(review): the value returned by this call is discarded and the configuration is
    // loaded again on the next line - confirm whether this first call is still required.
    GlobalConfiguration.loadConfiguration(configDirectory.getAbsolutePath());
    flinkYarnClient.setFlinkConfiguration(GlobalConfiguration.loadConfiguration());

    // The dynamic properties are "key=value" pairs separated by "@@". Option constants must
    // contribute their plain key, so key() is called on them (as already done for
    // HA_STORAGE_PATH); concatenating the option object itself would embed its toString().
    final String dynamicProperties =
        "recovery.mode=zookeeper@@recovery.zookeeper.quorum=" + zkServer.getConnectString()
            + "@@yarn.application-attempts=" + numberApplicationAttempts
            + "@@" + CoreOptions.STATE_BACKEND.key() + "=FILESYSTEM"
            + "@@" + FsStateBackendFactory.CHECKPOINT_DIRECTORY_URI_CONF_KEY + "=" + fsStateHandlePath + "/checkpoints"
            + "@@" + HighAvailabilityOptions.HA_STORAGE_PATH.key() + "=" + fsStateHandlePath + "/recovery";
    flinkYarnClient.setDynamicPropertiesEncoded(dynamicProperties);
    flinkYarnClient.setConfigurationFilePath(new Path(confDirPath + File.separator + "flink-conf.yaml"));

    ClusterClient yarnCluster = null;
    final FiniteDuration timeout = new FiniteDuration(2, TimeUnit.MINUTES);

    try {
        yarnCluster = flinkYarnClient.deploy();
        final Configuration config = yarnCluster.getFlinkConfiguration();

        new JavaTestKit(actorSystem) {
            {
                for (int attempt = 0; attempt < numberKillingAttempts; attempt++) {
                    new Within(timeout) {
                        @Override
                        protected void run() {
                            try {
                                LeaderRetrievalService lrs = LeaderRetrievalUtils.createLeaderRetrievalService(config);
                                ActorGateway gateway = LeaderRetrievalUtils.retrieveLeaderGateway(lrs, actorSystem, timeout);
                                // use this test kit as the sender so the reply arrives here
                                ActorGateway selfGateway = new AkkaActorGateway(getRef(), gateway.leaderSessionID());

                                gateway.tell(new TestingJobManagerMessages.NotifyWhenAtLeastNumTaskManagerAreRegistered(1), selfGateway);
                                expectMsgEquals(Acknowledge.get());

                                // kill the current leader
                                gateway.tell(PoisonPill.getInstance());
                            } catch (Exception e) {
                                throw new AssertionError("Could not complete test.", e);
                            }
                        }
                    };
                }

                // final round: only wait for the TaskManager registration, no kill
                new Within(timeout) {
                    @Override
                    protected void run() {
                        try {
                            LeaderRetrievalService lrs = LeaderRetrievalUtils.createLeaderRetrievalService(config);
                            ActorGateway gateway2 = LeaderRetrievalUtils.retrieveLeaderGateway(lrs, actorSystem, timeout);
                            ActorGateway selfGateway = new AkkaActorGateway(getRef(), gateway2.leaderSessionID());

                            gateway2.tell(new TestingJobManagerMessages.NotifyWhenAtLeastNumTaskManagerAreRegistered(1), selfGateway);
                            expectMsgEquals(Acknowledge.get());
                        } catch (Exception e) {
                            throw new AssertionError("Could not complete test.", e);
                        }
                    }
                };
            }
        };
    } finally {
        if (yarnCluster != null) {
            yarnCluster.shutdown();
        }
    }
}
use of scala.concurrent.duration.FiniteDuration in project flink by apache.
the class YarnFlinkApplicationMasterRunner method createRpcService.
// ------------------------------------------------------------------------
// Utilities
// ------------------------------------------------------------------------
/**
 * Starts an actor system through {@code BootstrapTools} and wraps it in an
 * {@code AkkaRpcService}. The timeout handed to the RPC service is the one that
 * {@code AkkaUtils.getTimeout} returns for the given configuration.
 *
 * @param configuration configuration passed to the actor system start-up and timeout lookup
 * @param bindAddress address passed to {@code BootstrapTools.startActorSystem}
 * @param portRange port range passed to {@code BootstrapTools.startActorSystem}
 * @return the RPC service created on top of the started actor system
 * @throws Exception if starting the actor system or creating the RPC service fails
 */
protected RpcService createRpcService(Configuration configuration, String bindAddress, String portRange) throws Exception {
    final ActorSystem rpcActorSystem =
        BootstrapTools.startActorSystem(configuration, bindAddress, portRange, LOG);

    // convert the Scala duration into the Time type expected by AkkaRpcService
    final FiniteDuration akkaTimeout = AkkaUtils.getTimeout(configuration);
    final Time rpcTimeout = Time.of(akkaTimeout.length(), akkaTimeout.unit());

    return new AkkaRpcService(rpcActorSystem, rpcTimeout);
}
use of scala.concurrent.duration.FiniteDuration in project flink by apache.
the class ExecutionGraphRestartTest method testConstraintsAfterRestart.
/**
 * Runs {@code validateConstraints} on a job with two co-located vertices in one slot sharing
 * group, once right after scheduling and once more after the job was restarted following a
 * failure.
 */
@Test
public void testConstraintsAfterRestart() throws Exception {
    // create an instance and make it available to a new scheduler
    final Instance taskManagerInstance = ExecutionGraphTestUtils.getInstance(
        new ActorTaskManagerGateway(new SimpleActorGateway(TestingUtils.directExecutionContext())),
        NUM_TASKS);
    final Scheduler slotScheduler = new Scheduler(TestingUtils.defaultExecutionContext());
    slotScheduler.newInstanceAvailable(taskManagerInstance);

    // two vertices that share one slot sharing group and are strictly co-located
    final JobVertex firstVertex = newJobVertex("Task1", NUM_TASKS, NoOpInvokable.class);
    final JobVertex secondVertex = newJobVertex("Task2", NUM_TASKS, NoOpInvokable.class);
    final SlotSharingGroup commonGroup = new SlotSharingGroup();
    firstVertex.setSlotSharingGroup(commonGroup);
    secondVertex.setSlotSharingGroup(commonGroup);
    firstVertex.setStrictlyCoLocatedWith(secondVertex);

    // build the execution graph and start it
    final JobGraph pointwiseJob = new JobGraph("Pointwise job", firstVertex, secondVertex);
    final FixedDelayRestartStrategy restartStrategy = new FixedDelayRestartStrategy(1, 0L);
    final ExecutionGraph executionGraph = newExecutionGraph(restartStrategy, slotScheduler);
    executionGraph.attachJobGraph(pointwiseJob.getVerticesSortedTopologicallyFromSources());
    assertEquals(JobStatus.CREATED, executionGraph.getState());

    executionGraph.scheduleForExecution();
    assertEquals(JobStatus.RUNNING, executionGraph.getState());

    // constraints must hold for the initial schedule
    validateConstraints(executionGraph);

    // trigger the automatic restart
    final FiniteDuration restartTimeout = new FiniteDuration(2, TimeUnit.MINUTES);
    restartAfterFailure(executionGraph, restartTimeout, false);

    // constraints must hold again for the restarted job
    validateConstraints(executionGraph);

    haltExecution(executionGraph);
}
use of scala.concurrent.duration.FiniteDuration in project flink by apache.
the class ExecutionGraphRestartTest method testFailingExecutionAfterRestart.
/**
 * Verifies that failing an execution attempt from before a restart leaves the restarted job
 * untouched. This matters when a callback handler fails an execution that has already reached
 * a final state while the job has been restarted in the meantime.
 */
@Test
public void testFailingExecutionAfterRestart() throws Exception {
    // create an instance and make it available to a new scheduler
    final Instance taskManagerInstance = ExecutionGraphTestUtils.getInstance(
        new ActorTaskManagerGateway(new SimpleActorGateway(TestingUtils.directExecutionContext())),
        2);
    final Scheduler slotScheduler = new Scheduler(TestingUtils.defaultExecutionContext());
    slotScheduler.newInstanceAvailable(taskManagerInstance);

    final JobVertex senderVertex = newJobVertex("Task1", 1, NoOpInvokable.class);
    final JobVertex receiverVertex = newJobVertex("Task2", 1, NoOpInvokable.class);
    final JobGraph pointwiseJob = new JobGraph("Pointwise job", senderVertex, receiverVertex);

    final FixedDelayRestartStrategy restartStrategy = new FixedDelayRestartStrategy(1, 1000);
    final ExecutionGraph executionGraph = newExecutionGraph(restartStrategy, slotScheduler);
    executionGraph.attachJobGraph(pointwiseJob.getVerticesSortedTopologicallyFromSources());
    assertEquals(JobStatus.CREATED, executionGraph.getState());

    executionGraph.scheduleForExecution();
    assertEquals(JobStatus.RUNNING, executionGraph.getState());

    // take the current attempts of the first two execution vertices
    final Iterator<ExecutionVertex> vertexIterator = executionGraph.getAllExecutionVertices().iterator();
    final Execution completedAttempt = vertexIterator.next().getCurrentExecutionAttempt();
    final Execution brokenAttempt = vertexIterator.next().getCurrentExecutionAttempt();

    // finish one attempt, fail the other and complete its cancellation
    completedAttempt.markFinished();
    brokenAttempt.fail(new Exception("Test Exception"));
    brokenAttempt.cancelingComplete();

    final FiniteDuration restartTimeout = new FiniteDuration(2, TimeUnit.MINUTES);
    waitForAsyncRestart(executionGraph, restartTimeout);
    assertEquals(JobStatus.RUNNING, executionGraph.getState());

    // the restart is asynchronous, so wait until every vertex has a resource again
    waitForAllResourcesToBeAssignedAfterAsyncRestart(executionGraph, restartTimeout.fromNow());

    // from here on every vertex is expected to have a resource assigned
    for (ExecutionVertex restartedVertex : executionGraph.getAllExecutionVertices()) {
        assertNotNull("No assigned resource (test instability).", restartedVertex.getCurrentAssignedResource());
        restartedVertex.getCurrentExecutionAttempt().switchToRunning();
    }

    // failing the attempt that finished before the restart must not affect the new run
    completedAttempt.fail(new Exception("This should have no effect"));

    for (ExecutionVertex restartedVertex : executionGraph.getAllExecutionVertices()) {
        restartedVertex.getCurrentExecutionAttempt().markFinished();
    }

    // FINISHED is terminal, so the old attempt keeps its state and the job finishes
    assertEquals(ExecutionState.FINISHED, completedAttempt.getState());
    assertEquals(JobStatus.FINISHED, executionGraph.getState());
}
use of scala.concurrent.duration.FiniteDuration in project flink by apache.
the class CoordinatorShutdownTest method testCoordinatorShutsDownOnSuccess.
/**
 * Tests that, once a submitted job has finished, its execution graph either has no checkpoint
 * coordinator or the coordinator reports that it is shut down.
 *
 * <p>Any exception raised while running the scenario is rethrown as an {@link AssertionError}
 * that carries the original exception as its cause.
 */
@Test
public void testCoordinatorShutsDownOnSuccess() {
    LocalFlinkMiniCluster cluster = null;
    try {
        Configuration config = new Configuration();
        config.setInteger(ConfigConstants.LOCAL_NUMBER_TASK_MANAGER, 1);
        config.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, 1);
        cluster = new LocalFlinkMiniCluster(config, true);
        cluster.start();

        // build a test graph with snapshotting enabled
        JobVertex vertex = new JobVertex("Test Vertex");
        vertex.setInvokableClass(BlockingInvokable.class);
        List<JobVertexID> vertexIdList = Collections.singletonList(vertex.getID());

        JobGraph testGraph = new JobGraph("test job", vertex);
        testGraph.setSnapshotSettings(new JobSnapshottingSettings(vertexIdList, vertexIdList, vertexIdList, 5000, 60000, 0L, Integer.MAX_VALUE, ExternalizedCheckpointSettings.none(), null, true));

        ActorGateway jmGateway = cluster.getLeaderGateway(TestingUtils.TESTING_DURATION());

        FiniteDuration timeout = new FiniteDuration(60, TimeUnit.SECONDS);
        JobManagerMessages.SubmitJob submitMessage = new JobManagerMessages.SubmitJob(testGraph, ListeningBehaviour.EXECUTION_RESULT);

        // submit is successful, but then the job blocks due to the invokable
        Future<Object> submitFuture = jmGateway.ask(submitMessage, timeout);
        Await.result(submitFuture, timeout);

        // get the execution graph and store the ExecutionGraph reference
        Future<Object> jobRequestFuture = jmGateway.ask(new JobManagerMessages.RequestJob(testGraph.getJobID()), timeout);
        ExecutionGraph graph = (ExecutionGraph) ((JobManagerMessages.JobFound) Await.result(jobRequestFuture, timeout)).executionGraph();
        assertNotNull(graph);

        // release the blocking invokable and wait for the job to finish
        BlockingInvokable.unblock();
        graph.waitUntilFinished();

        // verify that the coordinator was shut down
        CheckpointCoordinator coord = graph.getCheckpointCoordinator();
        assertTrue(coord == null || coord.isShutdown());
    } catch (Exception e) {
        // keep the original exception as the cause so the test report shows the real stack
        // trace; printing it and failing with only e.getMessage() loses it (and the message
        // may be null)
        throw new AssertionError("Could not complete test.", e);
    } finally {
        if (cluster != null) {
            cluster.shutdown();
            cluster.awaitTermination();
        }
    }
}
Aggregations