use of scala.concurrent.duration.FiniteDuration in project flink by apache.
the class KvStateClientTest method testClientServerIntegration.
* Tests multiple clients querying multiple servers until 100k queries have
* been processed. At this point, the client is shut down and it is verified
* that all ongoing requests have failed.
// Integration test: creates numServers KvStateServer instances, registers one
// value state per server, and has client tasks query the servers in batches
// through a single KvStateClient until the client stats report 100_000 requests.
// Afterwards the task futures are expected to fail and the connection counters
// on both the client and the servers are expected to drop to 0.
// NOTE(review): this listing looks like a lossy extraction - closing braces and
// several statements (loop bodies, logger calls, shutdown calls) are missing.
// Confirm against the upstream source before changing any logic.
public void testClientServerIntegration() throws Exception {
// Config
final int numServers = 2;
final int numServerEventLoopThreads = 2;
final int numServerQueryThreads = 2;
final int numClientEventLoopThreads = 4;
final int numClientsTasks = 8;
final int batchSize = 16;
final int numKeyGroups = 1;
// A single in-memory keyed backend provides the state for every server.
AbstractStateBackend abstractBackend = new MemoryStateBackend();
KvStateRegistry dummyRegistry = new KvStateRegistry();
DummyEnvironment dummyEnv = new DummyEnvironment("test", 1, 0);
AbstractKeyedStateBackend<Integer> backend = abstractBackend.createKeyedStateBackend(dummyEnv, new JobID(), "test_op", IntSerializer.INSTANCE, numKeyGroups, new KeyGroupRange(0, 0), dummyRegistry.createTaskRegistry(new JobID(), new JobVertexID()));
// Upper bound for awaiting each individual query result (see Await.result below).
final FiniteDuration timeout = new FiniteDuration(10, TimeUnit.SECONDS);
AtomicKvStateRequestStats clientStats = new AtomicKvStateRequestStats();
// Declared outside the try block so the finally block can see them.
KvStateClient client = null;
ExecutorService clientTaskExecutor = null;
final KvStateServer[] server = new KvStateServer[numServers];
try {
client = new KvStateClient(numClientEventLoopThreads, clientStats);
clientTaskExecutor = Executors.newFixedThreadPool(numClientsTasks);
// Create state
ValueStateDescriptor<Integer> desc = new ValueStateDescriptor<>("any", IntSerializer.INSTANCE);
// Create servers
KvStateRegistry[] registry = new KvStateRegistry[numServers];
AtomicKvStateRequestStats[] serverStats = new AtomicKvStateRequestStats[numServers];
final KvStateID[] ids = new KvStateID[numServers];
for (int i = 0; i < numServers; i++) {
registry[i] = new KvStateRegistry();
serverStats[i] = new AtomicKvStateRequestStats();
// The port argument is 0; presumably this means "bind to any free port" - TODO confirm.
server[i] = new KvStateServer(InetAddress.getLocalHost(), 0, numServerEventLoopThreads, numServerQueryThreads, registry[i], serverStats[i]);
// Server i is later queried with key 1010 + i and expected to return 201 + i.
backend.setCurrentKey(1010 + i);
// Value per server
ValueState<Integer> state = backend.getPartitionedState(VoidNamespace.INSTANCE, VoidNamespaceSerializer.INSTANCE, desc);
state.update(201 + i);
// We know it must be a KvState, but this is not exposed to the user via State
InternalKvState<?> kvState = (InternalKvState<?>) state;
// Register KvState (one state instance for all servers)
ids[i] = registry[i].registerKvState(new JobID(), new JobVertexID(), new KeyGroupRange(0, 0), "any", kvState);
// Final alias so the anonymous Callable below can capture the client.
final KvStateClient finalClient = client;
Callable<Void> queryTask = new Callable<Void>() {
public Void call() throws Exception {
// Endless query loop; the only visible exit is the interrupt check or an exception.
while (true) {
if (Thread.interrupted()) {
throw new InterruptedException();
// Random server permutation
List<Integer> random = new ArrayList<>();
// NOTE(review): the loop body that fills `random` is missing from this listing.
for (int j = 0; j < batchSize; j++) {
// Dispatch queries
List<Future<byte[]>> futures = new ArrayList<>(batchSize);
for (int j = 0; j < batchSize; j++) {
// The target server is derived from the j-th entry of `random`.
int targetServer = random.get(j) % numServers;
byte[] serializedKeyAndNamespace = KvStateRequestSerializer.serializeKeyAndNamespace(1010 + targetServer, IntSerializer.INSTANCE, VoidNamespace.INSTANCE, VoidNamespaceSerializer.INSTANCE);
futures.add(finalClient.getKvState(server[targetServer].getAddress(), ids[targetServer], serializedKeyAndNamespace));
// Verify results
for (int j = 0; j < batchSize; j++) {
// Recompute the same target as in the dispatch loop to know the expected value.
int targetServer = random.get(j) % numServers;
Future<byte[]> future = futures.get(j);
byte[] buf = Await.result(future, timeout);
int value = KvStateRequestSerializer.deserializeValue(buf, IntSerializer.INSTANCE);
assertEquals(201 + targetServer, value);
// Submit tasks
List<java.util.concurrent.Future<Void>> taskFutures = new ArrayList<>();
// NOTE(review): the statement that submits queryTask and stores its future is not visible here.
for (int i = 0; i < numClientsTasks; i++) {
long numRequests;
// Poll every 100 ms until the client has recorded at least 100_000 requests.
while ((numRequests = clientStats.getNumRequests()) < 100_000) {
Thread.sleep(100);"Number of requests {}/100_000", numRequests);
// Shut down
// Each task future is expected to fail with an ExecutionException whose cause is
// a ClosedChannelException or an IllegalStateException.
// NOTE(review): the call that waits on `future` inside the try is missing from this listing.
for (java.util.concurrent.Future<Void> future : taskFutures) {
try {
fail("Did not throw expected Exception after shut down");
} catch (ExecutionException t) {
if (t.getCause() instanceof ClosedChannelException || t.getCause() instanceof IllegalStateException) {
// Expected
} else {
// NOTE(review): this reports the class of the ExecutionException wrapper, not of its
// cause; t.getCause().getClass() would be more informative.
fail("Failed with unexpected Exception type: " + t.getClass().getName());
assertEquals("Connection leak (client)", 0, clientStats.getNumConnections());
// The server-side check is retried with a growing sleep of (numRetries + 1) * 50 ms and
// rethrows the assertion failure once numRetries reaches 10.
// NOTE(review): the increment of numRetries is not visible in this listing.
for (int i = 0; i < numServers; i++) {
boolean success = false;
int numRetries = 0;
while (!success) {
try {
assertEquals("Connection leak (server)", 0, serverStats[i].getNumConnections());
success = true;
} catch (Throwable t) {
if (numRetries < 10) {"Retrying connection leak check (server)");
Thread.sleep((numRetries + 1) * 50);
} else {
throw t;
} finally {
// Null guards for resources that may not have been created before a failure.
// NOTE(review): the guarded shutdown calls themselves are missing from this listing.
if (client != null) {
for (int i = 0; i < numServers; i++) {
if (server[i] != null) {
if (clientTaskExecutor != null) {
the class ProcessFailureCancelingITCase method waitUntilNumTaskManagersAreRegistered.
// Repeatedly asks the JobManager for its number of registered TaskManagers and
// compares the answer with numExpected; fails the test once maxDelay
// milliseconds have elapsed.
// NOTE(review): closing braces and the body of the success branch are missing
// from this listing (presumably the method returns on a match - confirm upstream).
private void waitUntilNumTaskManagersAreRegistered(ActorRef jobManager, int numExpected, long maxDelay) throws Exception {
// Absolute wall-clock deadline derived from the relative maxDelay.
final long deadline = System.currentTimeMillis() + maxDelay;
while (true) {
long remaining = deadline - System.currentTimeMillis();
if (remaining <= 0) {
fail("The TaskManagers did not register within the expected time (" + maxDelay + "msecs)");
// Each ask is bounded by the time still left until the overall deadline.
FiniteDuration timeout = new FiniteDuration(remaining, TimeUnit.MILLISECONDS);
try {
Future<?> result = Patterns.ask(jobManager, JobManagerMessages.getRequestNumberRegisteredTaskManager(), new Timeout(timeout));
// A response that is not an Integer surfaces as the ClassCastException handled below.
Integer numTMs = (Integer) Await.result(result, timeout);
if (numTMs == numExpected) {
} catch (TimeoutException e) {
// ignore and retry
} catch (ClassCastException e) {
fail("Wrong response: " + e.getMessage());
the class ProcessFailureCancelingITCase method cancelRunningJob.
// Asks the JobManager for its running jobs, retrying for at most 30 seconds, and
// sends a CancelJob message for the job id that was obtained.
// NOTE(review): closing braces are missing from this listing, and neither an early
// loop exit after the job id is found nor an early return for the `jobId == null`
// case is visible; both are presumably present upstream - confirm before editing.
private void cancelRunningJob(ActorRef jobManager) throws Exception {
// Timeout for each individual ask and for awaiting its response.
final FiniteDuration askTimeout = new FiniteDuration(10, TimeUnit.SECONDS);
// try at most for 30 seconds
final long deadline = System.currentTimeMillis() + 30000;
JobID jobId = null;
do {
Future<Object> response = Patterns.ask(jobManager, JobManagerMessages.getRequestRunningJobsStatus(), new Timeout(askTimeout));
Object result;
try {
result = Await.result(response, askTimeout);
} catch (Exception e) {
// Wrap with context while preserving the original cause.
throw new Exception("Could not retrieve running jobs from the JobManager.", e);
// Responses of any other type are ignored and the loop asks again.
if (result instanceof JobManagerMessages.RunningJobsStatus) {
List<JobStatusMessage> jobs = ((JobManagerMessages.RunningJobsStatus) result).getStatusMessages();
// The job id is only taken when exactly one job is reported.
if (jobs.size() == 1) {
jobId = jobs.get(0).getJobId();
} while (System.currentTimeMillis() < deadline);
if (jobId == null) {
// we never found it running, must have failed already
// tell the JobManager to cancel the job
jobManager.tell(new JobManagerMessages.CancelJob(jobId), ActorRef.noSender());
the class ChaosMonkeyITCase method testChaosMonkey.
// Chaos test: submits a checkpointed job and then, totalNumberOfKills times, waits
// killEvery, checks the job status and randomly replaces either the leading
// JobManager or a random TaskManager. Finally waits for the job to finish, for its
// removal, and checks that the recovery state is clean.
// NOTE(review): this listing looks like a lossy extraction - closing braces, logger
// call prefixes and many statements (process start/kill/destroy calls, loop bodies)
// are missing. Confirm against the upstream source before changing any logic.
public void testChaosMonkey() throws Exception {
// Test config
final int numberOfJobManagers = 3;
final int numberOfTaskManagers = 3;
final int numberOfSlotsPerTaskManager = 2;
// The final count each source is counting to: 1...n
final int n = 5000;
// Parallelism for the program
final int parallelism = numberOfTaskManagers * numberOfSlotsPerTaskManager;
// The test should not run longer than this
final FiniteDuration testDuration = new FiniteDuration(10, TimeUnit.MINUTES);
// Every x seconds a random job or task manager is killed
// The job will be running for $killEvery seconds and then a random Job/TaskManager
// will be killed. On recovery (which takes some time to bring up the new process etc.),
// this test will wait for task managers to reconnect before starting the next count down.
// Therefore the delay between retries is not important in this setup.
final FiniteDuration killEvery = new FiniteDuration(5, TimeUnit.SECONDS);
// Trigger a checkpoint every
final int checkpointingIntervalMs = 1000;
// Total number of kills
final int totalNumberOfKills = 10;
// -----------------------------------------------------------------------------------------
// Setup
Configuration config = ZooKeeperTestUtils.createZooKeeperHAConfig(ZooKeeper.getConnectString(), FileStateBackendBasePath.toURI().toString());
// Akka and restart timeouts
config.setString(ConfigConstants.AKKA_WATCH_HEARTBEAT_INTERVAL, "1000 ms");
config.setString(ConfigConstants.AKKA_WATCH_HEARTBEAT_PAUSE, "6 s");
config.setInteger(ConfigConstants.AKKA_WATCH_THRESHOLD, 9);
// Sanity check of the constants above: the checkpoint interval must be shorter than
// the kill interval.
// NOTE(review): the message uses integer division by 1000, so an interval below
// 1000 ms would be reported as "0 seconds".
if (checkpointingIntervalMs >= killEvery.toMillis()) {
throw new IllegalArgumentException("Relax! You want to kill processes every " + killEvery + ", but the checkpointing interval is " + checkpointingIntervalMs / 1000 + " seconds. Either decrease the interval or " + "increase the kill interval. Otherwise, the program will not complete any " + "checkpoint.");
// Task manager
config.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, numberOfSlotsPerTaskManager);
// Declared outside the try block so the finally block can see them.
ActorSystem testActorSystem = null;
LeaderRetrievalService leaderRetrievalService = null;
List<JobManagerProcess> jobManagerProcesses = new ArrayList<>();
List<TaskManagerProcess> taskManagerProcesses = new ArrayList<>();
try {
// Initial state
// NOTE(review): the bodies of the two start-up loops are missing from this listing.
for (int i = 0; i < numberOfJobManagers; i++) {
for (int i = 0; i < numberOfTaskManagers; i++) {
testActorSystem = AkkaUtils.createDefaultActorSystem();
// Leader listener
leaderRetrievalService = ZooKeeperUtils.createLeaderRetrievalService(config);
TestingListener leaderListener = new TestingListener();
// One overall deadline; every wait below is given deadline.timeLeft().
Deadline deadline = testDuration.fromNow();
// Wait for the new leader
int leaderIndex = waitForNewLeader(leaderListener, jobManagerProcesses, deadline.timeLeft());
// Wait for the task managers to connect
waitForTaskManagers(numberOfTaskManagers, jobManagerProcesses.get(leaderIndex), testActorSystem, deadline.timeLeft());
// The job
JobGraph jobGraph = createJobGraph(n, CheckpointCompletedCoordination.getPath(), ProceedCoordination.getPath(), parallelism, checkpointingIntervalMs);"Submitting job {}", jobGraph.getJobID());
submitJobGraph(jobGraph, jobManagerProcesses.get(leaderIndex), leaderListener, testActorSystem, deadline.timeLeft());"Waiting for a checkpoint to complete before kicking off chaos");
// Wait for a checkpoint to complete
TestJvmProcess.waitForMarkerFiles(FileStateBackendBasePath, COMPLETED_PREFIX, parallelism, deadline.timeLeft().toMillis());"Checkpoint completed... ready for chaos");
// Counters that feed the log messages and the final kill summary.
int currentKillNumber = 1;
int currentJobManagerKills = 0;
int currentTaskManagerKills = 0;
for (int i = 0; i < totalNumberOfKills; i++) {"Waiting for {} before next kill ({}/{})", killEvery, currentKillNumber++, totalNumberOfKills);
Thread.sleep(killEvery.toMillis());"Checking job status...");
JobStatus jobStatus = requestJobStatus(jobGraph.getJobID(), jobManagerProcesses.get(leaderIndex), testActorSystem, deadline.timeLeft());
// Three cases: neither RUNNING nor FINISHED (wait for RUNNING), FINISHED, or RUNNING.
if (jobStatus != JobStatus.RUNNING && jobStatus != JobStatus.FINISHED) {
// Wait for it to run"Waiting for job status {}", JobStatus.RUNNING);
waitForJobRunning(jobGraph.getJobID(), jobManagerProcesses.get(leaderIndex), testActorSystem, deadline.timeLeft());
} else if (jobStatus == JobStatus.FINISHED) {
// Early finish"Job finished");
} else {"Job status is {}", jobStatus);
// A coin flip decides whether the leading JobManager or a TaskManager is replaced.
if (rand.nextBoolean()) {"Killing the leading JobManager");
JobManagerProcess newJobManager = createAndStartJobManagerProcess(config);
JobManagerProcess leader = jobManagerProcesses.remove(leaderIndex);
currentJobManagerKills++;"Killed {}", leader);
// Make sure to add the new job manager before looking for a new leader
// Wait for the new leader
leaderIndex = waitForNewLeader(leaderListener, jobManagerProcesses, deadline.timeLeft());
// Wait for the task managers to connect
waitForTaskManagers(numberOfTaskManagers, jobManagerProcesses.get(leaderIndex), testActorSystem, deadline.timeLeft());
} else {"Killing a random TaskManager");
TaskManagerProcess newTaskManager = createAndStartTaskManagerProcess(config);
// Wait for this new task manager to be connected
waitForTaskManagers(numberOfTaskManagers + 1, jobManagerProcesses.get(leaderIndex), testActorSystem, deadline.timeLeft());
// Now it's safe to kill a process
int next = rand.nextInt(numberOfTaskManagers);
TaskManagerProcess taskManager = taskManagerProcesses.remove(next);"{} has been chosen. Killing process...", taskManager);
// Add the new task manager after killing an old one
}"Chaos is over. Total kills: {} ({} job manager + {} task managers). " + "Checking job status...", totalNumberOfKills, currentJobManagerKills, currentTaskManagerKills);
// Signal the job to speed up (if it is not done yet)
// Wait for the job to finish"Waiting for job status {}", JobStatus.FINISHED);
waitForJobFinished(jobGraph.getJobID(), jobManagerProcesses.get(leaderIndex), testActorSystem, deadline.timeLeft());"Job finished");"Waiting for job removal");
waitForJobRemoved(jobGraph.getJobID(), jobManagerProcesses.get(leaderIndex), testActorSystem, deadline.timeLeft());"Job removed");"Checking clean recovery state...");
checkCleanRecoveryState(config);"Recovery state clean");
} catch (Throwable t) {
// Print early (in some situations the process logs get too big
// for Travis and the root problem is not shown)
// NOTE(review): the per-process print statements inside the two loops are missing here.
System.out.println(" TASK MANAGERS");
for (TaskManagerProcess taskManagerProcess : taskManagerProcesses) {
System.out.println(" JOB MANAGERS");
for (JobManagerProcess jobManagerProcess : jobManagerProcesses) {
// Rethrow so the test still fails after the diagnostics were printed.
throw t;
} finally {
// Null guards for resources that may not have been created before a failure.
// NOTE(review): the guarded cleanup calls themselves are missing from this listing.
for (JobManagerProcess jobManagerProcess : jobManagerProcesses) {
if (jobManagerProcess != null) {
if (leaderRetrievalService != null) {
if (testActorSystem != null) {
the class TaskManagerFailureRecoveryITCase method testRestartWithFailingTaskManager.
// Recovery test: runs a job on a local mini cluster configured with two TaskManagers,
// starts an additional TaskManager, sends a PoisonPill to the TaskManagers returned by
// cluster.getTaskManagersAsJava(), and then lets the mappers continue.
// NOTE(review): this listing looks like a lossy extraction - closing braces and many
// statements (cluster start, trigger thread body, queue waits, result checks, cleanup
// calls) are missing. Confirm against the upstream source before changing any logic.
public void testRestartWithFailingTaskManager() {
final int PARALLELISM = 4;
// Declared outside the try block so the finally block can see them.
LocalFlinkMiniCluster cluster = null;
ActorSystem additionalSystem = null;
try {
Configuration config = new Configuration();
config.setInteger(ConfigConstants.LOCAL_NUMBER_TASK_MANAGER, 2);
config.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, PARALLELISM);
config.setInteger(ConfigConstants.TASK_MANAGER_MEMORY_SIZE_KEY, 16);
config.setString(ConfigConstants.AKKA_WATCH_HEARTBEAT_INTERVAL, "500 ms");
config.setString(ConfigConstants.AKKA_WATCH_HEARTBEAT_PAUSE, "20 s");
config.setInteger(ConfigConstants.AKKA_WATCH_THRESHOLD, 20);
cluster = new LocalFlinkMiniCluster(config, false);
// for the result
List<Long> resultCollection = new ArrayList<Long>();
final ExecutionEnvironment env = ExecutionEnvironment.createRemoteEnvironment("localhost", cluster.getLeaderRPCPort());
// Presumably one restart attempt with a 1000 ms delay - TODO confirm the argument meaning.
env.setRestartStrategy(RestartStrategies.fixedDelayRestart(1, 1000));
// Sums the sequence 1..10 after passing it through FailingMapper.
env.generateSequence(1, 10).map(new FailingMapper<Long>()).reduce(new ReduceFunction<Long>() {
public Long reduce(Long value1, Long value2) {
return value1 + value2;
}).output(new LocalCollectionOutputFormat<Long>(resultCollection));
// simple reference (atomic does not matter) to pass back an exception from the trigger thread
final AtomicReference<Throwable> ref = new AtomicReference<Throwable>();
// trigger the execution from a separate thread, so we are available to tamper with the
// cluster during the execution
Thread trigger = new Thread("program trigger") {
public void run() {
try {
} catch (Throwable t) {
// the mappers in turn are waiting
for (int i = 0; i < PARALLELISM; i++) {
// bring up one more task manager and wait for it to appear
additionalSystem = cluster.startTaskManagerActorSystem(2);
ActorRef additionalTaskManager = cluster.startTaskManager(2, additionalSystem);
Object message = TaskManagerMessages.getNotifyWhenRegisteredAtJobManagerMessage();
// Both the ask and the await are bounded by 30000 ms, matching the failure message below.
Future<Object> future = Patterns.ask(additionalTaskManager, message, 30000);
try {
Await.result(future, new FiniteDuration(30000, TimeUnit.MILLISECONDS));
} catch (TimeoutException e) {
fail("The additional TaskManager did not come up within 30 seconds");
// kill the two other TaskManagers
for (ActorRef tm : cluster.getTaskManagersAsJava()) {
// NOTE(review): the sender is passed as null here; ActorRef.noSender() would state the intent explicitly.
tm.tell(PoisonPill.getInstance(), null);
// wait for the next set of mappers (the recovery ones) to come online
for (int i = 0; i < PARALLELISM; i++) {
// tell the mappers that they may continue this time
// One token per parallel mapper is added to the coordination queue.
for (int i = 0; i < PARALLELISM; i++) {
FailingMapper.COORD_TO_TASK_QUEUE.add(new Object());
// wait for the program to finish
if (ref.get() != null) {
Throwable t = ref.get();
// NOTE(review): only the message is reported; the stack trace of t is not part of the failure.
fail("Program execution caused an exception: " + t.getMessage());
} catch (Exception e) {
} finally {
// Null guards for resources that may not have been created before a failure.
// NOTE(review): the guarded shutdown calls themselves are missing from this listing.
if (additionalSystem != null) {
if (cluster != null) {