Use of scala.concurrent.Future in project flink by apache.
The class KvStateClientTest, method testFailureClosesChannel.
/**
 * Tests that a server failure closes the connection and removes it from
 * the established connections.
 */
@Test
public void testFailureClosesChannel() throws Exception {
    Deadline deadline = TEST_TIMEOUT.fromNow();
    AtomicKvStateRequestStats stats = new AtomicKvStateRequestStats();
    KvStateClient client = null;
    Channel serverChannel = null;
    try {
        client = new KvStateClient(1, stats);
        final LinkedBlockingQueue<ByteBuf> received = new LinkedBlockingQueue<>();
        final AtomicReference<Channel> channel = new AtomicReference<>();
        serverChannel = createServerChannel(new ChannelInboundHandlerAdapter() {
            @Override
            public void channelActive(ChannelHandlerContext ctx) throws Exception {
                channel.set(ctx.channel());
            }

            @Override
            public void channelRead(ChannelHandlerContext ctx, Object msg) throws Exception {
                received.add((ByteBuf) msg);
            }
        });
        KvStateServerAddress serverAddress = getKvStateServerAddress(serverChannel);

        // Requests
        List<Future<byte[]>> futures = new ArrayList<>();
        futures.add(client.getKvState(serverAddress, new KvStateID(), new byte[0]));
        futures.add(client.getKvState(serverAddress, new KvStateID(), new byte[0]));

        ByteBuf buf = received.poll(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);
        assertNotNull("Receive timed out", buf);
        buf.release();

        buf = received.poll(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);
        assertNotNull("Receive timed out", buf);
        buf.release();

        assertEquals(1, stats.getNumConnections());

        Channel ch = channel.get();
        assertNotNull("Channel not active", ch);

        // Respond with failure
        ch.writeAndFlush(KvStateRequestSerializer.serializeServerFailure(
                serverChannel.alloc(),
                new RuntimeException("Expected test server failure")));

        try {
            Await.result(futures.remove(0), deadline.timeLeft());
            fail("Did not throw expected server failure");
        } catch (RuntimeException ignored) {
            // Expected
        }

        try {
            Await.result(futures.remove(0), deadline.timeLeft());
            fail("Did not throw expected server failure");
        } catch (RuntimeException ignored) {
            // Expected
        }

        assertEquals(0, stats.getNumConnections());

        // Counts can take some time to propagate
        while (deadline.hasTimeLeft() && (stats.getNumSuccessful() != 0 || stats.getNumFailed() != 2)) {
            Thread.sleep(100);
        }

        assertEquals(2, stats.getNumRequests());
        assertEquals(0, stats.getNumSuccessful());
        assertEquals(2, stats.getNumFailed());
    } finally {
        if (client != null) {
            client.shutDown();
        }
        if (serverChannel != null) {
            serverChannel.close();
        }
        assertEquals("Channel leak", 0, stats.getNumConnections());
    }
}
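The essential scala.concurrent.Future idiom in this test is that a serialized server failure completes the client-side Future exceptionally, and Await.result re-throws that failure on the awaiting thread. A minimal sketch of the idiom, assuming only the Scala standard library; the class and helper names are hypothetical:

import scala.concurrent.Await;
import scala.concurrent.Future;
import scala.concurrent.duration.Deadline;

final class FutureFailureAssertions {

    // Hypothetical helper: await a Future that is expected to fail.
    static void expectServerFailure(Future<byte[]> future, Deadline deadline) throws Exception {
        try {
            // Await.result re-throws the exception a failed Future was completed with.
            Await.result(future, deadline.timeLeft());
            throw new AssertionError("Future completed although a server failure was expected");
        } catch (RuntimeException expected) {
            // Expected: the client completed the Future with the server's failure.
        }
    }
}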
Use of scala.concurrent.Future in project flink by apache.
The class TestBaseUtils, method stopCluster.
public static void stopCluster(LocalFlinkMiniCluster executor, FiniteDuration timeout) throws Exception {
    if (logDir != null) {
        FileUtils.deleteDirectory(logDir);
    }
    if (executor != null) {
        int numUnreleasedBCVars = 0;
        int numActiveConnections = 0;
        if (executor.running()) {
            List<ActorRef> tms = executor.getTaskManagersAsJava();
            List<Future<Object>> bcVariableManagerResponseFutures = new ArrayList<>();
            List<Future<Object>> numActiveConnectionsResponseFutures = new ArrayList<>();

            for (ActorRef tm : tms) {
                bcVariableManagerResponseFutures.add(Patterns.ask(
                        tm,
                        TaskManagerMessages.getRequestBroadcastVariablesWithReferences(),
                        new Timeout(timeout)));
                numActiveConnectionsResponseFutures.add(Patterns.ask(
                        tm,
                        TaskManagerMessages.getRequestNumActiveConnections(),
                        new Timeout(timeout)));
            }

            Future<Iterable<Object>> bcVariableManagerFutureResponses =
                    Futures.sequence(bcVariableManagerResponseFutures, defaultExecutionContext());
            Iterable<Object> responses = Await.result(bcVariableManagerFutureResponses, timeout);
            for (Object response : responses) {
                numUnreleasedBCVars += ((TaskManagerMessages.ResponseBroadcastVariablesWithReferences) response).number();
            }

            Future<Iterable<Object>> numActiveConnectionsFutureResponses =
                    Futures.sequence(numActiveConnectionsResponseFutures, defaultExecutionContext());
            responses = Await.result(numActiveConnectionsFutureResponses, timeout);
            for (Object response : responses) {
                numActiveConnections += ((TaskManagerMessages.ResponseNumActiveConnections) response).number();
            }
        }
        executor.stop();
        FileSystem.closeAll();
        System.gc();
        Assert.assertEquals("Not all broadcast variables were released.", 0, numUnreleasedBCVars);
        Assert.assertEquals("Not all TCP connections were released.", 0, numActiveConnections);
    }
}
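The fan-out pattern above, one Patterns.ask per TaskManager, Futures.sequence to combine the responses, then a single Await.result, can be distilled into a generic helper. A minimal sketch, assuming classic Akka; the class and helper names are hypothetical:

import java.util.ArrayList;
import java.util.List;

import akka.actor.ActorRef;
import akka.dispatch.Futures;
import akka.pattern.Patterns;
import akka.util.Timeout;
import scala.concurrent.Await;
import scala.concurrent.ExecutionContext;
import scala.concurrent.Future;
import scala.concurrent.duration.FiniteDuration;

final class ActorQueries {

    // Hypothetical helper: ask every actor the same message and collect all answers.
    static Iterable<Object> askAll(
            List<ActorRef> actors,
            Object message,
            FiniteDuration timeout,
            ExecutionContext ec) throws Exception {

        List<Future<Object>> futures = new ArrayList<>();
        for (ActorRef actor : actors) {
            futures.add(Patterns.ask(actor, message, new Timeout(timeout)));
        }
        // sequence() turns a List<Future<Object>> into a single Future<Iterable<Object>>,
        // which fails as soon as any individual ask fails or times out.
        Future<Iterable<Object>> combined = Futures.sequence(futures, ec);
        return Await.result(combined, timeout);
    }
}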
Use of scala.concurrent.Future in project flink by apache.
The class JobManagerHAJobGraphRecoveryITCase, method testClientNonDetachedListeningBehaviour.
/**
 * Tests that clients receive updates after recovery by a new leader.
 */
@Test
public void testClientNonDetachedListeningBehaviour() throws Exception {
    Configuration config = ZooKeeperTestUtils.createZooKeeperHAConfig(ZooKeeper.getConnectString(), FileStateBackendBasePath.getPath());

    // Test actor system
    ActorSystem testSystem = null;

    // JobManager setup. Start the job managers as separate processes so that their
    // actors' postStop, which cleans up all running jobs, is not run.
    JobManagerProcess[] jobManagerProcess = new JobManagerProcess[2];
    LeaderRetrievalService leaderRetrievalService = null;
    ActorSystem taskManagerSystem = null;
    try {
        final Deadline deadline = TestTimeOut.fromNow();

        // Test actor system
        testSystem = AkkaUtils.createActorSystem(new Configuration(), new Some<>(new Tuple2<String, Object>("localhost", 0)));

        // The job managers
        jobManagerProcess[0] = new JobManagerProcess(0, config);
        jobManagerProcess[1] = new JobManagerProcess(1, config);
        jobManagerProcess[0].startProcess();
        jobManagerProcess[1].startProcess();

        // Leader listener
        TestingListener leaderListener = new TestingListener();
        leaderRetrievalService = ZooKeeperUtils.createLeaderRetrievalService(config);
        leaderRetrievalService.start(leaderListener);

        // The task manager
        taskManagerSystem = AkkaUtils.createActorSystem(AkkaUtils.getDefaultAkkaConfig());
        TaskManager.startTaskManagerComponentsAndActor(config, ResourceID.generate(), taskManagerSystem, "localhost", Option.<String>empty(), Option.<LeaderRetrievalService>empty(), false, TaskManager.class);

        // Client test actor
        TestActorRef<RecordingTestClient> clientRef = TestActorRef.create(testSystem, Props.create(RecordingTestClient.class));

        JobGraph jobGraph = createBlockingJobGraph();

        {
            // Initial submission
            leaderListener.waitForNewLeader(deadline.timeLeft().toMillis());
            String leaderAddress = leaderListener.getAddress();
            UUID leaderId = leaderListener.getLeaderSessionID();

            // The client
            AkkaActorGateway client = new AkkaActorGateway(clientRef, leaderId);

            // Get the leader ref
            ActorRef leaderRef = AkkaUtils.getActorRef(leaderAddress, testSystem, deadline.timeLeft());
            ActorGateway leader = new AkkaActorGateway(leaderRef, leaderId);

            int numSlots = 0;
            while (numSlots == 0) {
                Future<?> slotsFuture = leader.ask(JobManagerMessages.getRequestTotalNumberOfSlots(), deadline.timeLeft());
                numSlots = (Integer) Await.result(slotsFuture, deadline.timeLeft());
            }

            // Submit the job in non-detached mode
            leader.tell(new SubmitJob(jobGraph, ListeningBehaviour.EXECUTION_RESULT_AND_STATE_CHANGES), client);
            JobManagerActorTestUtils.waitForJobStatus(jobGraph.getJobID(), JobStatus.RUNNING, leader, deadline.timeLeft());
        }

        // Who's the boss?
        JobManagerProcess leadingJobManagerProcess;
        if (jobManagerProcess[0].getJobManagerAkkaURL(deadline.timeLeft()).equals(leaderListener.getAddress())) {
            leadingJobManagerProcess = jobManagerProcess[0];
        } else {
            leadingJobManagerProcess = jobManagerProcess[1];
        }

        // Kill the leading job manager process
        leadingJobManagerProcess.destroy();

        {
            // Recovery by the standby JobManager
            leaderListener.waitForNewLeader(deadline.timeLeft().toMillis());
            String leaderAddress = leaderListener.getAddress();
            UUID leaderId = leaderListener.getLeaderSessionID();

            ActorRef leaderRef = AkkaUtils.getActorRef(leaderAddress, testSystem, deadline.timeLeft());
            ActorGateway leader = new AkkaActorGateway(leaderRef, leaderId);
            JobManagerActorTestUtils.waitForJobStatus(jobGraph.getJobID(), JobStatus.RUNNING, leader, deadline.timeLeft());

            // Cancel the job
            leader.tell(new JobManagerMessages.CancelJob(jobGraph.getJobID()));
        }

        // Wait for the execution result
        clientRef.underlyingActor().awaitJobResult(deadline.timeLeft().toMillis());

        int jobSubmitSuccessMessages = 0;
        for (Object msg : clientRef.underlyingActor().getMessages()) {
            if (msg instanceof JobManagerMessages.JobSubmitSuccess) {
                jobSubmitSuccessMessages++;
            }
        }

        // At least two submissions should be ack-ed (the initial one and the one after
        // recovery). This is quite conservative, but it is still possible that these
        // messages are overtaken by the final message.
        assertEquals(2, jobSubmitSuccessMessages);
    } catch (Throwable t) {
        // Print early (in some situations the process logs get too big
        // for Travis and the root problem is not shown)
        t.printStackTrace();

        // In case of an error, print the job manager process logs.
        if (jobManagerProcess[0] != null) {
            jobManagerProcess[0].printProcessLog();
        }
        if (jobManagerProcess[1] != null) {
            jobManagerProcess[1].printProcessLog();
        }
        throw t;
    } finally {
        if (jobManagerProcess[0] != null) {
            jobManagerProcess[0].destroy();
        }
        if (jobManagerProcess[1] != null) {
            jobManagerProcess[1].destroy();
        }
        if (leaderRetrievalService != null) {
            leaderRetrievalService.stop();
        }
        if (taskManagerSystem != null) {
            taskManagerSystem.shutdown();
        }
        if (testSystem != null) {
            testSystem.shutdown();
        }
    }
}
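The slot-polling loop above is a recurring pattern in these tests: ask, block, retry, with every blocking call bounded by the remaining time of one shared Deadline so the overall test cannot hang. A minimal sketch of that loop as a standalone helper, assuming Flink's runtime classes on the classpath; the class and helper names are hypothetical:

import org.apache.flink.runtime.instance.ActorGateway;
import org.apache.flink.runtime.messages.JobManagerMessages;
import scala.concurrent.Await;
import scala.concurrent.Future;
import scala.concurrent.duration.Deadline;

final class SlotPolling {

    // Hypothetical helper: poll the leading JobManager until slots are registered.
    static int waitForSlots(ActorGateway leader, Deadline deadline) throws Exception {
        int numSlots = 0;
        while (numSlots == 0 && deadline.hasTimeLeft()) {
            Future<?> slotsFuture =
                    leader.ask(JobManagerMessages.getRequestTotalNumberOfSlots(), deadline.timeLeft());
            // Each round blocks for at most the time left on the shared deadline.
            numSlots = (Integer) Await.result(slotsFuture, deadline.timeLeft());
        }
        return numSlots;
    }
}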
Use of scala.concurrent.Future in project flink by apache.
The class AbstractQueryableStateITCase, method testQueryableState.
/**
 * Runs a simple topology producing random (key, 1) pairs at the sources (where
 * the number of keys is fixed in the range 0...numKeys). The records are keyed and
 * a reducing queryable state instance is created, which sums up the records.
 *
 * After submitting the job in detached mode, the QueryableStateClient is used
 * to query the counts of each key in rounds until all keys have non-zero counts.
 */
@Test
@SuppressWarnings("unchecked")
public void testQueryableState() throws Exception {
    // Config
    final Deadline deadline = TEST_TIMEOUT.fromNow();
    final int numKeys = 256;

    final QueryableStateClient client = new QueryableStateClient(cluster.configuration());

    JobID jobId = null;
    try {
        //
        // Test program
        //
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setStateBackend(stateBackend);
        env.setParallelism(NUM_SLOTS);
        // Very important, because the cluster is shared between tests and we
        // don't explicitly check that all slots are available before
        // submitting.
        env.setRestartStrategy(RestartStrategies.fixedDelayRestart(Integer.MAX_VALUE, 1000));

        DataStream<Tuple2<Integer, Long>> source = env.addSource(new TestKeyRangeSource(numKeys));

        // Reducing state
        ReducingStateDescriptor<Tuple2<Integer, Long>> reducingState = new ReducingStateDescriptor<>("any-name", new SumReduce(), source.getType());

        final String queryName = "hakuna-matata";

        final QueryableStateStream<Integer, Tuple2<Integer, Long>> queryableState = source.keyBy(new KeySelector<Tuple2<Integer, Long>, Integer>() {
            @Override
            public Integer getKey(Tuple2<Integer, Long> value) throws Exception {
                return value.f0;
            }
        }).asQueryableState(queryName, reducingState);

        // Submit the job graph
        JobGraph jobGraph = env.getStreamGraph().getJobGraph();
        cluster.submitJobDetached(jobGraph);

        //
        // Start querying
        //
        jobId = jobGraph.getJobID();

        final AtomicLongArray counts = new AtomicLongArray(numKeys);

        boolean allNonZero = false;
        while (!allNonZero && deadline.hasTimeLeft()) {
            allNonZero = true;

            final List<Future<byte[]>> futures = new ArrayList<>(numKeys);

            for (int i = 0; i < numKeys; i++) {
                final int key = i;

                if (counts.get(key) > 0) {
                    // Skip this one
                    continue;
                } else {
                    allNonZero = false;
                }

                final byte[] serializedKey = KvStateRequestSerializer.serializeKeyAndNamespace(key, queryableState.getKeySerializer(), VoidNamespace.INSTANCE, VoidNamespaceSerializer.INSTANCE);

                Future<byte[]> serializedResult = getKvStateWithRetries(client, jobId, queryName, key, serializedKey, QUERY_RETRY_DELAY, false);

                serializedResult.onSuccess(new OnSuccess<byte[]>() {
                    @Override
                    public void onSuccess(byte[] result) throws Throwable {
                        Tuple2<Integer, Long> value = KvStateRequestSerializer.deserializeValue(result, queryableState.getValueSerializer());
                        counts.set(key, value.f1);
                        assertEquals("Key mismatch", key, value.f0.intValue());
                    }
                }, TEST_ACTOR_SYSTEM.dispatcher());

                futures.add(serializedResult);
            }

            Future<Iterable<byte[]>> futureSequence = Futures.sequence(futures, TEST_ACTOR_SYSTEM.dispatcher());
            Await.ready(futureSequence, deadline.timeLeft());
        }

        assertTrue("Not all keys are non-zero", allNonZero);

        // All should be non-zero
        for (int i = 0; i < numKeys; i++) {
            long count = counts.get(i);
            assertTrue("Count at position " + i + " is " + count, count > 0);
        }
    } finally {
        // Free cluster resources
        if (jobId != null) {
            Future<CancellationSuccess> cancellation = cluster.getLeaderGateway(deadline.timeLeft())
                    .ask(new JobManagerMessages.CancelJob(jobId), deadline.timeLeft())
                    .mapTo(ClassTag$.MODULE$.<CancellationSuccess>apply(CancellationSuccess.class));
            Await.ready(cancellation, deadline.timeLeft());
        }
        client.shutDown();
    }
}
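Instead of blocking on each query, the loop above registers callbacks: onSuccess runs on the given ExecutionContext (here an actor system's dispatcher) once a Future completes successfully, and a failed Future never invokes it. A minimal sketch of the callback registration, assuming classic Akka; the class and helper names are hypothetical:

import akka.dispatch.OnSuccess;
import scala.concurrent.ExecutionContext;
import scala.concurrent.Future;

final class CallbackExample {

    // Hypothetical helper: react to a successful result without blocking the caller.
    static void printWhenDone(Future<byte[]> result, ExecutionContext dispatcher) {
        result.onSuccess(new OnSuccess<byte[]>() {
            @Override
            public void onSuccess(byte[] bytes) {
                // Runs on a dispatcher thread once the Future completes successfully;
                // a failed Future never triggers this callback.
                System.out.println("Received " + bytes.length + " bytes");
            }
        }, dispatcher);
    }
}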
Use of scala.concurrent.Future in project flink by apache.
The class TaskManagerFailureRecoveryITCase, method testRestartWithFailingTaskManager.
@Test
public void testRestartWithFailingTaskManager() {
    final int PARALLELISM = 4;

    LocalFlinkMiniCluster cluster = null;
    ActorSystem additionalSystem = null;
    try {
        Configuration config = new Configuration();
        config.setInteger(ConfigConstants.LOCAL_NUMBER_TASK_MANAGER, 2);
        config.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, PARALLELISM);
        config.setInteger(ConfigConstants.TASK_MANAGER_MEMORY_SIZE_KEY, 16);
        config.setString(ConfigConstants.AKKA_WATCH_HEARTBEAT_INTERVAL, "500 ms");
        config.setString(ConfigConstants.AKKA_WATCH_HEARTBEAT_PAUSE, "20 s");
        config.setInteger(ConfigConstants.AKKA_WATCH_THRESHOLD, 20);

        cluster = new LocalFlinkMiniCluster(config, false);
        cluster.start();

        // for the result
        List<Long> resultCollection = new ArrayList<Long>();

        final ExecutionEnvironment env = ExecutionEnvironment.createRemoteEnvironment("localhost", cluster.getLeaderRPCPort());
        env.setParallelism(PARALLELISM);
        env.setRestartStrategy(RestartStrategies.fixedDelayRestart(1, 1000));
        env.getConfig().disableSysoutLogging();

        env.generateSequence(1, 10).map(new FailingMapper<Long>()).reduce(new ReduceFunction<Long>() {
            @Override
            public Long reduce(Long value1, Long value2) {
                return value1 + value2;
            }
        }).output(new LocalCollectionOutputFormat<Long>(resultCollection));

        // simple reference (atomic does not matter) to pass back an exception from the trigger thread
        final AtomicReference<Throwable> ref = new AtomicReference<Throwable>();

        // trigger the execution from a separate thread, so that we are free to tamper
        // with the cluster during the execution
        Thread trigger = new Thread("program trigger") {
            @Override
            public void run() {
                try {
                    env.execute();
                } catch (Throwable t) {
                    ref.set(t);
                }
            }
        };
        trigger.setDaemon(true);
        trigger.start();

        // wait until all mappers are running; they in turn wait to be told to continue
        for (int i = 0; i < PARALLELISM; i++) {
            FailingMapper.TASK_TO_COORD_QUEUE.take();
        }

        // bring up one more task manager and wait for it to appear
        {
            additionalSystem = cluster.startTaskManagerActorSystem(2);
            ActorRef additionalTaskManager = cluster.startTaskManager(2, additionalSystem);
            Object message = TaskManagerMessages.getNotifyWhenRegisteredAtJobManagerMessage();
            Future<Object> future = Patterns.ask(additionalTaskManager, message, 30000);
            try {
                Await.result(future, new FiniteDuration(30000, TimeUnit.MILLISECONDS));
            } catch (TimeoutException e) {
                fail("The additional TaskManager did not come up within 30 seconds");
            }
        }

        // kill the two other TaskManagers
        for (ActorRef tm : cluster.getTaskManagersAsJava()) {
            tm.tell(PoisonPill.getInstance(), null);
        }

        // wait for the next set of mappers (the recovery ones) to come online
        for (int i = 0; i < PARALLELISM; i++) {
            FailingMapper.TASK_TO_COORD_QUEUE.take();
        }

        // tell the mappers that they may continue this time
        for (int i = 0; i < PARALLELISM; i++) {
            FailingMapper.COORD_TO_TASK_QUEUE.add(new Object());
        }

        // wait for the program to finish
        trigger.join();

        if (ref.get() != null) {
            Throwable t = ref.get();
            t.printStackTrace();
            fail("Program execution caused an exception: " + t.getMessage());
        }
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
    } finally {
        if (additionalSystem != null) {
            additionalSystem.shutdown();
        }
        if (cluster != null) {
            cluster.stop();
        }
    }
}
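The registration handshake above shows the timeout side of Await.result: if the Future is not completed within the given FiniteDuration, a TimeoutException is thrown instead of blocking forever (Akka's AskTimeoutException is itself a TimeoutException subclass, so both timeout paths land in the same catch). A minimal sketch, assuming classic Akka; the class and helper names are hypothetical:

import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

import akka.actor.ActorRef;
import akka.pattern.Patterns;
import scala.concurrent.Await;
import scala.concurrent.Future;
import scala.concurrent.duration.FiniteDuration;

final class RegistrationWait {

    // Hypothetical helper: true if the actor answered within the timeout, false otherwise.
    static boolean waitForRegistration(ActorRef actor, Object message, long timeoutMillis) {
        Future<Object> future = Patterns.ask(actor, message, timeoutMillis);
        try {
            Await.result(future, new FiniteDuration(timeoutMillis, TimeUnit.MILLISECONDS));
            return true;
        } catch (TimeoutException e) {
            // Neither the ask nor the await saw an answer in time.
            return false;
        } catch (Exception e) {
            throw new RuntimeException("Unexpected failure while waiting", e);
        }
    }
}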