Example 6 with Future

Use of scala.concurrent.Future in project flink by apache.

The class KvStateClientTest, method testFailureClosesChannel.

/**
	 * Tests that a server failure closes the connection and removes it from
	 * the established connections.
	 */
@Test
public void testFailureClosesChannel() throws Exception {
    Deadline deadline = TEST_TIMEOUT.fromNow();
    AtomicKvStateRequestStats stats = new AtomicKvStateRequestStats();
    KvStateClient client = null;
    Channel serverChannel = null;
    try {
        client = new KvStateClient(1, stats);
        final LinkedBlockingQueue<ByteBuf> received = new LinkedBlockingQueue<>();
        final AtomicReference<Channel> channel = new AtomicReference<>();
        serverChannel = createServerChannel(new ChannelInboundHandlerAdapter() {

            @Override
            public void channelActive(ChannelHandlerContext ctx) throws Exception {
                channel.set(ctx.channel());
            }

            @Override
            public void channelRead(ChannelHandlerContext ctx, Object msg) throws Exception {
                received.add((ByteBuf) msg);
            }
        });
        KvStateServerAddress serverAddress = getKvStateServerAddress(serverChannel);
        // Requests
        List<Future<byte[]>> futures = new ArrayList<>();
        futures.add(client.getKvState(serverAddress, new KvStateID(), new byte[0]));
        futures.add(client.getKvState(serverAddress, new KvStateID(), new byte[0]));
        ByteBuf buf = received.poll(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);
        assertNotNull("Receive timed out", buf);
        buf.release();
        buf = received.poll(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);
        assertNotNull("Receive timed out", buf);
        buf.release();
        assertEquals(1, stats.getNumConnections());
        Channel ch = channel.get();
        assertNotNull("Channel not active", ch);
        // Respond with failure
        ch.writeAndFlush(KvStateRequestSerializer.serializeServerFailure(serverChannel.alloc(), new RuntimeException("Expected test server failure")));
        try {
            Await.result(futures.remove(0), deadline.timeLeft());
            fail("Did not throw expected server failure");
        } catch (RuntimeException ignored) {
        // Expected
        }
        try {
            Await.result(futures.remove(0), deadline.timeLeft());
            fail("Did not throw expected server failure");
        } catch (RuntimeException ignored) {
        // Expected
        }
        assertEquals(0, stats.getNumConnections());
        // Counts can take some time to propagate
        while (deadline.hasTimeLeft() && (stats.getNumSuccessful() != 0 || stats.getNumFailed() != 2)) {
            Thread.sleep(100);
        }
        assertEquals(2, stats.getNumRequests());
        assertEquals(0, stats.getNumSuccessful());
        assertEquals(2, stats.getNumFailed());
    } finally {
        if (client != null) {
            client.shutDown();
        }
        if (serverChannel != null) {
            serverChannel.close();
        }
        assertEquals("Channel leak", 0, stats.getNumConnections());
    }
}
Also used : Deadline(scala.concurrent.duration.Deadline) SocketChannel(io.netty.channel.socket.SocketChannel) NioServerSocketChannel(io.netty.channel.socket.nio.NioServerSocketChannel) Channel(io.netty.channel.Channel) ArrayList(java.util.ArrayList) KvStateServerAddress(org.apache.flink.runtime.query.KvStateServerAddress) AtomicReference(java.util.concurrent.atomic.AtomicReference) ChannelHandlerContext(io.netty.channel.ChannelHandlerContext) LinkedBlockingQueue(java.util.concurrent.LinkedBlockingQueue) ByteBuf(io.netty.buffer.ByteBuf) Future(scala.concurrent.Future) KvStateID(org.apache.flink.runtime.query.KvStateID) ChannelInboundHandlerAdapter(io.netty.channel.ChannelInboundHandlerAdapter) Test(org.junit.Test)
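
The pattern this test leans on is worth isolating: blocking on a scala.concurrent.Future from Java via Await.result, bounded by the remaining time of a shared Deadline. A failed future rethrows its exception on await, which is exactly how the test observes the expected server failure. A minimal sketch (a hypothetical helper, not part of the Flink sources):

import scala.concurrent.Await;
import scala.concurrent.Future;
import scala.concurrent.duration.Deadline;

public class AwaitWithDeadline {

    // 'future' stands in for one of the test's request futures and
    // 'deadline' for TEST_TIMEOUT.fromNow(); both names are placeholders.
    static byte[] awaitOrRethrow(Future<byte[]> future, Deadline deadline) throws Exception {
        // Await.result blocks the calling thread for at most the remaining budget.
        // If the future completed with a failure (e.g. the server's RuntimeException),
        // that exception is rethrown here; if time runs out, a TimeoutException is thrown.
        return Await.result(future, deadline.timeLeft());
    }
}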

Example 7 with Future

Use of scala.concurrent.Future in project flink by apache.

The class TestBaseUtils, method stopCluster.

public static void stopCluster(LocalFlinkMiniCluster executor, FiniteDuration timeout) throws Exception {
    if (logDir != null) {
        FileUtils.deleteDirectory(logDir);
    }
    if (executor != null) {
        int numUnreleasedBCVars = 0;
        int numActiveConnections = 0;
        if (executor.running()) {
            List<ActorRef> tms = executor.getTaskManagersAsJava();
            List<Future<Object>> bcVariableManagerResponseFutures = new ArrayList<>();
            List<Future<Object>> numActiveConnectionsResponseFutures = new ArrayList<>();
            for (ActorRef tm : tms) {
                bcVariableManagerResponseFutures.add(Patterns.ask(tm, TaskManagerMessages.getRequestBroadcastVariablesWithReferences(), new Timeout(timeout)));
                numActiveConnectionsResponseFutures.add(Patterns.ask(tm, TaskManagerMessages.getRequestNumActiveConnections(), new Timeout(timeout)));
            }
            Future<Iterable<Object>> bcVariableManagerFutureResponses = Futures.sequence(bcVariableManagerResponseFutures, defaultExecutionContext());
            Iterable<Object> responses = Await.result(bcVariableManagerFutureResponses, timeout);
            for (Object response : responses) {
                numUnreleasedBCVars += ((TaskManagerMessages.ResponseBroadcastVariablesWithReferences) response).number();
            }
            Future<Iterable<Object>> numActiveConnectionsFutureResponses = Futures.sequence(numActiveConnectionsResponseFutures, defaultExecutionContext());
            responses = Await.result(numActiveConnectionsFutureResponses, timeout);
            for (Object response : responses) {
                numActiveConnections += ((TaskManagerMessages.ResponseNumActiveConnections) response).number();
            }
        }
        executor.stop();
        FileSystem.closeAll();
        System.gc();
        Assert.assertEquals("Not all broadcast variables were released.", 0, numUnreleasedBCVars);
        Assert.assertEquals("Not all TCP connections were released.", 0, numActiveConnections);
    }
}
Also used : TaskManagerMessages(org.apache.flink.runtime.messages.TaskManagerMessages) ActorRef(akka.actor.ActorRef) Timeout(akka.util.Timeout) ArrayList(java.util.ArrayList) Future(scala.concurrent.Future)
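
The ask-then-sequence pattern above generalizes: ask several actors, combine the reply futures with Futures.sequence, and block once on the combined future instead of awaiting each reply in turn. A minimal sketch (assumed helper names, not Flink code):

import akka.actor.ActorRef;
import akka.dispatch.Futures;
import akka.pattern.Patterns;
import akka.util.Timeout;
import scala.concurrent.Await;
import scala.concurrent.ExecutionContext;
import scala.concurrent.Future;
import scala.concurrent.duration.FiniteDuration;

import java.util.ArrayList;
import java.util.List;

public class AskAndSequence {

    // Ask every actor the same message, then collapse the reply futures into a
    // single future and block on it once.
    static Iterable<Object> askAll(List<ActorRef> actors, Object message,
                                   FiniteDuration timeout, ExecutionContext ec) throws Exception {
        List<Future<Object>> replies = new ArrayList<>();
        for (ActorRef actor : actors) {
            // Patterns.ask returns a Future<Object> completed with the actor's reply
            replies.add(Patterns.ask(actor, message, new Timeout(timeout)));
        }
        // Futures.sequence: List<Future<Object>> -> Future<Iterable<Object>>
        Future<Iterable<Object>> combined = Futures.sequence(replies, ec);
        return Await.result(combined, timeout);
    }
}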

Example 8 with Future

Use of scala.concurrent.Future in project flink by apache.

The class JobManagerHAJobGraphRecoveryITCase, method testClientNonDetachedListeningBehaviour.

/**
	 * Tests that clients receive updates after recovery by a new leader.
	 */
@Test
public void testClientNonDetachedListeningBehaviour() throws Exception {
    Configuration config = ZooKeeperTestUtils.createZooKeeperHAConfig(ZooKeeper.getConnectString(), FileStateBackendBasePath.getPath());
    // Test actor system
    ActorSystem testSystem = null;
    // JobManager setup. Start the job managers as separate processes so that their
    // actors' postStop() hooks, which clean up all running jobs, do not run.
    JobManagerProcess[] jobManagerProcess = new JobManagerProcess[2];
    LeaderRetrievalService leaderRetrievalService = null;
    ActorSystem taskManagerSystem = null;
    try {
        final Deadline deadline = TestTimeOut.fromNow();
        // Test actor system
        testSystem = AkkaUtils.createActorSystem(new Configuration(), new Some<>(new Tuple2<String, Object>("localhost", 0)));
        // The job managers
        jobManagerProcess[0] = new JobManagerProcess(0, config);
        jobManagerProcess[1] = new JobManagerProcess(1, config);
        jobManagerProcess[0].startProcess();
        jobManagerProcess[1].startProcess();
        // Leader listener
        TestingListener leaderListener = new TestingListener();
        leaderRetrievalService = ZooKeeperUtils.createLeaderRetrievalService(config);
        leaderRetrievalService.start(leaderListener);
        // The task manager
        taskManagerSystem = AkkaUtils.createActorSystem(AkkaUtils.getDefaultAkkaConfig());
        TaskManager.startTaskManagerComponentsAndActor(config, ResourceID.generate(), taskManagerSystem, "localhost", Option.<String>empty(), Option.<LeaderRetrievalService>empty(), false, TaskManager.class);
        // Client test actor
        TestActorRef<RecordingTestClient> clientRef = TestActorRef.create(testSystem, Props.create(RecordingTestClient.class));
        JobGraph jobGraph = createBlockingJobGraph();
        {
            // Initial submission
            leaderListener.waitForNewLeader(deadline.timeLeft().toMillis());
            String leaderAddress = leaderListener.getAddress();
            UUID leaderId = leaderListener.getLeaderSessionID();
            // The client
            AkkaActorGateway client = new AkkaActorGateway(clientRef, leaderId);
            // Get the leader ref
            ActorRef leaderRef = AkkaUtils.getActorRef(leaderAddress, testSystem, deadline.timeLeft());
            ActorGateway leader = new AkkaActorGateway(leaderRef, leaderId);
            int numSlots = 0;
            while (numSlots == 0) {
                Future<?> slotsFuture = leader.ask(JobManagerMessages.getRequestTotalNumberOfSlots(), deadline.timeLeft());
                numSlots = (Integer) Await.result(slotsFuture, deadline.timeLeft());
            }
            // Submit the job in non-detached mode
            leader.tell(new SubmitJob(jobGraph, ListeningBehaviour.EXECUTION_RESULT_AND_STATE_CHANGES), client);
            JobManagerActorTestUtils.waitForJobStatus(jobGraph.getJobID(), JobStatus.RUNNING, leader, deadline.timeLeft());
        }
        // Determine which job manager process is currently the leader
        JobManagerProcess leadingJobManagerProcess;
        if (jobManagerProcess[0].getJobManagerAkkaURL(deadline.timeLeft()).equals(leaderListener.getAddress())) {
            leadingJobManagerProcess = jobManagerProcess[0];
        } else {
            leadingJobManagerProcess = jobManagerProcess[1];
        }
        // Kill the leading job manager process
        leadingJobManagerProcess.destroy();
        {
            // Recovery by the standby JobManager
            leaderListener.waitForNewLeader(deadline.timeLeft().toMillis());
            String leaderAddress = leaderListener.getAddress();
            UUID leaderId = leaderListener.getLeaderSessionID();
            ActorRef leaderRef = AkkaUtils.getActorRef(leaderAddress, testSystem, deadline.timeLeft());
            ActorGateway leader = new AkkaActorGateway(leaderRef, leaderId);
            JobManagerActorTestUtils.waitForJobStatus(jobGraph.getJobID(), JobStatus.RUNNING, leader, deadline.timeLeft());
            // Cancel the job
            leader.tell(new JobManagerMessages.CancelJob(jobGraph.getJobID()));
        }
        // Wait for the execution result
        clientRef.underlyingActor().awaitJobResult(deadline.timeLeft().toMillis());
        int jobSubmitSuccessMessages = 0;
        for (Object msg : clientRef.underlyingActor().getMessages()) {
            if (msg instanceof JobManagerMessages.JobSubmitSuccess) {
                jobSubmitSuccessMessages++;
            }
        }
        // Two submissions should be ack-ed (the initial one and the one after
        // recovery). This check is conservative: in principle the acks could be
        // overtaken by the final result message.
        assertEquals(2, jobSubmitSuccessMessages);
    } catch (Throwable t) {
        // Print early (in some situations the process logs get too big
        // for Travis and the root problem is not shown)
        t.printStackTrace();
        // In case of an error, print the job manager process logs.
        if (jobManagerProcess[0] != null) {
            jobManagerProcess[0].printProcessLog();
        }
        if (jobManagerProcess[1] != null) {
            jobManagerProcess[1].printProcessLog();
        }
        throw t;
    } finally {
        if (jobManagerProcess[0] != null) {
            jobManagerProcess[0].destroy();
        }
        if (jobManagerProcess[1] != null) {
            jobManagerProcess[1].destroy();
        }
        if (leaderRetrievalService != null) {
            leaderRetrievalService.stop();
        }
        if (taskManagerSystem != null) {
            taskManagerSystem.shutdown();
        }
        if (testSystem != null) {
            testSystem.shutdown();
        }
    }
}
Also used : ActorSystem(akka.actor.ActorSystem) AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) Configuration(org.apache.flink.configuration.Configuration) ActorRef(akka.actor.ActorRef) TestActorRef(akka.testkit.TestActorRef) TestingListener(org.apache.flink.runtime.leaderelection.TestingListener) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) UUID(java.util.UUID) SubmitJob(org.apache.flink.runtime.messages.JobManagerMessages.SubmitJob) Deadline(scala.concurrent.duration.Deadline) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) SubmittedJobGraph(org.apache.flink.runtime.jobmanager.SubmittedJobGraph) Some(scala.Some) LeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService) JobManagerProcess(org.apache.flink.runtime.testutils.JobManagerProcess) Future(scala.concurrent.Future) Test(org.junit.Test)
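
One detail worth pulling out of the test above is the deadline-bounded polling loop that waits for task manager slots to register with the leader. A minimal sketch (a hypothetical helper; ActorGateway.ask and Deadline are used the same way the test uses them):

import org.apache.flink.runtime.instance.ActorGateway;
import scala.concurrent.Await;
import scala.concurrent.Future;
import scala.concurrent.duration.Deadline;

public class SlotPolling {

    // 'request' stands in for JobManagerMessages.getRequestTotalNumberOfSlots().
    static int waitForSlots(ActorGateway leader, Object request, Deadline deadline) throws Exception {
        int numSlots = 0;
        // Each iteration spends part of the shared budget; the loop ends as soon as
        // the leader reports at least one registered slot or the deadline expires.
        while (numSlots == 0 && deadline.hasTimeLeft()) {
            Future<?> slotsFuture = leader.ask(request, deadline.timeLeft());
            numSlots = (Integer) Await.result(slotsFuture, deadline.timeLeft());
        }
        return numSlots;
    }
}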

Example 9 with Future

Use of scala.concurrent.Future in project flink by apache.

The class AbstractQueryableStateITCase, method testQueryableState.

/**
	 * Runs a simple topology producing random (key, 1) pairs at the sources (where
	 * the number of keys is fixed in the range 0...numKeys). The records are keyed and
	 * a reducing queryable state instance is created, which sums up the records.
	 *
	 * After submitting the job in detached mode, the QueryableStateClient is used
	 * to query the counts of each key in rounds until all keys have non-zero counts.
	 */
@Test
@SuppressWarnings("unchecked")
public void testQueryableState() throws Exception {
    // Config
    final Deadline deadline = TEST_TIMEOUT.fromNow();
    final int numKeys = 256;
    final QueryableStateClient client = new QueryableStateClient(cluster.configuration());
    JobID jobId = null;
    try {
        //
        // Test program
        //
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setStateBackend(stateBackend);
        env.setParallelism(NUM_SLOTS);
        // Very important, because the cluster is shared between tests and we
        // don't explicitly check that all slots are available before
        // submitting.
        env.setRestartStrategy(RestartStrategies.fixedDelayRestart(Integer.MAX_VALUE, 1000));
        DataStream<Tuple2<Integer, Long>> source = env.addSource(new TestKeyRangeSource(numKeys));
        // Reducing state
        ReducingStateDescriptor<Tuple2<Integer, Long>> reducingState = new ReducingStateDescriptor<>("any-name", new SumReduce(), source.getType());
        final String queryName = "hakuna-matata";
        final QueryableStateStream<Integer, Tuple2<Integer, Long>> queryableState = source.keyBy(new KeySelector<Tuple2<Integer, Long>, Integer>() {

            @Override
            public Integer getKey(Tuple2<Integer, Long> value) throws Exception {
                return value.f0;
            }
        }).asQueryableState(queryName, reducingState);
        // Submit the job graph
        JobGraph jobGraph = env.getStreamGraph().getJobGraph();
        cluster.submitJobDetached(jobGraph);
        //
        // Start querying
        //
        jobId = jobGraph.getJobID();
        final AtomicLongArray counts = new AtomicLongArray(numKeys);
        boolean allNonZero = false;
        while (!allNonZero && deadline.hasTimeLeft()) {
            allNonZero = true;
            final List<Future<byte[]>> futures = new ArrayList<>(numKeys);
            for (int i = 0; i < numKeys; i++) {
                final int key = i;
                if (counts.get(key) > 0) {
                    // Skip this one
                    continue;
                } else {
                    allNonZero = false;
                }
                final byte[] serializedKey = KvStateRequestSerializer.serializeKeyAndNamespace(key, queryableState.getKeySerializer(), VoidNamespace.INSTANCE, VoidNamespaceSerializer.INSTANCE);
                Future<byte[]> serializedResult = getKvStateWithRetries(client, jobId, queryName, key, serializedKey, QUERY_RETRY_DELAY, false);
                serializedResult.onSuccess(new OnSuccess<byte[]>() {

                    @Override
                    public void onSuccess(byte[] result) throws Throwable {
                        Tuple2<Integer, Long> value = KvStateRequestSerializer.deserializeValue(result, queryableState.getValueSerializer());
                        counts.set(key, value.f1);
                        assertEquals("Key mismatch", key, value.f0.intValue());
                    }
                }, TEST_ACTOR_SYSTEM.dispatcher());
                futures.add(serializedResult);
            }
            Future<Iterable<byte[]>> futureSequence = Futures.sequence(futures, TEST_ACTOR_SYSTEM.dispatcher());
            Await.ready(futureSequence, deadline.timeLeft());
        }
        assertTrue("Not all keys are non-zero", allNonZero);
        // All should be non-zero
        for (int i = 0; i < numKeys; i++) {
            long count = counts.get(i);
            assertTrue("Count at position " + i + " is " + count, count > 0);
        }
    } finally {
        // Free cluster resources
        if (jobId != null) {
            Future<CancellationSuccess> cancellation = cluster.getLeaderGateway(deadline.timeLeft()).ask(new JobManagerMessages.CancelJob(jobId), deadline.timeLeft()).mapTo(ClassTag$.MODULE$.<CancellationSuccess>apply(CancellationSuccess.class));
            Await.ready(cancellation, deadline.timeLeft());
        }
        client.shutDown();
    }
}
Also used : ArrayList(java.util.ArrayList) QueryableStateClient(org.apache.flink.runtime.query.QueryableStateClient) KeySelector(org.apache.flink.api.java.functions.KeySelector) ReducingStateDescriptor(org.apache.flink.api.common.state.ReducingStateDescriptor) Deadline(scala.concurrent.duration.Deadline) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) Tuple2(org.apache.flink.api.java.tuple.Tuple2) AtomicLong(java.util.concurrent.atomic.AtomicLong) AtomicLongArray(java.util.concurrent.atomic.AtomicLongArray) CancellationSuccess(org.apache.flink.runtime.messages.JobManagerMessages.CancellationSuccess) Future(scala.concurrent.Future) StreamExecutionEnvironment(org.apache.flink.streaming.api.environment.StreamExecutionEnvironment) JobID(org.apache.flink.api.common.JobID) Test(org.junit.Test)
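
The non-blocking callback registration used above deserves a standalone look. OnSuccess is the classic Akka helper the test itself uses; a minimal sketch (assumed names, not Flink code):

import akka.dispatch.OnSuccess;
import scala.concurrent.ExecutionContext;
import scala.concurrent.Future;

public class FutureCallbacks {

    static void logResult(Future<byte[]> result, ExecutionContext dispatcher) {
        // The callback runs on the given dispatcher once the future completes
        // successfully; a failed future never reaches OnSuccess (use
        // akka.dispatch.OnComplete to observe both outcomes).
        result.onSuccess(new OnSuccess<byte[]>() {

            @Override
            public void onSuccess(byte[] bytes) {
                System.out.println("received " + bytes.length + " bytes");
            }
        }, dispatcher);
    }
}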

Example 10 with Future

Use of scala.concurrent.Future in project flink by apache.

The class TaskManagerFailureRecoveryITCase, method testRestartWithFailingTaskManager.

@Test
public void testRestartWithFailingTaskManager() {
    final int PARALLELISM = 4;
    LocalFlinkMiniCluster cluster = null;
    ActorSystem additionalSystem = null;
    try {
        Configuration config = new Configuration();
        config.setInteger(ConfigConstants.LOCAL_NUMBER_TASK_MANAGER, 2);
        config.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, PARALLELISM);
        config.setInteger(ConfigConstants.TASK_MANAGER_MEMORY_SIZE_KEY, 16);
        config.setString(ConfigConstants.AKKA_WATCH_HEARTBEAT_INTERVAL, "500 ms");
        config.setString(ConfigConstants.AKKA_WATCH_HEARTBEAT_PAUSE, "20 s");
        config.setInteger(ConfigConstants.AKKA_WATCH_THRESHOLD, 20);
        cluster = new LocalFlinkMiniCluster(config, false);
        cluster.start();
        // for the result
        List<Long> resultCollection = new ArrayList<Long>();
        final ExecutionEnvironment env = ExecutionEnvironment.createRemoteEnvironment("localhost", cluster.getLeaderRPCPort());
        env.setParallelism(PARALLELISM);
        env.setRestartStrategy(RestartStrategies.fixedDelayRestart(1, 1000));
        env.getConfig().disableSysoutLogging();
        env.generateSequence(1, 10).map(new FailingMapper<Long>()).reduce(new ReduceFunction<Long>() {

            @Override
            public Long reduce(Long value1, Long value2) {
                return value1 + value2;
            }
        }).output(new LocalCollectionOutputFormat<Long>(resultCollection));
        // simple holder (atomicity does not matter here) to pass back an exception from the trigger thread
        final AtomicReference<Throwable> ref = new AtomicReference<Throwable>();
        // trigger the execution from a separate thread, so we remain free to tamper with the
        // cluster during the execution
        Thread trigger = new Thread("program trigger") {

            @Override
            public void run() {
                try {
                    env.execute();
                } catch (Throwable t) {
                    ref.set(t);
                }
            }
        };
        trigger.setDaemon(true);
        trigger.start();
        // wait for each mapper to check in (the mappers block until told to continue)
        for (int i = 0; i < PARALLELISM; i++) {
            FailingMapper.TASK_TO_COORD_QUEUE.take();
        }
        // bring up one more task manager and wait for it to appear
        {
            additionalSystem = cluster.startTaskManagerActorSystem(2);
            ActorRef additionalTaskManager = cluster.startTaskManager(2, additionalSystem);
            Object message = TaskManagerMessages.getNotifyWhenRegisteredAtJobManagerMessage();
            Future<Object> future = Patterns.ask(additionalTaskManager, message, 30000);
            try {
                Await.result(future, new FiniteDuration(30000, TimeUnit.MILLISECONDS));
            } catch (TimeoutException e) {
                fail("The additional TaskManager did not come up within 30 seconds");
            }
        }
        // kill the two other TaskManagers
        for (ActorRef tm : cluster.getTaskManagersAsJava()) {
            tm.tell(PoisonPill.getInstance(), null);
        }
        // wait for the next set of mappers (the recovery ones) to come online
        for (int i = 0; i < PARALLELISM; i++) {
            FailingMapper.TASK_TO_COORD_QUEUE.take();
        }
        // tell the mappers that they may continue this time
        for (int i = 0; i < PARALLELISM; i++) {
            FailingMapper.COORD_TO_TASK_QUEUE.add(new Object());
        }
        // wait for the program to finish
        trigger.join();
        if (ref.get() != null) {
            Throwable t = ref.get();
            t.printStackTrace();
            fail("Program execution caused an exception: " + t.getMessage());
        }
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
    } finally {
        if (additionalSystem != null) {
            additionalSystem.shutdown();
        }
        if (cluster != null) {
            cluster.stop();
        }
    }
}
Also used : ActorSystem(akka.actor.ActorSystem) ExecutionEnvironment(org.apache.flink.api.java.ExecutionEnvironment) Configuration(org.apache.flink.configuration.Configuration) ActorRef(akka.actor.ActorRef) ArrayList(java.util.ArrayList) ReduceFunction(org.apache.flink.api.common.functions.ReduceFunction) FiniteDuration(scala.concurrent.duration.FiniteDuration) AtomicReference(java.util.concurrent.atomic.AtomicReference) TimeoutException(java.util.concurrent.TimeoutException) LocalFlinkMiniCluster(org.apache.flink.runtime.minicluster.LocalFlinkMiniCluster) Future(scala.concurrent.Future) TimeoutException(java.util.concurrent.TimeoutException) Test(org.junit.Test)
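
The coordination between the test and its mappers above runs through two static blocking queues. A minimal sketch (simplified, with hypothetical field and method names) of that handshake:

import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;

public class CoordinatorHandshake {

    static final BlockingQueue<Object> TASK_TO_COORD = new LinkedBlockingQueue<>();
    static final BlockingQueue<Object> COORD_TO_TASK = new LinkedBlockingQueue<>();

    // Mapper side: announce readiness, then block until the coordinator releases us.
    static void mapperCheckIn() throws InterruptedException {
        TASK_TO_COORD.add(new Object());
        COORD_TO_TASK.take();
    }

    // Test/coordinator side: wait for all mappers to check in, then release them.
    static void releaseMappers(int parallelism) throws InterruptedException {
        for (int i = 0; i < parallelism; i++) {
            TASK_TO_COORD.take();
        }
        for (int i = 0; i < parallelism; i++) {
            COORD_TO_TASK.add(new Object());
        }
    }
}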

Aggregations

Future (scala.concurrent.Future) : 10
Test (org.junit.Test) : 9
ArrayList (java.util.ArrayList) : 8
Deadline (scala.concurrent.duration.Deadline) : 5
ActorRef (akka.actor.ActorRef) : 4
JobID (org.apache.flink.api.common.JobID) : 4
KvStateID (org.apache.flink.runtime.query.KvStateID) : 4
ByteBuf (io.netty.buffer.ByteBuf) : 3
Channel (io.netty.channel.Channel) : 3
ChannelHandlerContext (io.netty.channel.ChannelHandlerContext) : 3
ChannelInboundHandlerAdapter (io.netty.channel.ChannelInboundHandlerAdapter) : 3
SocketChannel (io.netty.channel.socket.SocketChannel) : 3
NioServerSocketChannel (io.netty.channel.socket.nio.NioServerSocketChannel) : 3
AtomicReference (java.util.concurrent.atomic.AtomicReference) : 3
Configuration (org.apache.flink.configuration.Configuration) : 3
KvStateServerAddress (org.apache.flink.runtime.query.KvStateServerAddress) : 3
ActorSystem (akka.actor.ActorSystem) : 2
ClosedChannelException (java.nio.channels.ClosedChannelException) : 2
Callable (java.util.concurrent.Callable) : 2
ExecutionException (java.util.concurrent.ExecutionException) : 2