Example 16 with Deadline

Use of scala.concurrent.duration.Deadline in project flink by apache.

The class SavepointITCase, method testCanRestoreWithModifiedStatelessOperators.

/**
 * FLINK-5985
 *
 * This test ensures we can restore from a savepoint under modifications to the
 * job graph that only concern stateless operators.
 */
@Test
public void testCanRestoreWithModifiedStatelessOperators() throws Exception {
    // Config
    int numTaskManagers = 2;
    int numSlotsPerTaskManager = 2;
    int parallelism = 2;
    // Test deadline
    final Deadline deadline = new FiniteDuration(5, TimeUnit.MINUTES).fromNow();
    final File tmpDir = CommonTestUtils.createTempDirectory();
    final File savepointDir = new File(tmpDir, "savepoints");
    TestingCluster flink = null;
    String savepointPath;
    try {
        // Flink configuration
        final Configuration config = new Configuration();
        config.setInteger(ConfigConstants.LOCAL_NUMBER_TASK_MANAGER, numTaskManagers);
        config.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, numSlotsPerTaskManager);
        config.setString(ConfigConstants.SAVEPOINT_DIRECTORY_KEY, savepointDir.toURI().toString());
        LOG.info("Flink configuration: " + config + ".");
        // Start Flink
        flink = new TestingCluster(config);
        LOG.info("Starting Flink cluster.");
        flink.start(true);
        // Retrieve the job manager
        LOG.info("Retrieving JobManager.");
        ActorGateway jobManager = Await.result(flink.leaderGateway().future(), deadline.timeLeft());
        LOG.info("JobManager: " + jobManager + ".");
        final StatefulCounter statefulCounter = new StatefulCounter();
        StatefulCounter.resetForTest(parallelism);
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(parallelism);
        env.addSource(new InfiniteTestSource()).shuffle().map(new MapFunction<Integer, Integer>() {

            @Override
            public Integer map(Integer value) throws Exception {
                return 4 * value;
            }
        }).shuffle().map(statefulCounter).uid("statefulCounter").shuffle().map(new MapFunction<Integer, Integer>() {

            @Override
            public Integer map(Integer value) throws Exception {
                return 2 * value;
            }
        }).addSink(new DiscardingSink<Integer>());
        JobGraph originalJobGraph = env.getStreamGraph().getJobGraph();
        JobSubmissionResult submissionResult = flink.submitJobDetached(originalJobGraph);
        JobID jobID = submissionResult.getJobID();
        // Wait for the tasks to be ready
        StatefulCounter.getProgressLatch().await(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);
        Future<Object> savepointPathFuture = jobManager.ask(new TriggerSavepoint(jobID, Option.<String>empty()), deadline.timeLeft());
        savepointPath = ((TriggerSavepointSuccess) Await.result(savepointPathFuture, deadline.timeLeft())).savepointPath();
        // Fetch the savepoint back from the job manager as a sanity check.
        Future<Object> savepointFuture = jobManager.ask(new RequestSavepoint(savepointPath), deadline.timeLeft());
        ((ResponseSavepoint) Await.result(savepointFuture, deadline.timeLeft())).savepoint();
        LOG.info("Retrieved savepoint: " + savepointPath + ".");
        // Shut down the Flink cluster (thereby canceling the job)
        LOG.info("Shutting down Flink cluster.");
        flink.shutdown();
        flink.awaitTermination();
    } finally {
        // Guard against an early failure before the cluster was created.
        if (flink != null) {
            flink.shutdown();
            flink.awaitTermination();
        }
    }
    try {
        LOG.info("Restarting Flink cluster.");
        flink.start(true);
        // Retrieve the job manager
        LOG.info("Retrieving JobManager.");
        ActorGateway jobManager = Await.result(flink.leaderGateway().future(), deadline.timeLeft());
        LOG.info("JobManager: " + jobManager + ".");
        // Reset static test helpers
        StatefulCounter.resetForTest(parallelism);
        // Set up a new environment for the modified job
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(parallelism);
        // Generate a modified job graph: the stateless maps change, but the stateful operator keeps its uid
        env.addSource(new InfiniteTestSource()).shuffle().map(new StatefulCounter()).uid("statefulCounter").shuffle().map(new MapFunction<Integer, Integer>() {

            @Override
            public Integer map(Integer value) throws Exception {
                return value;
            }
        }).addSink(new DiscardingSink<Integer>());
        JobGraph modifiedJobGraph = env.getStreamGraph().getJobGraph();
        // Set the savepoint path
        modifiedJobGraph.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath));
        LOG.info("Resubmitting job " + modifiedJobGraph.getJobID() + " with " + "savepoint path " + savepointPath + " in detached mode.");
        // Submit the job
        flink.submitJobDetached(modifiedJobGraph);
        // Wait until the state has been restored
        StatefulCounter.getRestoreLatch().await(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);
        // Await some progress after restore
        StatefulCounter.getProgressLatch().await(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);
    } finally {
        flink.shutdown();
        flink.awaitTermination();
    }
}
Also used : RequestSavepoint(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.RequestSavepoint) Configuration(org.apache.flink.configuration.Configuration) MapFunction(org.apache.flink.api.common.functions.MapFunction) RichFlatMapFunction(org.apache.flink.api.common.functions.RichFlatMapFunction) RichMapFunction(org.apache.flink.api.common.functions.RichMapFunction) ResponseSavepoint(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.ResponseSavepoint) TestingCluster(org.apache.flink.runtime.testingUtils.TestingCluster) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) Deadline(scala.concurrent.duration.Deadline) FiniteDuration(scala.concurrent.duration.FiniteDuration) TriggerSavepoint(org.apache.flink.runtime.messages.JobManagerMessages.TriggerSavepoint) ResponseSavepoint(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.ResponseSavepoint) RequestSavepoint(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.RequestSavepoint) DisposeSavepoint(org.apache.flink.runtime.messages.JobManagerMessages.DisposeSavepoint) FileNotFoundException(java.io.FileNotFoundException) JobExecutionException(org.apache.flink.runtime.client.JobExecutionException) JobSubmissionResult(org.apache.flink.api.common.JobSubmissionResult) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) TriggerSavepoint(org.apache.flink.runtime.messages.JobManagerMessages.TriggerSavepoint) StreamExecutionEnvironment(org.apache.flink.streaming.api.environment.StreamExecutionEnvironment) File(java.io.File) JobID(org.apache.flink.api.common.JobID) Test(org.junit.Test)
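
All of these examples lean on the same idiom: turn a relative FiniteDuration into an absolute point in time with fromNow(), then pass deadline.timeLeft() to every subsequent blocking call so that all steps draw on one shared time budget. A minimal sketch of that pattern outside the Flink test harness; the blockingStep method is illustrative, a stand-in for Await.result or a latch await, not Flink or Scala API.

import java.util.concurrent.TimeUnit;

import scala.concurrent.duration.Deadline;
import scala.concurrent.duration.FiniteDuration;

public class DeadlineBudgetSketch {

    public static void main(String[] args) throws InterruptedException {
        // One absolute deadline for the whole run.
        Deadline deadline = new FiniteDuration(5, TimeUnit.MINUTES).fromNow();

        // Each step is bounded by what remains of the shared budget, so the
        // steps together can never exceed five minutes.
        blockingStep(deadline.timeLeft().toMillis());
        blockingStep(deadline.timeLeft().toMillis());

        if (deadline.isOverdue()) {
            throw new IllegalStateException("Run exceeded its time budget.");
        }
    }

    // Illustrative stand-in for Await.result(...), latch.await(...), etc.
    private static void blockingStep(long timeoutMillis) throws InterruptedException {
        Thread.sleep(Math.min(timeoutMillis, 10L));
    }
}

The advantage over per-call timeouts is visible in the savepoint test above: the latch await, the two ask futures, and the leader-gateway lookup all draw from the same five minutes, so a slow early step leaves proportionally less time for the later ones.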

Example 17 with Deadline

Use of scala.concurrent.duration.Deadline in project flink by apache.

The class ExecutionGraphRestartTest, method testCancelWhileFailing.

@Test
public void testCancelWhileFailing() throws Exception {
    // We want to manually control the restart and delay
    RestartStrategy restartStrategy = new InfiniteDelayRestartStrategy();
    Tuple2<ExecutionGraph, Instance> executionGraphInstanceTuple = createSpyExecutionGraph(restartStrategy);
    ExecutionGraph executionGraph = executionGraphInstanceTuple.f0;
    Instance instance = executionGraphInstanceTuple.f1;
    doNothing().when(executionGraph).jobVertexInFinalState();
    // Kill the instance...
    instance.markDead();
    Deadline deadline = TestingUtils.TESTING_DURATION().fromNow();
    // ...and wait for all vertices to reach state FAILED. Since the spied
    // jobVertexInFinalState does nothing, the job status never advances, so
    // we poll the vertices instead of waiting on the job status.
    boolean success = false;
    while (deadline.hasTimeLeft() && !success) {
        success = true;
        for (ExecutionVertex vertex : executionGraph.getAllExecutionVertices()) {
            ExecutionState state = vertex.getExecutionState();
            if (state != ExecutionState.FAILED && state != ExecutionState.CANCELED) {
                success = false;
                Thread.sleep(100);
                break;
            }
        }
    }
    // The job is still in state FAILING
    assertEquals(JobStatus.FAILING, executionGraph.getState());
    // The cancel call needs to change the state to CANCELLING
    executionGraph.cancel();
    assertEquals(JobStatus.CANCELLING, executionGraph.getState());
    // Unspy and finalize the job state
    doCallRealMethod().when(executionGraph).jobVertexInFinalState();
    executionGraph.jobVertexInFinalState();
    assertEquals(JobStatus.CANCELED, executionGraph.getState());
}
Also used : ExecutionState(org.apache.flink.runtime.execution.ExecutionState) InfiniteDelayRestartStrategy(org.apache.flink.runtime.executiongraph.restart.InfiniteDelayRestartStrategy) Instance(org.apache.flink.runtime.instance.Instance) Deadline(scala.concurrent.duration.Deadline) FailureRateRestartStrategy(org.apache.flink.runtime.executiongraph.restart.FailureRateRestartStrategy) InfiniteDelayRestartStrategy(org.apache.flink.runtime.executiongraph.restart.InfiniteDelayRestartStrategy) NoRestartStrategy(org.apache.flink.runtime.executiongraph.restart.NoRestartStrategy) RestartStrategy(org.apache.flink.runtime.executiongraph.restart.RestartStrategy) FixedDelayRestartStrategy(org.apache.flink.runtime.executiongraph.restart.FixedDelayRestartStrategy) Test(org.junit.Test)
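
The busy-wait above (check the vertices, sleep 100 ms, give up when the deadline expires) recurs in several of these tests. A hedged generalization as a tiny helper; the Supplier-based signature is our own invention, not part of Flink's or Scala's test utilities.

import java.util.function.Supplier;

import scala.concurrent.duration.Deadline;

final class PollUntil {

    private PollUntil() {
    }

    // Polls the condition every 100 ms until it holds or the deadline expires,
    // and returns the final state of the condition so callers can assert on it.
    static boolean pollUntil(Deadline deadline, Supplier<Boolean> condition)
            throws InterruptedException {
        while (deadline.hasTimeLeft()) {
            if (condition.get()) {
                return true;
            }
            Thread.sleep(100L);
        }
        return condition.get();
    }
}

With such a helper, the loop in the next example would collapse to pollUntil(deadline, () -> executionGraph.getState() == JobStatus.RESTARTING) followed by the assertion.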

Example 18 with Deadline

Use of scala.concurrent.duration.Deadline in project flink by apache.

The class ExecutionGraphRestartTest, method testCancelWhileRestarting.

@Test
public void testCancelWhileRestarting() throws Exception {
    // We want to manually control the restart and delay
    RestartStrategy restartStrategy = new InfiniteDelayRestartStrategy();
    Tuple2<ExecutionGraph, Instance> executionGraphInstanceTuple = createExecutionGraph(restartStrategy);
    ExecutionGraph executionGraph = executionGraphInstanceTuple.f0;
    Instance instance = executionGraphInstanceTuple.f1;
    // Kill the instance and wait for the job to restart
    instance.markDead();
    Deadline deadline = TestingUtils.TESTING_DURATION().fromNow();
    while (deadline.hasTimeLeft() && executionGraph.getState() != JobStatus.RESTARTING) {
        Thread.sleep(100);
    }
    assertEquals(JobStatus.RESTARTING, executionGraph.getState());
    // Canceling needs to abort the restart
    executionGraph.cancel();
    assertEquals(JobStatus.CANCELED, executionGraph.getState());
    // The restart has been aborted
    executionGraph.restart();
    assertEquals(JobStatus.CANCELED, executionGraph.getState());
}
Also used : InfiniteDelayRestartStrategy(org.apache.flink.runtime.executiongraph.restart.InfiniteDelayRestartStrategy) Instance(org.apache.flink.runtime.instance.Instance) Deadline(scala.concurrent.duration.Deadline) FailureRateRestartStrategy(org.apache.flink.runtime.executiongraph.restart.FailureRateRestartStrategy) InfiniteDelayRestartStrategy(org.apache.flink.runtime.executiongraph.restart.InfiniteDelayRestartStrategy) NoRestartStrategy(org.apache.flink.runtime.executiongraph.restart.NoRestartStrategy) RestartStrategy(org.apache.flink.runtime.executiongraph.restart.RestartStrategy) FixedDelayRestartStrategy(org.apache.flink.runtime.executiongraph.restart.FixedDelayRestartStrategy) Test(org.junit.Test)

Example 19 with Deadline

Use of scala.concurrent.duration.Deadline in project flink by apache.

The class JobClientActorRecoveryITCase, method testJobClientRecovery.

/**
 * Tests whether the JobClientActor can connect to a newly elected leading job manager to
 * obtain the JobExecutionResult. The submitted job blocks for the first execution attempt.
 * The leading job manager is then killed so that the second job manager is elected as the
 * leader. The newly elected leader has to retrieve the checkpointed job from ZooKeeper
 * and continue its execution. This time the job does not block and can therefore finish.
 * The execution result should be sent to the JobClientActor which originally submitted
 * the job.
 *
 * @throws Exception
 */
@Test
public void testJobClientRecovery() throws Exception {
    File rootFolder = tempFolder.getRoot();
    Configuration config = ZooKeeperTestUtils.createZooKeeperHAConfig(zkServer.getConnectString(), rootFolder.getPath());
    config.setInteger(ConfigConstants.LOCAL_NUMBER_JOB_MANAGER, 2);
    config.setInteger(ConfigConstants.LOCAL_NUMBER_TASK_MANAGER, 1);
    final TestingCluster cluster = new TestingCluster(config);
    cluster.start();
    JobVertex blockingVertex = new JobVertex("Blocking Vertex");
    blockingVertex.setInvokableClass(BlockingTask.class);
    blockingVertex.setParallelism(1);
    final JobGraph jobGraph = new JobGraph("Blocking Test Job", blockingVertex);
    final Promise<JobExecutionResult> promise = new scala.concurrent.impl.Promise.DefaultPromise<>();
    Deadline deadline = new FiniteDuration(2, TimeUnit.MINUTES).fromNow();
    try {
        Thread submitter = new Thread(new Runnable() {

            @Override
            public void run() {
                try {
                    JobExecutionResult result = cluster.submitJobAndWait(jobGraph, false);
                    promise.success(result);
                } catch (Exception e) {
                    promise.failure(e);
                }
            }
        });
        submitter.start();
        synchronized (BlockingTask.waitLock) {
            while (BlockingTask.HasBlockedExecution < 1 && deadline.hasTimeLeft()) {
                BlockingTask.waitLock.wait(deadline.timeLeft().toMillis());
            }
        }
        if (deadline.isOverdue()) {
            Assert.fail("The job has not blocked within the given deadline.");
        }
        ActorGateway gateway = cluster.getLeaderGateway(deadline.timeLeft());
        gateway.tell(TestingJobManagerMessages.getDisablePostStop());
        gateway.tell(PoisonPill.getInstance());
        // if the job fails then an exception is thrown here
        Await.result(promise.future(), deadline.timeLeft());
    } finally {
        cluster.shutdown();
    }
}
Also used : Configuration(org.apache.flink.configuration.Configuration) Deadline(scala.concurrent.duration.Deadline) FiniteDuration(scala.concurrent.duration.FiniteDuration) JobExecutionResult(org.apache.flink.api.common.JobExecutionResult) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) TestingCluster(org.apache.flink.runtime.testingUtils.TestingCluster) JobVertex(org.apache.flink.runtime.jobgraph.JobVertex) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) File(java.io.File) Test(org.junit.Test)
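
Example 19 pairs the deadline with a classic monitor wait: waitLock.wait(...) is bounded by the remaining budget, the condition is re-checked in a loop to survive spurious wake-ups, and isOverdue() afterwards tells a genuine wake-up apart from a timeout. A hedged sketch of just that idiom; the lock, the flag, and the Math.max(1, ...) guard are ours (Object.wait(0) blocks forever, so a remainder that rounds down to zero milliseconds must be avoided).

import scala.concurrent.duration.Deadline;

final class BoundedMonitorWait {

    private static final Object waitLock = new Object();

    private static boolean ready = false;

    static void awaitReady(Deadline deadline) throws InterruptedException {
        synchronized (waitLock) {
            // wait() can return spuriously, so the condition is re-checked in a loop.
            while (!ready && deadline.hasTimeLeft()) {
                // Never pass 0: Object.wait(0) means "wait indefinitely".
                waitLock.wait(Math.max(1L, deadline.timeLeft().toMillis()));
            }
        }
        if (deadline.isOverdue()) {
            throw new IllegalStateException("Condition was not met within the deadline.");
        }
    }

    static void signalReady() {
        synchronized (waitLock) {
            ready = true;
            waitLock.notifyAll();
        }
    }
}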

Example 20 with Deadline

Use of scala.concurrent.duration.Deadline in project flink by apache.

The class KvStateClientTest, method testServerClosesChannel.

/**
 * Tests that closing a server channel closes the connection and removes it
 * from the set of established connections.
 */
@Test
public void testServerClosesChannel() throws Exception {
    Deadline deadline = TEST_TIMEOUT.fromNow();
    AtomicKvStateRequestStats stats = new AtomicKvStateRequestStats();
    KvStateClient client = null;
    Channel serverChannel = null;
    try {
        client = new KvStateClient(1, stats);
        final AtomicBoolean received = new AtomicBoolean();
        final AtomicReference<Channel> channel = new AtomicReference<>();
        serverChannel = createServerChannel(new ChannelInboundHandlerAdapter() {

            @Override
            public void channelActive(ChannelHandlerContext ctx) throws Exception {
                channel.set(ctx.channel());
            }

            @Override
            public void channelRead(ChannelHandlerContext ctx, Object msg) throws Exception {
                received.set(true);
            }
        });
        KvStateServerAddress serverAddress = getKvStateServerAddress(serverChannel);
        // Issue a request; the server receives it but never responds
        Future<byte[]> future = client.getKvState(serverAddress, new KvStateID(), new byte[0]);
        while (!received.get() && deadline.hasTimeLeft()) {
            Thread.sleep(50);
        }
        assertTrue("Receive timed out", received.get());
        assertEquals(1, stats.getNumConnections());
        channel.get().close().await(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);
        try {
            Await.result(future, deadline.timeLeft());
            fail("Did not throw expected server failure");
        } catch (ClosedChannelException ignored) {
        // Expected
        }
        assertEquals(0, stats.getNumConnections());
        // Counts can take some time to propagate
        while (deadline.hasTimeLeft() && (stats.getNumSuccessful() != 0 || stats.getNumFailed() != 1)) {
            Thread.sleep(100);
        }
        assertEquals(1, stats.getNumRequests());
        assertEquals(0, stats.getNumSuccessful());
        assertEquals(1, stats.getNumFailed());
    } finally {
        if (client != null) {
            client.shutDown();
        }
        if (serverChannel != null) {
            serverChannel.close();
        }
        assertEquals("Channel leak", 0, stats.getNumConnections());
    }
}
Also used : ClosedChannelException(java.nio.channels.ClosedChannelException) Deadline(scala.concurrent.duration.Deadline) SocketChannel(io.netty.channel.socket.SocketChannel) NioServerSocketChannel(io.netty.channel.socket.nio.NioServerSocketChannel) Channel(io.netty.channel.Channel) KvStateServerAddress(org.apache.flink.runtime.query.KvStateServerAddress) AtomicReference(java.util.concurrent.atomic.AtomicReference) ChannelHandlerContext(io.netty.channel.ChannelHandlerContext) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) KvStateID(org.apache.flink.runtime.query.KvStateID) ChannelInboundHandlerAdapter(io.netty.channel.ChannelInboundHandlerAdapter) Test(org.junit.Test)
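
Two details of this last test generalize well: deadline.timeLeft() feeds Scala's Await.result directly, and timeLeft().toMillis() adapts the same budget to millisecond-based APIs such as Netty's ChannelFuture.await. A hedged sketch of the expected-failure assertion under a shared deadline; the future type and the expected exception mirror the test above, while the class and method names are ours.

import java.nio.channels.ClosedChannelException;

import scala.concurrent.Await;
import scala.concurrent.Future;
import scala.concurrent.duration.Deadline;

final class ExpectFailureSketch {

    // Fails unless the future completes exceptionally with
    // ClosedChannelException before the deadline runs out.
    static void expectClosedChannel(Future<byte[]> future, Deadline deadline) throws Exception {
        try {
            Await.result(future, deadline.timeLeft());
            throw new AssertionError("Did not throw expected server failure");
        } catch (ClosedChannelException ignored) {
            // Expected: the server closed the channel before responding.
        }
    }
}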

Aggregations

Deadline (scala.concurrent.duration.Deadline): 59
Test (org.junit.Test): 50
JobGraph (org.apache.flink.runtime.jobgraph.JobGraph): 31
Configuration (org.apache.flink.configuration.Configuration): 28
FiniteDuration (scala.concurrent.duration.FiniteDuration): 24
JobID (org.apache.flink.api.common.JobID): 21
ActorGateway (org.apache.flink.runtime.instance.ActorGateway): 20
ActorRef (akka.actor.ActorRef): 12
File (java.io.File): 12
JobVertex (org.apache.flink.runtime.jobgraph.JobVertex): 12
TestingCluster (org.apache.flink.runtime.testingUtils.TestingCluster): 12
Tuple2 (org.apache.flink.api.java.tuple.Tuple2): 10
AkkaActorGateway (org.apache.flink.runtime.instance.AkkaActorGateway): 10
StreamExecutionEnvironment (org.apache.flink.streaming.api.environment.StreamExecutionEnvironment): 10
ActorSystem (akka.actor.ActorSystem): 9
ArrayList (java.util.ArrayList): 9
JobManagerMessages (org.apache.flink.runtime.messages.JobManagerMessages): 9
UUID (java.util.UUID): 8
AtomicLong (java.util.concurrent.atomic.AtomicLong): 8
KeySelector (org.apache.flink.api.java.functions.KeySelector): 8