Search in sources :

Example 1 with WaitForAllVerticesToBeRunning

use of org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.WaitForAllVerticesToBeRunning in project flink by apache.

the class BackPressureStatsTrackerITCase method testBackPressuredProducer.

/**
	 * Tests a simple fake-back pressured task. Back pressure is assumed when
	 * sampled stack traces are in blocking buffer requests.
	 */
@Test
public void testBackPressuredProducer() throws Exception {
    new JavaTestKit(testActorSystem) {

        {
            final FiniteDuration deadline = new FiniteDuration(60, TimeUnit.SECONDS);
            // The JobGraph
            final JobGraph jobGraph = new JobGraph();
            final int parallelism = 4;
            final JobVertex task = new JobVertex("Task");
            task.setInvokableClass(BackPressuredTask.class);
            task.setParallelism(parallelism);
            jobGraph.addVertex(task);
            ActorGateway jobManger = null;
            ActorGateway taskManager = null;
            //
            // 1) Consume all buffers at first (no buffers for the test task)
            //
            testBufferPool = networkBufferPool.createBufferPool(1, Integer.MAX_VALUE);
            final List<Buffer> buffers = new ArrayList<>();
            while (true) {
                Buffer buffer = testBufferPool.requestBuffer();
                if (buffer != null) {
                    buffers.add(buffer);
                } else {
                    break;
                }
            }
            try {
                jobManger = TestingUtils.createJobManager(testActorSystem, TestingUtils.defaultExecutor(), TestingUtils.defaultExecutor(), new Configuration());
                final Configuration config = new Configuration();
                config.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, parallelism);
                taskManager = TestingUtils.createTaskManager(testActorSystem, jobManger, config, true, true);
                final ActorGateway jm = jobManger;
                new Within(deadline) {

                    @Override
                    protected void run() {
                        try {
                            ActorGateway testActor = new AkkaActorGateway(getTestActor(), null);
                            // Submit the job and wait until it is running
                            JobClient.submitJobDetached(jm, config, jobGraph, deadline, ClassLoader.getSystemClassLoader());
                            jm.tell(new WaitForAllVerticesToBeRunning(jobGraph.getJobID()), testActor);
                            expectMsgEquals(new AllVerticesRunning(jobGraph.getJobID()));
                            // Get the ExecutionGraph
                            jm.tell(new RequestExecutionGraph(jobGraph.getJobID()), testActor);
                            ExecutionGraphFound executionGraphResponse = expectMsgClass(ExecutionGraphFound.class);
                            ExecutionGraph executionGraph = (ExecutionGraph) executionGraphResponse.executionGraph();
                            ExecutionJobVertex vertex = executionGraph.getJobVertex(task.getID());
                            StackTraceSampleCoordinator coordinator = new StackTraceSampleCoordinator(testActorSystem.dispatcher(), 60000);
                            // Verify back pressure (clean up interval can be ignored)
                            BackPressureStatsTracker statsTracker = new BackPressureStatsTracker(coordinator, 100 * 1000, 20, Time.milliseconds(10L));
                            int numAttempts = 10;
                            int nextSampleId = 0;
                            // the buffer.
                            for (int attempt = 0; attempt < numAttempts; attempt++) {
                                try {
                                    OperatorBackPressureStats stats = triggerStatsSample(statsTracker, vertex);
                                    assertEquals(nextSampleId + attempt, stats.getSampleId());
                                    assertEquals(parallelism, stats.getNumberOfSubTasks());
                                    assertEquals(1.0, stats.getMaxBackPressureRatio(), 0.0);
                                    for (int i = 0; i < parallelism; i++) {
                                        assertEquals(1.0, stats.getBackPressureRatio(i), 0.0);
                                    }
                                    nextSampleId = stats.getSampleId() + 1;
                                    break;
                                } catch (Throwable t) {
                                    if (attempt == numAttempts - 1) {
                                        throw t;
                                    } else {
                                        Thread.sleep(500);
                                    }
                                }
                            }
                            //
                            for (Buffer buf : buffers) {
                                buf.recycle();
                            }
                            // grab them and then immediately release them.
                            while (testBufferPool.getNumberOfAvailableMemorySegments() < 100) {
                                Thread.sleep(100);
                            }
                            // Verify that no task is back pressured any more.
                            for (int attempt = 0; attempt < numAttempts; attempt++) {
                                try {
                                    OperatorBackPressureStats stats = triggerStatsSample(statsTracker, vertex);
                                    assertEquals(nextSampleId + attempt, stats.getSampleId());
                                    assertEquals(parallelism, stats.getNumberOfSubTasks());
                                    // Verify that no task is back pressured
                                    for (int i = 0; i < parallelism; i++) {
                                        assertEquals(0.0, stats.getBackPressureRatio(i), 0.0);
                                    }
                                    break;
                                } catch (Throwable t) {
                                    if (attempt == numAttempts - 1) {
                                        throw t;
                                    } else {
                                        Thread.sleep(500);
                                    }
                                }
                            }
                            // Shut down
                            jm.tell(new TestingJobManagerMessages.NotifyWhenJobRemoved(jobGraph.getJobID()), testActor);
                            // Cancel job
                            jm.tell(new JobManagerMessages.CancelJob(jobGraph.getJobID()));
                            // Response to removal notification
                            expectMsgEquals(true);
                            //
                            // 3) Trigger stats for archived job
                            //
                            statsTracker.invalidateOperatorStatsCache();
                            assertFalse("Unexpected trigger", statsTracker.triggerStackTraceSample(vertex));
                        } catch (Exception e) {
                            e.printStackTrace();
                            fail(e.getMessage());
                        }
                    }
                };
            } finally {
                TestingUtils.stopActor(jobManger);
                TestingUtils.stopActor(taskManager);
                for (Buffer buf : buffers) {
                    buf.recycle();
                }
                testBufferPool.lazyDestroy();
            }
        }
    };
}
Also used : AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) Configuration(org.apache.flink.configuration.Configuration) ArrayList(java.util.ArrayList) AllVerticesRunning(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.AllVerticesRunning) TestingJobManagerMessages(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages) ExecutionJobVertex(org.apache.flink.runtime.executiongraph.ExecutionJobVertex) AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) RequestExecutionGraph(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.RequestExecutionGraph) ExecutionGraphFound(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.ExecutionGraphFound) Buffer(org.apache.flink.runtime.io.network.buffer.Buffer) WaitForAllVerticesToBeRunning(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.WaitForAllVerticesToBeRunning) TestingJobManagerMessages(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages) JobManagerMessages(org.apache.flink.runtime.messages.JobManagerMessages) FiniteDuration(scala.concurrent.duration.FiniteDuration) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) JobVertex(org.apache.flink.runtime.jobgraph.JobVertex) ExecutionJobVertex(org.apache.flink.runtime.executiongraph.ExecutionJobVertex) ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph) RequestExecutionGraph(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.RequestExecutionGraph) JavaTestKit(akka.testkit.JavaTestKit) Test(org.junit.Test)

Example 2 with WaitForAllVerticesToBeRunning

use of org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.WaitForAllVerticesToBeRunning in project flink by apache.

the class JobManagerTest method testStopSignal.

@Test
public void testStopSignal() throws Exception {
    new JavaTestKit(system) {

        {
            new Within(duration("15 seconds")) {

                @Override
                protected void run() {
                    // Setup
                    TestingCluster cluster = null;
                    try {
                        cluster = startTestingCluster(2, 1, DEFAULT_AKKA_ASK_TIMEOUT());
                        // Create a task
                        final JobVertex sender = new JobVertex("Sender");
                        sender.setParallelism(2);
                        sender.setInvokableClass(StoppableInvokable.class);
                        final JobGraph jobGraph = new JobGraph("Stoppable streaming test job", sender);
                        final JobID jid = jobGraph.getJobID();
                        final ActorGateway jobManagerGateway = cluster.getLeaderGateway(TestingUtils.TESTING_DURATION());
                        // we can set the leader session ID to None because we don't use this gateway to send messages
                        final ActorGateway testActorGateway = new AkkaActorGateway(getTestActor(), null);
                        // Submit the job and wait for all vertices to be running
                        jobManagerGateway.tell(new SubmitJob(jobGraph, ListeningBehaviour.EXECUTION_RESULT), testActorGateway);
                        expectMsgClass(JobSubmitSuccess.class);
                        jobManagerGateway.tell(new WaitForAllVerticesToBeRunning(jid), testActorGateway);
                        expectMsgClass(AllVerticesRunning.class);
                        jobManagerGateway.tell(new StopJob(jid), testActorGateway);
                        // - The test ----------------------------------------------------------------------
                        expectMsgClass(StoppingSuccess.class);
                        expectMsgClass(JobResultSuccess.class);
                    } finally {
                        if (cluster != null) {
                            cluster.shutdown();
                        }
                    }
                }
            };
        }
    };
}
Also used : AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) TestingUtils.startTestingCluster(org.apache.flink.runtime.testingUtils.TestingUtils.startTestingCluster) TestingCluster(org.apache.flink.runtime.testingUtils.TestingCluster) JobVertex(org.apache.flink.runtime.jobgraph.JobVertex) WaitForAllVerticesToBeRunning(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.WaitForAllVerticesToBeRunning) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) SubmitJob(org.apache.flink.runtime.messages.JobManagerMessages.SubmitJob) StopJob(org.apache.flink.runtime.messages.JobManagerMessages.StopJob) JavaTestKit(akka.testkit.JavaTestKit) JobID(org.apache.flink.api.common.JobID) Test(org.junit.Test)

Example 3 with WaitForAllVerticesToBeRunning

use of org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.WaitForAllVerticesToBeRunning in project flink by apache.

the class StackTraceSampleCoordinatorITCase method testTaskClearedWhileSampling.

/**
	 * Tests that a cleared task is answered with a partial success response.
	 */
@Test
public void testTaskClearedWhileSampling() throws Exception {
    new JavaTestKit(testActorSystem) {

        {
            final FiniteDuration deadline = new FiniteDuration(60, TimeUnit.SECONDS);
            // The JobGraph
            final JobGraph jobGraph = new JobGraph();
            final int parallelism = 1;
            final JobVertex task = new JobVertex("Task");
            task.setInvokableClass(BlockingNoOpInvokable.class);
            task.setParallelism(parallelism);
            jobGraph.addVertex(task);
            ActorGateway jobManger = null;
            ActorGateway taskManager = null;
            try {
                jobManger = TestingUtils.createJobManager(testActorSystem, TestingUtils.defaultExecutor(), TestingUtils.defaultExecutor(), new Configuration());
                final Configuration config = new Configuration();
                config.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, parallelism);
                taskManager = TestingUtils.createTaskManager(testActorSystem, jobManger, config, true, true);
                final ActorGateway jm = jobManger;
                new Within(deadline) {

                    @Override
                    protected void run() {
                        try {
                            ActorGateway testActor = new AkkaActorGateway(getTestActor(), null);
                            int maxAttempts = 10;
                            int sleepTime = 100;
                            for (int i = 0; i < maxAttempts; i++, sleepTime *= 2) {
                                // Submit the job and wait until it is running
                                JobClient.submitJobDetached(jm, config, jobGraph, deadline, ClassLoader.getSystemClassLoader());
                                jm.tell(new WaitForAllVerticesToBeRunning(jobGraph.getJobID()), testActor);
                                expectMsgEquals(new AllVerticesRunning(jobGraph.getJobID()));
                                // Get the ExecutionGraph
                                jm.tell(new RequestExecutionGraph(jobGraph.getJobID()), testActor);
                                ExecutionGraphFound executionGraphResponse = expectMsgClass(ExecutionGraphFound.class);
                                ExecutionGraph executionGraph = (ExecutionGraph) executionGraphResponse.executionGraph();
                                ExecutionJobVertex vertex = executionGraph.getJobVertex(task.getID());
                                StackTraceSampleCoordinator coordinator = new StackTraceSampleCoordinator(testActorSystem.dispatcher(), 60000);
                                Future<StackTraceSample> sampleFuture = coordinator.triggerStackTraceSample(vertex.getTaskVertices(), // sampling.
                                21474700 * 100, Time.milliseconds(10L), 0);
                                // Wait before cancelling so that some samples
                                // are actually taken.
                                Thread.sleep(sleepTime);
                                // Cancel job
                                scala.concurrent.Future<?> removeFuture = jm.ask(new TestingJobManagerMessages.NotifyWhenJobRemoved(jobGraph.getJobID()), remaining());
                                jm.tell(new JobManagerMessages.CancelJob(jobGraph.getJobID()));
                                try {
                                    // Throws Exception on failure
                                    sampleFuture.get(remaining().toMillis(), TimeUnit.MILLISECONDS);
                                    // partial result.
                                    break;
                                } catch (Throwable t) {
                                // We were too fast in cancelling the job.
                                // Fall through and retry.
                                } finally {
                                    Await.ready(removeFuture, remaining());
                                }
                            }
                        } catch (Exception e) {
                            e.printStackTrace();
                            fail(e.getMessage());
                        }
                    }
                };
            } finally {
                TestingUtils.stopActor(jobManger);
                TestingUtils.stopActor(taskManager);
            }
        }
    };
}
Also used : AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) Configuration(org.apache.flink.configuration.Configuration) AllVerticesRunning(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.AllVerticesRunning) TestingJobManagerMessages(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages) ExecutionJobVertex(org.apache.flink.runtime.executiongraph.ExecutionJobVertex) AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) RequestExecutionGraph(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.RequestExecutionGraph) ExecutionGraphFound(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.ExecutionGraphFound) WaitForAllVerticesToBeRunning(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.WaitForAllVerticesToBeRunning) TestingJobManagerMessages(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages) JobManagerMessages(org.apache.flink.runtime.messages.JobManagerMessages) FiniteDuration(scala.concurrent.duration.FiniteDuration) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) JobVertex(org.apache.flink.runtime.jobgraph.JobVertex) ExecutionJobVertex(org.apache.flink.runtime.executiongraph.ExecutionJobVertex) ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph) RequestExecutionGraph(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.RequestExecutionGraph) JavaTestKit(akka.testkit.JavaTestKit) Test(org.junit.Test)

Example 4 with WaitForAllVerticesToBeRunning

use of org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.WaitForAllVerticesToBeRunning in project flink by apache.

the class TaskCancelAsyncProducerConsumerITCase method testCancelAsyncProducerAndConsumer.

/**
	 * Tests that a task waiting on an async producer/consumer that is stuck
	 * in a blocking buffer request can be properly cancelled.
	 *
	 * <p>This is currently required for the Flink Kafka sources, which spawn
	 * a separate Thread consuming from Kafka and producing the intermediate
	 * streams in the spawned Thread instead of the main task Thread.
	 */
@Test
public void testCancelAsyncProducerAndConsumer() throws Exception {
    Deadline deadline = new FiniteDuration(2, TimeUnit.MINUTES).fromNow();
    TestingCluster flink = null;
    try {
        // Cluster
        Configuration config = new Configuration();
        config.setInteger(ConfigConstants.LOCAL_NUMBER_TASK_MANAGER, 1);
        config.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, 1);
        config.setInteger(ConfigConstants.TASK_MANAGER_MEMORY_SEGMENT_SIZE_KEY, 4096);
        config.setInteger(ConfigConstants.TASK_MANAGER_NETWORK_NUM_BUFFERS_KEY, 8);
        flink = new TestingCluster(config, true);
        flink.start();
        // Job with async producer and consumer
        JobVertex producer = new JobVertex("AsyncProducer");
        producer.setParallelism(1);
        producer.setInvokableClass(AsyncProducer.class);
        JobVertex consumer = new JobVertex("AsyncConsumer");
        consumer.setParallelism(1);
        consumer.setInvokableClass(AsyncConsumer.class);
        consumer.connectNewDataSetAsInput(producer, DistributionPattern.POINTWISE, ResultPartitionType.PIPELINED);
        SlotSharingGroup slot = new SlotSharingGroup(producer.getID(), consumer.getID());
        producer.setSlotSharingGroup(slot);
        consumer.setSlotSharingGroup(slot);
        JobGraph jobGraph = new JobGraph(producer, consumer);
        // Submit job and wait until running
        ActorGateway jobManager = flink.getLeaderGateway(deadline.timeLeft());
        flink.submitJobDetached(jobGraph);
        Object msg = new WaitForAllVerticesToBeRunning(jobGraph.getJobID());
        Future<?> runningFuture = jobManager.ask(msg, deadline.timeLeft());
        Await.ready(runningFuture, deadline.timeLeft());
        // Wait for blocking requests, cancel and wait for cancellation
        msg = new NotifyWhenJobStatus(jobGraph.getJobID(), JobStatus.CANCELED);
        Future<?> cancelledFuture = jobManager.ask(msg, deadline.timeLeft());
        boolean producerBlocked = false;
        for (int i = 0; i < 50; i++) {
            Thread thread = ASYNC_PRODUCER_THREAD;
            if (thread != null && thread.isAlive()) {
                StackTraceElement[] stackTrace = thread.getStackTrace();
                producerBlocked = isInBlockingBufferRequest(stackTrace);
            }
            if (producerBlocked) {
                break;
            } else {
                // Retry
                Thread.sleep(500);
            }
        }
        // Verify that async producer is in blocking request
        assertTrue("Producer thread is not blocked: " + Arrays.toString(ASYNC_CONSUMER_THREAD.getStackTrace()), producerBlocked);
        boolean consumerWaiting = false;
        for (int i = 0; i < 50; i++) {
            Thread thread = ASYNC_CONSUMER_THREAD;
            if (thread != null && thread.isAlive()) {
                consumerWaiting = thread.getState() == Thread.State.WAITING;
            }
            if (consumerWaiting) {
                break;
            } else {
                // Retry
                Thread.sleep(500);
            }
        }
        // Verify that async consumer is in blocking request
        assertTrue("Consumer thread is not blocked.", consumerWaiting);
        msg = new CancelJob(jobGraph.getJobID());
        Future<?> cancelFuture = jobManager.ask(msg, deadline.timeLeft());
        Await.ready(cancelFuture, deadline.timeLeft());
        Await.ready(cancelledFuture, deadline.timeLeft());
        // Verify the expected Exceptions
        assertNotNull(ASYNC_PRODUCER_EXCEPTION);
        assertEquals(IllegalStateException.class, ASYNC_PRODUCER_EXCEPTION.getClass());
        assertNotNull(ASYNC_CONSUMER_EXCEPTION);
        assertEquals(IllegalStateException.class, ASYNC_CONSUMER_EXCEPTION.getClass());
    } finally {
        if (flink != null) {
            flink.shutdown();
        }
    }
}
Also used : Configuration(org.apache.flink.configuration.Configuration) WaitForAllVerticesToBeRunning(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.WaitForAllVerticesToBeRunning) Deadline(scala.concurrent.duration.Deadline) FiniteDuration(scala.concurrent.duration.FiniteDuration) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) TestingCluster(org.apache.flink.runtime.testingUtils.TestingCluster) JobVertex(org.apache.flink.runtime.jobgraph.JobVertex) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) CancelJob(org.apache.flink.runtime.messages.JobManagerMessages.CancelJob) SlotSharingGroup(org.apache.flink.runtime.jobmanager.scheduler.SlotSharingGroup) NotifyWhenJobStatus(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.NotifyWhenJobStatus) Test(org.junit.Test)

Example 5 with WaitForAllVerticesToBeRunning

use of org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.WaitForAllVerticesToBeRunning in project flink by apache.

the class SavepointITCase method testTriggerSavepointAndResumeWithFileBasedCheckpoints.

/**
	 * Triggers a savepoint for a job that uses the FsStateBackend. We expect
	 * that all checkpoint files are written to a new savepoint directory.
	 *
	 * <ol>
	 * <li>Submit job, wait for some progress</li>
	 * <li>Trigger savepoint and verify that savepoint has been created</li>
	 * <li>Shut down the cluster, re-submit the job from the savepoint,
	 * verify that the initial state has been reset, and
	 * all tasks are running again</li>
	 * <li>Cancel job, dispose the savepoint, and verify that everything
	 * has been cleaned up</li>
	 * </ol>
	 */
@Test
public void testTriggerSavepointAndResumeWithFileBasedCheckpoints() throws Exception {
    // Config
    final int numTaskManagers = 2;
    final int numSlotsPerTaskManager = 2;
    final int parallelism = numTaskManagers * numSlotsPerTaskManager;
    final Deadline deadline = new FiniteDuration(5, TimeUnit.MINUTES).fromNow();
    final File testRoot = folder.newFolder();
    TestingCluster flink = null;
    try {
        // Create a test actor system
        ActorSystem testActorSystem = AkkaUtils.createDefaultActorSystem();
        // Flink configuration
        final Configuration config = new Configuration();
        config.setInteger(ConfigConstants.LOCAL_NUMBER_TASK_MANAGER, numTaskManagers);
        config.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, numSlotsPerTaskManager);
        final File checkpointDir = new File(testRoot, "checkpoints");
        final File savepointRootDir = new File(testRoot, "savepoints");
        if (!checkpointDir.mkdir() || !savepointRootDir.mkdirs()) {
            fail("Test setup failed: failed to create temporary directories.");
        }
        // Use file based checkpoints
        config.setString(CoreOptions.STATE_BACKEND, "filesystem");
        config.setString(FsStateBackendFactory.CHECKPOINT_DIRECTORY_URI_CONF_KEY, checkpointDir.toURI().toString());
        config.setString(FsStateBackendFactory.MEMORY_THRESHOLD_CONF_KEY, "0");
        config.setString(ConfigConstants.SAVEPOINT_DIRECTORY_KEY, savepointRootDir.toURI().toString());
        // Start Flink
        flink = new TestingCluster(config);
        flink.start(true);
        // Submit the job
        final JobGraph jobGraph = createJobGraph(parallelism, 0, 1000);
        final JobID jobId = jobGraph.getJobID();
        // Reset the static test job helpers
        StatefulCounter.resetForTest(parallelism);
        // Retrieve the job manager
        ActorGateway jobManager = Await.result(flink.leaderGateway().future(), deadline.timeLeft());
        LOG.info("Submitting job " + jobGraph.getJobID() + " in detached mode.");
        flink.submitJobDetached(jobGraph);
        LOG.info("Waiting for some progress.");
        // wait for the JobManager to be ready
        Future<Object> allRunning = jobManager.ask(new WaitForAllVerticesToBeRunning(jobId), deadline.timeLeft());
        Await.ready(allRunning, deadline.timeLeft());
        // wait for the Tasks to be ready
        StatefulCounter.getProgressLatch().await(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);
        LOG.info("Triggering a savepoint.");
        Future<Object> savepointPathFuture = jobManager.ask(new TriggerSavepoint(jobId, Option.<String>empty()), deadline.timeLeft());
        final String savepointPath = ((TriggerSavepointSuccess) Await.result(savepointPathFuture, deadline.timeLeft())).savepointPath();
        LOG.info("Retrieved savepoint path: " + savepointPath + ".");
        // Retrieve the savepoint from the testing job manager
        LOG.info("Requesting the savepoint.");
        Future<Object> savepointFuture = jobManager.ask(new RequestSavepoint(savepointPath), deadline.timeLeft());
        SavepointV1 savepoint = (SavepointV1) ((ResponseSavepoint) Await.result(savepointFuture, deadline.timeLeft())).savepoint();
        LOG.info("Retrieved savepoint: " + savepointPath + ".");
        // Shut down the Flink cluster (thereby canceling the job)
        LOG.info("Shutting down Flink cluster.");
        flink.shutdown();
        flink.awaitTermination();
        // - Verification START -------------------------------------------
        // Only one savepoint should exist
        File[] files = savepointRootDir.listFiles();
        if (files != null) {
            assertEquals("Savepoint not created in expected directory", 1, files.length);
            assertTrue("Savepoint did not create self-contained directory", files[0].isDirectory());
            File savepointDir = files[0];
            File[] savepointFiles = savepointDir.listFiles();
            assertNotNull(savepointFiles);
            // Expect one metadata file and one checkpoint file per stateful
            // parallel subtask
            String errMsg = "Did not write expected number of savepoint/checkpoint files to directory: " + Arrays.toString(savepointFiles);
            assertEquals(errMsg, 1 + parallelism, savepointFiles.length);
        } else {
            fail("Savepoint not created in expected directory");
        }
        // We currently have the following directory layout: checkpointDir/jobId/chk-ID
        File jobCheckpoints = new File(checkpointDir, jobId.toString());
        if (jobCheckpoints.exists()) {
            files = jobCheckpoints.listFiles();
            assertNotNull("Checkpoint directory empty", files);
            assertEquals("Checkpoints directory not clean: " + Arrays.toString(files), 0, files.length);
        }
        // - Verification END ---------------------------------------------
        // Restart the cluster
        LOG.info("Restarting Flink cluster.");
        flink.start();
        // Retrieve the job manager
        LOG.info("Retrieving JobManager.");
        jobManager = Await.result(flink.leaderGateway().future(), deadline.timeLeft());
        LOG.info("JobManager: " + jobManager + ".");
        // Reset static test helpers
        StatefulCounter.resetForTest(parallelism);
        // Gather all task deployment descriptors
        final Throwable[] error = new Throwable[1];
        final TestingCluster finalFlink = flink;
        final Multimap<JobVertexID, TaskDeploymentDescriptor> tdds = HashMultimap.create();
        new JavaTestKit(testActorSystem) {

            {
                new Within(deadline.timeLeft()) {

                    @Override
                    protected void run() {
                        try {
                            // Register to all submit task messages for job
                            for (ActorRef taskManager : finalFlink.getTaskManagersAsJava()) {
                                taskManager.tell(new TestingTaskManagerMessages.RegisterSubmitTaskListener(jobId), getTestActor());
                            }
                            // Set the savepoint path
                            jobGraph.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath));
                            LOG.info("Resubmitting job " + jobGraph.getJobID() + " with " + "savepoint path " + savepointPath + " in detached mode.");
                            // Submit the job
                            finalFlink.submitJobDetached(jobGraph);
                            int numTasks = 0;
                            for (JobVertex jobVertex : jobGraph.getVertices()) {
                                numTasks += jobVertex.getParallelism();
                            }
                            // Gather the task deployment descriptors
                            LOG.info("Gathering " + numTasks + " submitted " + "TaskDeploymentDescriptor instances.");
                            for (int i = 0; i < numTasks; i++) {
                                ResponseSubmitTaskListener resp = (ResponseSubmitTaskListener) expectMsgAnyClassOf(getRemainingTime(), ResponseSubmitTaskListener.class);
                                TaskDeploymentDescriptor tdd = resp.tdd();
                                LOG.info("Received: " + tdd.toString() + ".");
                                TaskInformation taskInformation = tdd.getSerializedTaskInformation().deserializeValue(getClass().getClassLoader());
                                tdds.put(taskInformation.getJobVertexId(), tdd);
                            }
                        } catch (Throwable t) {
                            error[0] = t;
                        }
                    }
                };
            }
        };
        // - Verification START -------------------------------------------
        String errMsg = "Error during gathering of TaskDeploymentDescriptors";
        assertNull(errMsg, error[0]);
        // have a matching task deployment descriptor.
        for (TaskState taskState : savepoint.getTaskStates()) {
            Collection<TaskDeploymentDescriptor> taskTdds = tdds.get(taskState.getJobVertexID());
            errMsg = "Missing task for savepoint state for operator " + taskState.getJobVertexID() + ".";
            assertTrue(errMsg, taskTdds.size() > 0);
            assertEquals(taskState.getNumberCollectedStates(), taskTdds.size());
            for (TaskDeploymentDescriptor tdd : taskTdds) {
                SubtaskState subtaskState = taskState.getState(tdd.getSubtaskIndex());
                assertNotNull(subtaskState);
                errMsg = "Initial operator state mismatch.";
                assertEquals(errMsg, subtaskState.getLegacyOperatorState(), tdd.getTaskStateHandles().getLegacyOperatorState());
            }
        }
        // Await state is restored
        StatefulCounter.getRestoreLatch().await(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);
        // Await some progress after restore
        StatefulCounter.getProgressLatch().await(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);
        // - Verification END ---------------------------------------------
        LOG.info("Cancelling job " + jobId + ".");
        jobManager.tell(new CancelJob(jobId));
        LOG.info("Disposing savepoint " + savepointPath + ".");
        Future<Object> disposeFuture = jobManager.ask(new DisposeSavepoint(savepointPath), deadline.timeLeft());
        errMsg = "Failed to dispose savepoint " + savepointPath + ".";
        Object resp = Await.result(disposeFuture, deadline.timeLeft());
        assertTrue(errMsg, resp.getClass() == getDisposeSavepointSuccess().getClass());
        // - Verification START -------------------------------------------
        // The checkpoint files
        List<File> checkpointFiles = new ArrayList<>();
        for (TaskState stateForTaskGroup : savepoint.getTaskStates()) {
            for (SubtaskState subtaskState : stateForTaskGroup.getStates()) {
                ChainedStateHandle<StreamStateHandle> streamTaskState = subtaskState.getLegacyOperatorState();
                for (int i = 0; i < streamTaskState.getLength(); i++) {
                    if (streamTaskState.get(i) != null) {
                        FileStateHandle fileStateHandle = (FileStateHandle) streamTaskState.get(i);
                        checkpointFiles.add(new File(fileStateHandle.getFilePath().toUri()));
                    }
                }
            }
        }
        // The checkpoint files of the savepoint should have been discarded
        for (File f : checkpointFiles) {
            errMsg = "Checkpoint file " + f + " not cleaned up properly.";
            assertFalse(errMsg, f.exists());
        }
        if (checkpointFiles.size() > 0) {
            File parent = checkpointFiles.get(0).getParentFile();
            errMsg = "Checkpoint parent directory " + parent + " not cleaned up properly.";
            assertFalse(errMsg, parent.exists());
        }
        // All savepoints should have been cleaned up
        errMsg = "Savepoints directory not cleaned up properly: " + Arrays.toString(savepointRootDir.listFiles()) + ".";
        assertEquals(errMsg, 0, savepointRootDir.listFiles().length);
    // - Verification END ---------------------------------------------
    } finally {
        if (flink != null) {
            flink.shutdown();
        }
    }
}
Also used : ActorSystem(akka.actor.ActorSystem) RequestSavepoint(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.RequestSavepoint) Configuration(org.apache.flink.configuration.Configuration) ActorRef(akka.actor.ActorRef) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) ArrayList(java.util.ArrayList) ResponseSubmitTaskListener(org.apache.flink.runtime.testingUtils.TestingTaskManagerMessages.ResponseSubmitTaskListener) TestingCluster(org.apache.flink.runtime.testingUtils.TestingCluster) StreamStateHandle(org.apache.flink.runtime.state.StreamStateHandle) SavepointV1(org.apache.flink.runtime.checkpoint.savepoint.SavepointV1) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) TaskDeploymentDescriptor(org.apache.flink.runtime.deployment.TaskDeploymentDescriptor) CancelJob(org.apache.flink.runtime.messages.JobManagerMessages.CancelJob) TestingTaskManagerMessages(org.apache.flink.runtime.testingUtils.TestingTaskManagerMessages) TaskInformation(org.apache.flink.runtime.executiongraph.TaskInformation) WaitForAllVerticesToBeRunning(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.WaitForAllVerticesToBeRunning) Deadline(scala.concurrent.duration.Deadline) FiniteDuration(scala.concurrent.duration.FiniteDuration) FileStateHandle(org.apache.flink.runtime.state.filesystem.FileStateHandle) TriggerSavepoint(org.apache.flink.runtime.messages.JobManagerMessages.TriggerSavepoint) ResponseSavepoint(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.ResponseSavepoint) RequestSavepoint(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.RequestSavepoint) DisposeSavepoint(org.apache.flink.runtime.messages.JobManagerMessages.DisposeSavepoint) TriggerSavepointSuccess(org.apache.flink.runtime.messages.JobManagerMessages.TriggerSavepointSuccess) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) JobVertex(org.apache.flink.runtime.jobgraph.JobVertex) DisposeSavepoint(org.apache.flink.runtime.messages.JobManagerMessages.DisposeSavepoint) SubtaskState(org.apache.flink.runtime.checkpoint.SubtaskState) TriggerSavepoint(org.apache.flink.runtime.messages.JobManagerMessages.TriggerSavepoint) File(java.io.File) TaskState(org.apache.flink.runtime.checkpoint.TaskState) JobID(org.apache.flink.api.common.JobID) JavaTestKit(akka.testkit.JavaTestKit) Test(org.junit.Test)

Aggregations

ActorGateway (org.apache.flink.runtime.instance.ActorGateway)6 JobGraph (org.apache.flink.runtime.jobgraph.JobGraph)6 JobVertex (org.apache.flink.runtime.jobgraph.JobVertex)6 WaitForAllVerticesToBeRunning (org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.WaitForAllVerticesToBeRunning)6 Test (org.junit.Test)6 JavaTestKit (akka.testkit.JavaTestKit)5 Configuration (org.apache.flink.configuration.Configuration)4 AkkaActorGateway (org.apache.flink.runtime.instance.AkkaActorGateway)4 TestingCluster (org.apache.flink.runtime.testingUtils.TestingCluster)4 FiniteDuration (scala.concurrent.duration.FiniteDuration)4 JobID (org.apache.flink.api.common.JobID)3 RequestExecutionGraph (org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.RequestExecutionGraph)3 ArrayList (java.util.ArrayList)2 ExecutionGraph (org.apache.flink.runtime.executiongraph.ExecutionGraph)2 ExecutionJobVertex (org.apache.flink.runtime.executiongraph.ExecutionJobVertex)2 JobManagerMessages (org.apache.flink.runtime.messages.JobManagerMessages)2 CancelJob (org.apache.flink.runtime.messages.JobManagerMessages.CancelJob)2 StopJob (org.apache.flink.runtime.messages.JobManagerMessages.StopJob)2 SubmitJob (org.apache.flink.runtime.messages.JobManagerMessages.SubmitJob)2 TestingJobManagerMessages (org.apache.flink.runtime.testingUtils.TestingJobManagerMessages)2