Search in sources :

Example 66 with ExecutionGraph

use of org.apache.flink.runtime.executiongraph.ExecutionGraph in project flink by apache.

the class StackTraceSampleCoordinatorITCase method testTaskClearedWhileSampling.

/**
	 * Tests that a cleared task is answered with a partial success response.
	 */
@Test
public void testTaskClearedWhileSampling() throws Exception {
    new JavaTestKit(testActorSystem) {

        {
            final FiniteDuration deadline = new FiniteDuration(60, TimeUnit.SECONDS);
            // The JobGraph
            final JobGraph jobGraph = new JobGraph();
            final int parallelism = 1;
            final JobVertex task = new JobVertex("Task");
            task.setInvokableClass(BlockingNoOpInvokable.class);
            task.setParallelism(parallelism);
            jobGraph.addVertex(task);
            ActorGateway jobManger = null;
            ActorGateway taskManager = null;
            try {
                jobManger = TestingUtils.createJobManager(testActorSystem, TestingUtils.defaultExecutor(), TestingUtils.defaultExecutor(), new Configuration());
                final Configuration config = new Configuration();
                config.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, parallelism);
                taskManager = TestingUtils.createTaskManager(testActorSystem, jobManger, config, true, true);
                final ActorGateway jm = jobManger;
                new Within(deadline) {

                    @Override
                    protected void run() {
                        try {
                            ActorGateway testActor = new AkkaActorGateway(getTestActor(), null);
                            int maxAttempts = 10;
                            int sleepTime = 100;
                            for (int i = 0; i < maxAttempts; i++, sleepTime *= 2) {
                                // Submit the job and wait until it is running
                                JobClient.submitJobDetached(jm, config, jobGraph, deadline, ClassLoader.getSystemClassLoader());
                                jm.tell(new WaitForAllVerticesToBeRunning(jobGraph.getJobID()), testActor);
                                expectMsgEquals(new AllVerticesRunning(jobGraph.getJobID()));
                                // Get the ExecutionGraph
                                jm.tell(new RequestExecutionGraph(jobGraph.getJobID()), testActor);
                                ExecutionGraphFound executionGraphResponse = expectMsgClass(ExecutionGraphFound.class);
                                ExecutionGraph executionGraph = (ExecutionGraph) executionGraphResponse.executionGraph();
                                ExecutionJobVertex vertex = executionGraph.getJobVertex(task.getID());
                                StackTraceSampleCoordinator coordinator = new StackTraceSampleCoordinator(testActorSystem.dispatcher(), 60000);
                                Future<StackTraceSample> sampleFuture = coordinator.triggerStackTraceSample(vertex.getTaskVertices(), // sampling.
                                21474700 * 100, Time.milliseconds(10L), 0);
                                // Wait before cancelling so that some samples
                                // are actually taken.
                                Thread.sleep(sleepTime);
                                // Cancel job
                                scala.concurrent.Future<?> removeFuture = jm.ask(new TestingJobManagerMessages.NotifyWhenJobRemoved(jobGraph.getJobID()), remaining());
                                jm.tell(new JobManagerMessages.CancelJob(jobGraph.getJobID()));
                                try {
                                    // Throws Exception on failure
                                    sampleFuture.get(remaining().toMillis(), TimeUnit.MILLISECONDS);
                                    // partial result.
                                    break;
                                } catch (Throwable t) {
                                // We were too fast in cancelling the job.
                                // Fall through and retry.
                                } finally {
                                    Await.ready(removeFuture, remaining());
                                }
                            }
                        } catch (Exception e) {
                            e.printStackTrace();
                            fail(e.getMessage());
                        }
                    }
                };
            } finally {
                TestingUtils.stopActor(jobManger);
                TestingUtils.stopActor(taskManager);
            }
        }
    };
}
Also used : AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) Configuration(org.apache.flink.configuration.Configuration) AllVerticesRunning(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.AllVerticesRunning) TestingJobManagerMessages(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages) ExecutionJobVertex(org.apache.flink.runtime.executiongraph.ExecutionJobVertex) AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) RequestExecutionGraph(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.RequestExecutionGraph) ExecutionGraphFound(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.ExecutionGraphFound) WaitForAllVerticesToBeRunning(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.WaitForAllVerticesToBeRunning) TestingJobManagerMessages(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages) JobManagerMessages(org.apache.flink.runtime.messages.JobManagerMessages) FiniteDuration(scala.concurrent.duration.FiniteDuration) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) JobVertex(org.apache.flink.runtime.jobgraph.JobVertex) ExecutionJobVertex(org.apache.flink.runtime.executiongraph.ExecutionJobVertex) ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph) RequestExecutionGraph(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.RequestExecutionGraph) JavaTestKit(akka.testkit.JavaTestKit) Test(org.junit.Test)

Example 67 with ExecutionGraph

use of org.apache.flink.runtime.executiongraph.ExecutionGraph in project flink by apache.

the class JobCancellationWithSavepointHandlersTest method testTriggerNewRequest.

/**
	 * Tests triggering a new request and monitoring it.
	 */
@Test
public void testTriggerNewRequest() throws Exception {
    JobID jobId = new JobID();
    ExecutionGraphHolder holder = mock(ExecutionGraphHolder.class);
    ExecutionGraph graph = mock(ExecutionGraph.class);
    CheckpointCoordinator coord = mock(CheckpointCoordinator.class);
    when(holder.getExecutionGraph(eq(jobId), any(ActorGateway.class))).thenReturn(graph);
    when(graph.getCheckpointCoordinator()).thenReturn(coord);
    JobCancellationWithSavepointHandlers handlers = new JobCancellationWithSavepointHandlers(holder, EC);
    JobCancellationWithSavepointHandlers.TriggerHandler trigger = handlers.getTriggerHandler();
    JobCancellationWithSavepointHandlers.InProgressHandler progress = handlers.getInProgressHandler();
    Map<String, String> params = new HashMap<>();
    params.put("jobid", jobId.toString());
    params.put("targetDirectory", "custom-directory");
    ActorGateway jobManager = mock(ActorGateway.class);
    // Successful
    Promise<Object> promise = new Promise.DefaultPromise<>();
    when(jobManager.ask(any(Object.class), any(FiniteDuration.class))).thenReturn(promise);
    // Trigger
    FullHttpResponse response = trigger.handleRequest(params, Collections.<String, String>emptyMap(), jobManager);
    verify(jobManager).ask(eq(new CancelJobWithSavepoint(jobId, "custom-directory")), any(FiniteDuration.class));
    String location = String.format("/jobs/%s/cancel-with-savepoint/in-progress/1", jobId);
    assertEquals(HttpResponseStatus.ACCEPTED, response.getStatus());
    assertEquals("application/json", response.headers().get(HttpHeaders.Names.CONTENT_TYPE));
    assertEquals(Integer.toString(response.content().readableBytes()), response.headers().get(HttpHeaders.Names.CONTENT_LENGTH));
    assertEquals(location, response.headers().get(HttpHeaders.Names.LOCATION));
    String json = response.content().toString(Charset.forName("UTF-8"));
    JsonNode root = new ObjectMapper().readTree(json);
    assertEquals("accepted", root.get("status").getValueAsText());
    assertEquals("1", root.get("request-id").getValueAsText());
    assertEquals(location, root.get("location").getValueAsText());
    // Trigger again
    response = trigger.handleRequest(params, Collections.<String, String>emptyMap(), jobManager);
    assertEquals(HttpResponseStatus.ACCEPTED, response.getStatus());
    assertEquals("application/json", response.headers().get(HttpHeaders.Names.CONTENT_TYPE));
    assertEquals(Integer.toString(response.content().readableBytes()), response.headers().get(HttpHeaders.Names.CONTENT_LENGTH));
    assertEquals(location, response.headers().get(HttpHeaders.Names.LOCATION));
    json = response.content().toString(Charset.forName("UTF-8"));
    root = new ObjectMapper().readTree(json);
    assertEquals("accepted", root.get("status").getValueAsText());
    assertEquals("1", root.get("request-id").getValueAsText());
    assertEquals(location, root.get("location").getValueAsText());
    // Only single actual request
    verify(jobManager).ask(eq(new CancelJobWithSavepoint(jobId, "custom-directory")), any(FiniteDuration.class));
    // Query progress
    params.put("requestId", "1");
    response = progress.handleRequest(params, Collections.<String, String>emptyMap(), jobManager);
    assertEquals(HttpResponseStatus.ACCEPTED, response.getStatus());
    assertEquals("application/json", response.headers().get(HttpHeaders.Names.CONTENT_TYPE));
    assertEquals(Integer.toString(response.content().readableBytes()), response.headers().get(HttpHeaders.Names.CONTENT_LENGTH));
    json = response.content().toString(Charset.forName("UTF-8"));
    root = new ObjectMapper().readTree(json);
    assertEquals("in-progress", root.get("status").getValueAsText());
    assertEquals("1", root.get("request-id").getValueAsText());
    // Complete
    promise.success(new CancellationSuccess(jobId, "_path-savepoint_"));
    response = progress.handleRequest(params, Collections.<String, String>emptyMap(), jobManager);
    assertEquals(HttpResponseStatus.CREATED, response.getStatus());
    assertEquals("application/json", response.headers().get(HttpHeaders.Names.CONTENT_TYPE));
    assertEquals(Integer.toString(response.content().readableBytes()), response.headers().get(HttpHeaders.Names.CONTENT_LENGTH));
    json = response.content().toString(Charset.forName("UTF-8"));
    root = new ObjectMapper().readTree(json);
    assertEquals("success", root.get("status").getValueAsText());
    assertEquals("1", root.get("request-id").getValueAsText());
    assertEquals("_path-savepoint_", root.get("savepoint-path").getValueAsText());
    // Query again, keep recent history
    response = progress.handleRequest(params, Collections.<String, String>emptyMap(), jobManager);
    assertEquals(HttpResponseStatus.CREATED, response.getStatus());
    assertEquals("application/json", response.headers().get(HttpHeaders.Names.CONTENT_TYPE));
    assertEquals(Integer.toString(response.content().readableBytes()), response.headers().get(HttpHeaders.Names.CONTENT_LENGTH));
    json = response.content().toString(Charset.forName("UTF-8"));
    root = new ObjectMapper().readTree(json);
    assertEquals("success", root.get("status").getValueAsText());
    assertEquals("1", root.get("request-id").getValueAsText());
    assertEquals("_path-savepoint_", root.get("savepoint-path").getValueAsText());
    // Query for unknown request
    params.put("requestId", "9929");
    response = progress.handleRequest(params, Collections.<String, String>emptyMap(), jobManager);
    assertEquals(HttpResponseStatus.BAD_REQUEST, response.getStatus());
    assertEquals("application/json", response.headers().get(HttpHeaders.Names.CONTENT_TYPE));
    assertEquals(Integer.toString(response.content().readableBytes()), response.headers().get(HttpHeaders.Names.CONTENT_LENGTH));
    json = response.content().toString(Charset.forName("UTF-8"));
    root = new ObjectMapper().readTree(json);
    assertEquals("failed", root.get("status").getValueAsText());
    assertEquals("9929", root.get("request-id").getValueAsText());
    assertEquals("Unknown job/request ID", root.get("cause").getValueAsText());
}
Also used : HashMap(java.util.HashMap) FiniteDuration(scala.concurrent.duration.FiniteDuration) CancelJobWithSavepoint(org.apache.flink.runtime.messages.JobManagerMessages.CancelJobWithSavepoint) JsonNode(org.codehaus.jackson.JsonNode) ExecutionGraphHolder(org.apache.flink.runtime.webmonitor.ExecutionGraphHolder) CheckpointCoordinator(org.apache.flink.runtime.checkpoint.CheckpointCoordinator) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph) CancellationSuccess(org.apache.flink.runtime.messages.JobManagerMessages.CancellationSuccess) FullHttpResponse(io.netty.handler.codec.http.FullHttpResponse) JobID(org.apache.flink.api.common.JobID) ObjectMapper(org.codehaus.jackson.map.ObjectMapper) Test(org.junit.Test)

Example 68 with ExecutionGraph

use of org.apache.flink.runtime.executiongraph.ExecutionGraph in project flink by apache.

the class CoordinatorShutdownTest method testCoordinatorShutsDownOnFailure.

@Test
public void testCoordinatorShutsDownOnFailure() {
    LocalFlinkMiniCluster cluster = null;
    try {
        Configuration config = new Configuration();
        config.setInteger(ConfigConstants.LOCAL_NUMBER_TASK_MANAGER, 1);
        config.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, 1);
        cluster = new LocalFlinkMiniCluster(config, true);
        cluster.start();
        // build a test graph with snapshotting enabled
        JobVertex vertex = new JobVertex("Test Vertex");
        vertex.setInvokableClass(FailingBlockingInvokable.class);
        List<JobVertexID> vertexIdList = Collections.singletonList(vertex.getID());
        JobGraph testGraph = new JobGraph("test job", vertex);
        testGraph.setSnapshotSettings(new JobSnapshottingSettings(vertexIdList, vertexIdList, vertexIdList, 5000, 60000, 0L, Integer.MAX_VALUE, ExternalizedCheckpointSettings.none(), null, true));
        ActorGateway jmGateway = cluster.getLeaderGateway(TestingUtils.TESTING_DURATION());
        FiniteDuration timeout = new FiniteDuration(60, TimeUnit.SECONDS);
        JobManagerMessages.SubmitJob submitMessage = new JobManagerMessages.SubmitJob(testGraph, ListeningBehaviour.EXECUTION_RESULT);
        // submit is successful, but then the job blocks due to the invokable
        Future<Object> submitFuture = jmGateway.ask(submitMessage, timeout);
        Await.result(submitFuture, timeout);
        // get the execution graph and store the ExecutionGraph reference
        Future<Object> jobRequestFuture = jmGateway.ask(new JobManagerMessages.RequestJob(testGraph.getJobID()), timeout);
        ExecutionGraph graph = (ExecutionGraph) ((JobManagerMessages.JobFound) Await.result(jobRequestFuture, timeout)).executionGraph();
        assertNotNull(graph);
        FailingBlockingInvokable.unblock();
        graph.waitUntilFinished();
        // verify that the coordinator was shut down
        CheckpointCoordinator coord = graph.getCheckpointCoordinator();
        assertTrue(coord == null || coord.isShutdown());
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
    } finally {
        if (cluster != null) {
            cluster.shutdown();
            cluster.awaitTermination();
        }
    }
}
Also used : Configuration(org.apache.flink.configuration.Configuration) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) JobSnapshottingSettings(org.apache.flink.runtime.jobgraph.tasks.JobSnapshottingSettings) JobManagerMessages(org.apache.flink.runtime.messages.JobManagerMessages) FiniteDuration(scala.concurrent.duration.FiniteDuration) LocalFlinkMiniCluster(org.apache.flink.runtime.minicluster.LocalFlinkMiniCluster) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) JobVertex(org.apache.flink.runtime.jobgraph.JobVertex) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph) Test(org.junit.Test)

Example 69 with ExecutionGraph

use of org.apache.flink.runtime.executiongraph.ExecutionGraph in project flink by apache.

the class ExecutionGraphCheckpointCoordinatorTest method testSuspendCheckpointCoordinator.

/**
	 * Tests that a suspended checkpoint coordinator calls suspend on
	 * the store and counter.
	 */
@Test
public void testSuspendCheckpointCoordinator() throws Exception {
    CheckpointIDCounter counter = mock(CheckpointIDCounter.class);
    CompletedCheckpointStore store = mock(CompletedCheckpointStore.class);
    ExecutionGraph graph = createExecutionGraphAndEnableCheckpointing(counter, store);
    graph.suspend(new Exception("Test Exception"));
    // No shutdown
    verify(counter, times(1)).shutdown(Matchers.eq(JobStatus.SUSPENDED));
    verify(store, times(1)).shutdown(Matchers.eq(JobStatus.SUSPENDED));
}
Also used : ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph) Test(org.junit.Test)

Example 70 with ExecutionGraph

use of org.apache.flink.runtime.executiongraph.ExecutionGraph in project flink by apache.

the class LeaderChangeJobRecoveryTest method testNotRestartedWhenLosingLeadership.

/**
	 * Tests that the job is not restarted or at least terminates eventually in case that the
	 * JobManager loses its leadership.
	 *
	 * @throws Exception
	 */
@Test
public void testNotRestartedWhenLosingLeadership() throws Exception {
    UUID leaderSessionID = UUID.randomUUID();
    cluster.grantLeadership(0, leaderSessionID);
    cluster.notifyRetrievalListeners(0, leaderSessionID);
    cluster.waitForTaskManagersToBeRegistered(timeout);
    cluster.submitJobDetached(job);
    ActorGateway jm = cluster.getLeaderGateway(timeout);
    Future<Object> wait = jm.ask(new TestingJobManagerMessages.WaitForAllVerticesToBeRunningOrFinished(job.getJobID()), timeout);
    Await.ready(wait, timeout);
    Future<Object> futureExecutionGraph = jm.ask(new TestingJobManagerMessages.RequestExecutionGraph(job.getJobID()), timeout);
    TestingJobManagerMessages.ResponseExecutionGraph responseExecutionGraph = (TestingJobManagerMessages.ResponseExecutionGraph) Await.result(futureExecutionGraph, timeout);
    assertTrue(responseExecutionGraph instanceof TestingJobManagerMessages.ExecutionGraphFound);
    ExecutionGraph executionGraph = (ExecutionGraph) ((TestingJobManagerMessages.ExecutionGraphFound) responseExecutionGraph).executionGraph();
    TerminalJobStatusListener testListener = new TerminalJobStatusListener();
    executionGraph.registerJobStatusListener(testListener);
    cluster.revokeLeadership();
    testListener.waitForTerminalState(30000);
}
Also used : TestingJobManagerMessages(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages) TerminalJobStatusListener(org.apache.flink.runtime.executiongraph.TerminalJobStatusListener) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph) UUID(java.util.UUID) Test(org.junit.Test)

Aggregations

ExecutionGraph (org.apache.flink.runtime.executiongraph.ExecutionGraph)120 Test (org.junit.Test)96 JobVertexID (org.apache.flink.runtime.jobgraph.JobVertexID)77 ExecutionVertex (org.apache.flink.runtime.executiongraph.ExecutionVertex)53 CheckpointCoordinatorBuilder (org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder)40 ExecutionAttemptID (org.apache.flink.runtime.executiongraph.ExecutionAttemptID)36 AcknowledgeCheckpoint (org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint)35 ExecutionJobVertex (org.apache.flink.runtime.executiongraph.ExecutionJobVertex)31 JobVertex (org.apache.flink.runtime.jobgraph.JobVertex)24 OperatorID (org.apache.flink.runtime.jobgraph.OperatorID)24 HashMap (java.util.HashMap)20 CompletableFuture (java.util.concurrent.CompletableFuture)19 JobID (org.apache.flink.api.common.JobID)19 ArrayList (java.util.ArrayList)17 HashSet (java.util.HashSet)17 JobGraph (org.apache.flink.runtime.jobgraph.JobGraph)17 DeclineCheckpoint (org.apache.flink.runtime.messages.checkpoint.DeclineCheckpoint)17 ExecutionException (java.util.concurrent.ExecutionException)13 Executor (java.util.concurrent.Executor)13 IOException (java.io.IOException)12