Search in sources :

Example 1 with TriggerStackTraceSample

use of org.apache.flink.runtime.messages.StackTraceSampleMessages.TriggerStackTraceSample in project flink by apache.

the class StackTraceSampleCoordinatorTest method testTriggerStackTraceSample.

/**
 * Tests simple trigger and collect of stack trace samples.
 *
 * <p>Triggers one sample over four running vertices, verifies that every execution attempt
 * was asked for a sample, then feeds the collected traces back one task at a time. The sample
 * future must complete only once the last task has reported, and the completed sample must
 * contain the reported traces for every task.
 */
@Test
public void testTriggerStackTraceSample() throws Exception {
    ExecutionVertex[] vertices = new ExecutionVertex[] { mockExecutionVertex(new ExecutionAttemptID(), ExecutionState.RUNNING, true), mockExecutionVertex(new ExecutionAttemptID(), ExecutionState.RUNNING, true), mockExecutionVertex(new ExecutionAttemptID(), ExecutionState.RUNNING, true), mockExecutionVertex(new ExecutionAttemptID(), ExecutionState.RUNNING, true) };
    int numSamples = 1;
    Time delayBetweenSamples = Time.milliseconds(100L);
    int maxStackTraceDepth = 0;
    Future<StackTraceSample> sampleFuture = coord.triggerStackTraceSample(vertices, numSamples, delayBetweenSamples, maxStackTraceDepth);
    // Verify messages have been sent (the first sample triggered is expected to carry ID 0)
    for (ExecutionVertex vertex : vertices) {
        verify(vertex.getCurrentExecutionAttempt()).requestStackTraceSample(eq(0), eq(numSamples), eq(delayBetweenSamples), eq(maxStackTraceDepth), any(Time.class));
    }
    // Nothing has been collected yet, so the sample must still be pending
    assertFalse(sampleFuture.isDone());
    StackTraceElement[] stackTraceSample = Thread.currentThread().getStackTrace();
    List<StackTraceElement[]> traces = new ArrayList<>();
    traces.add(stackTraceSample);
    traces.add(stackTraceSample);
    traces.add(stackTraceSample);
    // Collect stack traces
    for (int i = 0; i < vertices.length; i++) {
        ExecutionAttemptID executionId = vertices[i].getCurrentExecutionAttempt().getAttemptId();
        coord.collectStackTraces(0, executionId, traces);
        // The future completes exactly when the last outstanding task has reported
        if (i == vertices.length - 1) {
            assertTrue(sampleFuture.isDone());
        } else {
            assertFalse(sampleFuture.isDone());
        }
    }
    // Verify completed stack trace sample
    StackTraceSample sample = sampleFuture.get();
    assertEquals(0, sample.getSampleId());
    assertTrue(sample.getEndTime() >= sample.getStartTime());
    Map<ExecutionAttemptID, List<StackTraceElement[]>> tracesByTask = sample.getStackTraces();
    for (ExecutionVertex vertex : vertices) {
        ExecutionAttemptID executionId = vertex.getCurrentExecutionAttempt().getAttemptId();
        List<StackTraceElement[]> sampleTraces = tracesByTask.get(executionId);
        assertNotNull("Task not found", sampleTraces);
        // assertEquals reports both lists on failure, unlike assertTrue(a.equals(b))
        assertEquals(traces, sampleTraces);
    }
    // Verify no more pending sample
    assertEquals(0, coord.getNumberOfPendingSamples());
    // Verify no error on late collect
    coord.collectStackTraces(0, vertices[0].getCurrentExecutionAttempt().getAttemptId(), traces);
}
Also used : TriggerStackTraceSample(org.apache.flink.runtime.messages.StackTraceSampleMessages.TriggerStackTraceSample) ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) TriggerStackTraceSample(org.apache.flink.runtime.messages.StackTraceSampleMessages.TriggerStackTraceSample) ArrayList(java.util.ArrayList) Time(org.apache.flink.api.common.time.Time) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) ArrayList(java.util.ArrayList) List(java.util.List) Test(org.junit.Test)

Example 2 with TriggerStackTraceSample

use of org.apache.flink.runtime.messages.StackTraceSampleMessages.TriggerStackTraceSample in project flink by apache.

the class TaskManagerTest method testTriggerStackTraceSampleMessage.

// ------------------------------------------------------------------------
// Stack trace sample
// ------------------------------------------------------------------------
/**
 * Tests sampling of task stack traces.
 *
 * <p>A single blocking task is submitted to a TaskManager, after which four scenarios are
 * exercised in order:
 * <ol>
 *   <li>triggering a sample for a non-existing task (expects a failure response),</li>
 *   <li>triggering a sample for the blocking task and looking for its {@code invoke} frame,</li>
 *   <li>triggering a sample with a maximum stack trace depth,</li>
 *   <li>triggering a sample and cancelling the task while sampling is in progress.</li>
 * </ol>
 *
 * <p>Both the JobManager and the TaskManager actors are stopped when the test finishes.
 */
@Test
@SuppressWarnings("unchecked")
public void testTriggerStackTraceSampleMessage() throws Exception {
    new JavaTestKit(system) {

        {
            ActorGateway taskManagerActorGateway = null;
            // We need this to be a JM that answers to update messages for
            // robustness on Travis (if jobs need to be resubmitted in (4)).
            ActorRef jm = system.actorOf(Props.create(new SimpleLookupJobManagerCreator(null)));
            ActorGateway jobManagerActorGateway = new AkkaActorGateway(jm, null);
            final ActorGateway testActorGateway = new AkkaActorGateway(getTestActor(), leaderSessionID);
            try {
                final ActorGateway jobManager = jobManagerActorGateway;
                final ActorGateway taskManager = TestingUtils.createTaskManager(system, jobManager, new Configuration(), true, false);
                // Remember the gateway so that the finally block actually stops the
                // TaskManager; previously the variable stayed null and the actor leaked.
                taskManagerActorGateway = taskManager;
                final JobID jobId = new JobID();
                // Single blocking task
                final TaskDeploymentDescriptor tdd = createTaskDeploymentDescriptor(jobId, "Job", new JobVertexID(), new ExecutionAttemptID(), new SerializedValue<>(new ExecutionConfig()), "Task", 1, 0, 1, 0, new Configuration(), new Configuration(), BlockingNoOpInvokable.class.getName(), Collections.<ResultPartitionDeploymentDescriptor>emptyList(), Collections.<InputGateDeploymentDescriptor>emptyList(), Collections.<BlobKey>emptyList(), Collections.<URL>emptyList(), 0);
                // Submit the task
                new Within(d) {

                    @Override
                    protected void run() {
                        try {
                            // Make sure to register
                            Future<?> connectFuture = taskManager.ask(new TestingTaskManagerMessages.NotifyWhenRegisteredAtJobManager(jobManager.actor()), remaining());
                            Await.ready(connectFuture, remaining());
                            // Ask for the "running" notification before submitting so it cannot be missed
                            Future<Object> taskRunningFuture = taskManager.ask(new TestingTaskManagerMessages.NotifyWhenTaskIsRunning(tdd.getExecutionAttemptId()), timeout);
                            taskManager.tell(new SubmitTask(tdd));
                            Await.ready(taskRunningFuture, d);
                        } catch (Exception e) {
                            e.printStackTrace();
                            fail(e.getMessage());
                        }
                    }
                };
                //
                // 1) Trigger sample for non-existing task
                //
                new Within(d) {

                    @Override
                    protected void run() {
                        try {
                            ExecutionAttemptID taskId = new ExecutionAttemptID();
                            taskManager.tell(new TriggerStackTraceSample(112223, taskId, 100, timeD, 0), testActorGateway);
                            // Receive the expected message (heartbeat races possible)
                            Object[] msg = receiveN(1);
                            while (!(msg[0] instanceof Status.Failure)) {
                                msg = receiveN(1);
                            }
                            Status.Failure response = (Status.Failure) msg[0];
                            assertEquals(IllegalStateException.class, response.cause().getClass());
                        } catch (Exception e) {
                            e.printStackTrace();
                            fail(e.getMessage());
                        }
                    }
                };
                //
                // 2) Trigger sample for the blocking task
                //
                new Within(d) {

                    @Override
                    protected void run() {
                        boolean success = false;
                        Throwable lastError = null;
                        // Retry up to 100 times; any failure is remembered and reported only
                        // if no attempt succeeds.
                        for (int i = 0; i < 100 && !success; i++) {
                            try {
                                int numSamples = 5;
                                taskManager.tell(new TriggerStackTraceSample(19230, tdd.getExecutionAttemptId(), numSamples, Time.milliseconds(100L), 0), testActorGateway);
                                // Receive the expected message (heartbeat races possible)
                                Object[] msg = receiveN(1);
                                while (!(msg[0] instanceof StackTraceSampleResponse)) {
                                    msg = receiveN(1);
                                }
                                StackTraceSampleResponse response = (StackTraceSampleResponse) msg[0];
                                // ---- Verify response ----
                                assertEquals(19230, response.getSampleId());
                                assertEquals(tdd.getExecutionAttemptId(), response.getExecutionAttemptID());
                                List<StackTraceElement[]> traces = response.getSamples();
                                assertEquals("Number of samples", numSamples, traces.size());
                                // NOTE(review): success is never reset per trace, so once one
                                // trace contains the invokable frame the remaining traces pass
                                // the assertion without being matched individually.
                                for (StackTraceElement[] trace : traces) {
                                    // Look for BlockingNoOpInvokable#invoke
                                    for (StackTraceElement elem : trace) {
                                        if (elem.getClassName().equals(BlockingNoOpInvokable.class.getName())) {
                                            assertEquals("invoke", elem.getMethodName());
                                            success = true;
                                            break;
                                        }
                                    }
                                    assertTrue("Unexpected stack trace: " + Arrays.toString(trace), success);
                                }
                            } catch (Throwable t) {
                                lastError = t;
                                LOG.warn("Failed to find invokable.", t);
                            }
                            try {
                                Thread.sleep(100);
                            } catch (InterruptedException e) {
                                LOG.error("Interrupted while sleeping before retry.", e);
                                break;
                            }
                        }
                        if (!success) {
                            if (lastError == null) {
                                fail("Failed to find invokable");
                            } else {
                                fail(lastError.getMessage());
                            }
                        }
                    }
                };
                //
                // 3) Trigger sample for the blocking task with max depth
                //
                new Within(d) {

                    @Override
                    protected void run() {
                        try {
                            int numSamples = 5;
                            int maxDepth = 2;
                            taskManager.tell(new TriggerStackTraceSample(1337, tdd.getExecutionAttemptId(), numSamples, Time.milliseconds(100L), maxDepth), testActorGateway);
                            // Receive the expected message (heartbeat races possible)
                            Object[] msg = receiveN(1);
                            while (!(msg[0] instanceof StackTraceSampleResponse)) {
                                msg = receiveN(1);
                            }
                            StackTraceSampleResponse response = (StackTraceSampleResponse) msg[0];
                            // ---- Verify response ----
                            assertEquals(1337, response.getSampleId());
                            assertEquals(tdd.getExecutionAttemptId(), response.getExecutionAttemptID());
                            List<StackTraceElement[]> traces = response.getSamples();
                            assertEquals("Number of samples", numSamples, traces.size());
                            // Every returned trace must be truncated to exactly maxDepth frames
                            for (StackTraceElement[] trace : traces) {
                                assertEquals("Max depth", maxDepth, trace.length);
                            }
                        } catch (Exception e) {
                            e.printStackTrace();
                            fail(e.getMessage());
                        }
                    }
                };
                //
                // 4) Trigger sample for the blocking task, but cancel it during sampling
                //
                new Within(d) {

                    @Override
                    protected void run() {
                        try {
                            int maxAttempts = 10;
                            int sleepTime = 100;
                            // The sleep before cancelling doubles on every attempt.
                            // NOTE(review): if every attempt ends in the Failure branch the
                            // loop finishes without failing the test - confirm this is intended.
                            for (int i = 0; i < maxAttempts; i++, sleepTime *= 2) {
                                // Trigger many samples in order to cancel the task
                                // during a sample
                                taskManager.tell(new TriggerStackTraceSample(44, tdd.getExecutionAttemptId(), Integer.MAX_VALUE, Time.milliseconds(10L), 0), testActorGateway);
                                Thread.sleep(sleepTime);
                                Future<?> removeFuture = taskManager.ask(new TestingJobManagerMessages.NotifyWhenJobRemoved(jobId), remaining());
                                // Cancel the task
                                taskManager.tell(new CancelTask(tdd.getExecutionAttemptId()));
                                // Receive the expected message (heartbeat races possible)
                                while (true) {
                                    Object[] msg = receiveN(1);
                                    if (msg[0] instanceof StackTraceSampleResponse) {
                                        StackTraceSampleResponse response = (StackTraceSampleResponse) msg[0];
                                        assertEquals(tdd.getExecutionAttemptId(), response.getExecutionAttemptID());
                                        assertEquals(44, response.getSampleId());
                                        // Done
                                        return;
                                    } else if (msg[0] instanceof Failure) {
                                        // NOTE(review): step 1 matches Status.Failure while this
                                        // branch matches the unqualified Failure type - confirm
                                        // which class the file imports under this name.
                                        // Wait for removal before resubmitting
                                        Await.ready(removeFuture, remaining());
                                        Future<?> taskRunningFuture = taskManager.ask(new TestingTaskManagerMessages.NotifyWhenTaskIsRunning(tdd.getExecutionAttemptId()), timeout);
                                        // Resubmit
                                        taskManager.tell(new SubmitTask(tdd));
                                        Await.ready(taskRunningFuture, remaining());
                                        // Retry the sample message
                                        break;
                                    } else {
                                        // Different message
                                        continue;
                                    }
                                }
                            }
                        } catch (Exception e) {
                            e.printStackTrace();
                            fail(e.getMessage());
                        }
                    }
                };
            } finally {
                // taskManagerActorGateway is still null if TaskManager creation itself failed
                TestingUtils.stopActor(taskManagerActorGateway);
                TestingUtils.stopActor(jobManagerActorGateway);
            }
        }
    };
}
Also used : AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) TriggerStackTraceSample(org.apache.flink.runtime.messages.StackTraceSampleMessages.TriggerStackTraceSample) TaskManagerServicesConfiguration(org.apache.flink.runtime.taskexecutor.TaskManagerServicesConfiguration) Configuration(org.apache.flink.configuration.Configuration) ActorRef(akka.actor.ActorRef) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) ExecutionConfig(org.apache.flink.api.common.ExecutionConfig) TestingJobManagerMessages(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) TaskDeploymentDescriptor(org.apache.flink.runtime.deployment.TaskDeploymentDescriptor) SubmitTask(org.apache.flink.runtime.messages.TaskMessages.SubmitTask) StackTraceSampleResponse(org.apache.flink.runtime.messages.StackTraceSampleResponse) CancelTask(org.apache.flink.runtime.messages.TaskMessages.CancelTask) TestingTaskManagerMessages(org.apache.flink.runtime.testingUtils.TestingTaskManagerMessages) Failure(scala.util.Failure) Status(akka.actor.Status) ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) PartitionNotFoundException(org.apache.flink.runtime.io.network.partition.PartitionNotFoundException) IOException(java.io.IOException) BlockingNoOpInvokable(org.apache.flink.runtime.testtasks.BlockingNoOpInvokable) CompletableFuture(org.apache.flink.runtime.concurrent.CompletableFuture) FlinkCompletableFuture(org.apache.flink.runtime.concurrent.impl.FlinkCompletableFuture) Future(scala.concurrent.Future) JavaTestKit(akka.testkit.JavaTestKit) JobID(org.apache.flink.api.common.JobID) Test(org.junit.Test)

Example 3 with TriggerStackTraceSample

use of org.apache.flink.runtime.messages.StackTraceSampleMessages.TriggerStackTraceSample in project flink by apache.

the class TaskManagerTest method testStackTraceSampleFailure.

/**
 * Verifies that a stack trace sample trigger which the TaskManager cannot serve is reported
 * back to the asking party as an exception instead of being silently dropped.
 *
 * <p>The trigger is sent for a freshly created execution attempt ID, and awaiting the reply
 * is expected to throw an {@link IllegalStateException}.
 */
@Test
public void testStackTraceSampleFailure() throws Exception {
    ActorGateway jobManagerGateway = null;
    ActorGateway taskManagerGateway = null;
    try {
        ActorRef jobManagerRef = system.actorOf(Props.create(SimpleJobManager.class, leaderSessionID));
        jobManagerGateway = new AkkaActorGateway(jobManagerRef, leaderSessionID);
        taskManagerGateway = TestingUtils.createTaskManager(system, jobManagerGateway, new Configuration(), true, true);

        // Build the trigger up front, then ask and block on the answer.
        TriggerStackTraceSample trigger = new TriggerStackTraceSample(0, new ExecutionAttemptID(), 0, Time.milliseconds(1L), 0);
        Future<Object> responseFuture = taskManagerGateway.ask(trigger, timeout);
        try {
            Await.result(responseFuture, timeout);
            fail("The trigger stack trace message should have failed.");
        } catch (IllegalStateException expected) {
            // This is the outcome the test is looking for.
        }
    } finally {
        // Stop both actors regardless of the test outcome.
        TestingUtils.stopActor(jobManagerGateway);
        TestingUtils.stopActor(taskManagerGateway);
    }
}
Also used : AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) TriggerStackTraceSample(org.apache.flink.runtime.messages.StackTraceSampleMessages.TriggerStackTraceSample) ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) TaskManagerServicesConfiguration(org.apache.flink.runtime.taskexecutor.TaskManagerServicesConfiguration) Configuration(org.apache.flink.configuration.Configuration) ActorRef(akka.actor.ActorRef) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) Test(org.junit.Test)

Aggregations

ExecutionAttemptID (org.apache.flink.runtime.executiongraph.ExecutionAttemptID)3 TriggerStackTraceSample (org.apache.flink.runtime.messages.StackTraceSampleMessages.TriggerStackTraceSample)3 Test (org.junit.Test)3 ActorRef (akka.actor.ActorRef)2 Configuration (org.apache.flink.configuration.Configuration)2 ActorGateway (org.apache.flink.runtime.instance.ActorGateway)2 AkkaActorGateway (org.apache.flink.runtime.instance.AkkaActorGateway)2 TaskManagerServicesConfiguration (org.apache.flink.runtime.taskexecutor.TaskManagerServicesConfiguration)2 Status (akka.actor.Status)1 JavaTestKit (akka.testkit.JavaTestKit)1 IOException (java.io.IOException)1 ArrayList (java.util.ArrayList)1 List (java.util.List)1 ExecutionConfig (org.apache.flink.api.common.ExecutionConfig)1 JobID (org.apache.flink.api.common.JobID)1 Time (org.apache.flink.api.common.time.Time)1 CompletableFuture (org.apache.flink.runtime.concurrent.CompletableFuture)1 FlinkCompletableFuture (org.apache.flink.runtime.concurrent.impl.FlinkCompletableFuture)1 TaskDeploymentDescriptor (org.apache.flink.runtime.deployment.TaskDeploymentDescriptor)1 ExecutionVertex (org.apache.flink.runtime.executiongraph.ExecutionVertex)1