Search in sources :

Example 1 with StackTraceSampleResponse

use of org.apache.flink.runtime.messages.StackTraceSampleResponse in project flink by apache.

the class TaskManagerTest method testTriggerStackTraceSampleMessage.

// ------------------------------------------------------------------------
// Stack trace sample
// ------------------------------------------------------------------------
/**
 * Tests sampling of task stack traces.
 *
 * <p>A single {@link BlockingNoOpInvokable} task is submitted to a test TaskManager, after
 * which four scenarios are exercised in order:
 * <ol>
 *   <li>sampling an unknown task must be answered with a {@code Status.Failure} whose cause
 *       is an {@link IllegalStateException},</li>
 *   <li>sampling the blocking task must return the requested number of traces, each
 *       containing {@code BlockingNoOpInvokable#invoke},</li>
 *   <li>sampling with a maximum depth must truncate every trace to that depth,</li>
 *   <li>cancelling the task while it is being sampled must still yield a response for the
 *       sample (retrying with a resubmitted task on failure).</li>
 * </ol>
 *
 * @throws Exception if setting up the actors fails
 */
@Test
@SuppressWarnings("unchecked")
public void testTriggerStackTraceSampleMessage() throws Exception {
    new JavaTestKit(system) {

        {
            // Handle used for cleanup in the finally block; assigned once the
            // TaskManager has actually been created.
            ActorGateway taskManagerActorGateway = null;
            // We need this to be a JM that answers to update messages for
            // robustness on Travis (if jobs need to be resubmitted in (4)).
            ActorRef jm = system.actorOf(Props.create(new SimpleLookupJobManagerCreator(null)));
            ActorGateway jobManagerActorGateway = new AkkaActorGateway(jm, null);
            // Gateway wrapping the test actor so that sample responses are delivered to this test kit
            final ActorGateway testActorGateway = new AkkaActorGateway(getTestActor(), leaderSessionID);
            try {
                final ActorGateway jobManager = jobManagerActorGateway;
                final ActorGateway taskManager = TestingUtils.createTaskManager(system, jobManager, new Configuration(), true, false);
                // Remember the TaskManager so that the finally block stops it. Without this
                // assignment the handle stays null and the actor is never stopped.
                taskManagerActorGateway = taskManager;
                final JobID jobId = new JobID();
                // Single blocking task
                final TaskDeploymentDescriptor tdd = createTaskDeploymentDescriptor(jobId, "Job", new JobVertexID(), new ExecutionAttemptID(), new SerializedValue<>(new ExecutionConfig()), "Task", 1, 0, 1, 0, new Configuration(), new Configuration(), BlockingNoOpInvokable.class.getName(), Collections.<ResultPartitionDeploymentDescriptor>emptyList(), Collections.<InputGateDeploymentDescriptor>emptyList(), Collections.<BlobKey>emptyList(), Collections.<URL>emptyList(), 0);
                // Submit the task
                new Within(d) {

                    @Override
                    protected void run() {
                        try {
                            // Make sure to register
                            Future<?> connectFuture = taskManager.ask(new TestingTaskManagerMessages.NotifyWhenRegisteredAtJobManager(jobManager.actor()), remaining());
                            Await.ready(connectFuture, remaining());
                            // Register for the "task is running" notification before submitting,
                            // so the notification cannot be missed.
                            Future<Object> taskRunningFuture = taskManager.ask(new TestingTaskManagerMessages.NotifyWhenTaskIsRunning(tdd.getExecutionAttemptId()), timeout);
                            taskManager.tell(new SubmitTask(tdd));
                            Await.ready(taskRunningFuture, d);
                        } catch (Exception e) {
                            e.printStackTrace();
                            fail(e.getMessage());
                        }
                    }
                };
                //
                // 1) Trigger sample for non-existing task
                //
                new Within(d) {

                    @Override
                    protected void run() {
                        try {
                            // Fresh attempt ID that was never submitted to the TaskManager
                            ExecutionAttemptID taskId = new ExecutionAttemptID();
                            taskManager.tell(new TriggerStackTraceSample(112223, taskId, 100, timeD, 0), testActorGateway);
                            // Receive the expected message (heartbeat races possible)
                            Object[] msg = receiveN(1);
                            while (!(msg[0] instanceof Status.Failure)) {
                                msg = receiveN(1);
                            }
                            Status.Failure response = (Status.Failure) msg[0];
                            assertEquals(IllegalStateException.class, response.cause().getClass());
                        } catch (Exception e) {
                            e.printStackTrace();
                            fail(e.getMessage());
                        }
                    }
                };
                //
                // 2) Trigger sample for the blocking task
                //
                new Within(d) {

                    @Override
                    protected void run() {
                        boolean success = false;
                        Throwable lastError = null;
                        // Retry up to 100 times: a sample may be taken before the task thread
                        // has reached BlockingNoOpInvokable#invoke.
                        for (int i = 0; i < 100 && !success; i++) {
                            try {
                                int numSamples = 5;
                                taskManager.tell(new TriggerStackTraceSample(19230, tdd.getExecutionAttemptId(), numSamples, Time.milliseconds(100L), 0), testActorGateway);
                                // Receive the expected message (heartbeat races possible)
                                Object[] msg = receiveN(1);
                                while (!(msg[0] instanceof StackTraceSampleResponse)) {
                                    msg = receiveN(1);
                                }
                                StackTraceSampleResponse response = (StackTraceSampleResponse) msg[0];
                                // ---- Verify response ----
                                assertEquals(19230, response.getSampleId());
                                assertEquals(tdd.getExecutionAttemptId(), response.getExecutionAttemptID());
                                List<StackTraceElement[]> traces = response.getSamples();
                                assertEquals("Number of samples", numSamples, traces.size());
                                for (StackTraceElement[] trace : traces) {
                                    // Look for BlockingNoOpInvokable#invoke
                                    for (StackTraceElement elem : trace) {
                                        if (elem.getClassName().equals(BlockingNoOpInvokable.class.getName())) {
                                            assertEquals("invoke", elem.getMethodName());
                                            success = true;
                                            break;
                                        }
                                    }
                                    assertTrue("Unexpected stack trace: " + Arrays.toString(trace), success);
                                }
                            } catch (Throwable t) {
                                // Assertion errors are caught as well so that the attempt can be retried
                                lastError = t;
                                LOG.warn("Failed to find invokable.", t);
                            }
                            try {
                                Thread.sleep(100);
                            } catch (InterruptedException e) {
                                LOG.error("Interrupted while sleeping before retry.", e);
                                break;
                            }
                        }
                        if (!success) {
                            if (lastError == null) {
                                fail("Failed to find invokable");
                            } else {
                                fail(lastError.getMessage());
                            }
                        }
                    }
                };
                //
                // 3) Trigger sample for the blocking task with max depth
                //
                new Within(d) {

                    @Override
                    protected void run() {
                        try {
                            int numSamples = 5;
                            int maxDepth = 2;
                            taskManager.tell(new TriggerStackTraceSample(1337, tdd.getExecutionAttemptId(), numSamples, Time.milliseconds(100L), maxDepth), testActorGateway);
                            // Receive the expected message (heartbeat races possible)
                            Object[] msg = receiveN(1);
                            while (!(msg[0] instanceof StackTraceSampleResponse)) {
                                msg = receiveN(1);
                            }
                            StackTraceSampleResponse response = (StackTraceSampleResponse) msg[0];
                            // ---- Verify response ----
                            assertEquals(1337, response.getSampleId());
                            assertEquals(tdd.getExecutionAttemptId(), response.getExecutionAttemptID());
                            List<StackTraceElement[]> traces = response.getSamples();
                            assertEquals("Number of samples", numSamples, traces.size());
                            // Every trace must have been cut down to exactly maxDepth elements
                            for (StackTraceElement[] trace : traces) {
                                assertEquals("Max depth", maxDepth, trace.length);
                            }
                        } catch (Exception e) {
                            e.printStackTrace();
                            fail(e.getMessage());
                        }
                    }
                };
                //
                // 4) Trigger sample for the blocking task, but cancel it during sampling
                //
                new Within(d) {

                    @Override
                    protected void run() {
                        try {
                            int maxAttempts = 10;
                            int sleepTime = 100;
                            // The sleep between triggering and cancelling doubles on every attempt
                            for (int i = 0; i < maxAttempts; i++, sleepTime *= 2) {
                                // Trigger many samples in order to cancel the task
                                // during a sample
                                taskManager.tell(new TriggerStackTraceSample(44, tdd.getExecutionAttemptId(), Integer.MAX_VALUE, Time.milliseconds(10L), 0), testActorGateway);
                                Thread.sleep(sleepTime);
                                // Register for the removal notification before cancelling
                                Future<?> removeFuture = taskManager.ask(new TestingJobManagerMessages.NotifyWhenJobRemoved(jobId), remaining());
                                // Cancel the task
                                taskManager.tell(new CancelTask(tdd.getExecutionAttemptId()));
                                // Receive the expected message (heartbeat races possible)
                                while (true) {
                                    Object[] msg = receiveN(1);
                                    if (msg[0] instanceof StackTraceSampleResponse) {
                                        StackTraceSampleResponse response = (StackTraceSampleResponse) msg[0];
                                        assertEquals(tdd.getExecutionAttemptId(), response.getExecutionAttemptID());
                                        assertEquals(44, response.getSampleId());
                                        // Done
                                        return;
                                    } else if (msg[0] instanceof Failure) {
                                        // NOTE(review): scenario 1 matches on Status.Failure while this
                                        // branch matches on the unqualified Failure type - confirm that
                                        // this is the type the TaskManager actually replies with.
                                        // Wait for removal before resubmitting
                                        Await.ready(removeFuture, remaining());
                                        Future<?> taskRunningFuture = taskManager.ask(new TestingTaskManagerMessages.NotifyWhenTaskIsRunning(tdd.getExecutionAttemptId()), timeout);
                                        // Resubmit
                                        taskManager.tell(new SubmitTask(tdd));
                                        Await.ready(taskRunningFuture, remaining());
                                        // Retry the sample message
                                        break;
                                    } else {
                                        // Different message
                                        continue;
                                    }
                                }
                            }
                        } catch (Exception e) {
                            e.printStackTrace();
                            fail(e.getMessage());
                        }
                    }
                };
            } finally {
                // Stop both actors regardless of the test outcome
                TestingUtils.stopActor(taskManagerActorGateway);
                TestingUtils.stopActor(jobManagerActorGateway);
            }
        }
    };
}
Also used : AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) TriggerStackTraceSample(org.apache.flink.runtime.messages.StackTraceSampleMessages.TriggerStackTraceSample) TaskManagerServicesConfiguration(org.apache.flink.runtime.taskexecutor.TaskManagerServicesConfiguration) Configuration(org.apache.flink.configuration.Configuration) ActorRef(akka.actor.ActorRef) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) ExecutionConfig(org.apache.flink.api.common.ExecutionConfig) TestingJobManagerMessages(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) TaskDeploymentDescriptor(org.apache.flink.runtime.deployment.TaskDeploymentDescriptor) SubmitTask(org.apache.flink.runtime.messages.TaskMessages.SubmitTask) StackTraceSampleResponse(org.apache.flink.runtime.messages.StackTraceSampleResponse) CancelTask(org.apache.flink.runtime.messages.TaskMessages.CancelTask) TestingTaskManagerMessages(org.apache.flink.runtime.testingUtils.TestingTaskManagerMessages) Failure(scala.util.Failure) Status(akka.actor.Status) ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) PartitionNotFoundException(org.apache.flink.runtime.io.network.partition.PartitionNotFoundException) IOException(java.io.IOException) BlockingNoOpInvokable(org.apache.flink.runtime.testtasks.BlockingNoOpInvokable) CompletableFuture(org.apache.flink.runtime.concurrent.CompletableFuture) FlinkCompletableFuture(org.apache.flink.runtime.concurrent.impl.FlinkCompletableFuture) Future(scala.concurrent.Future) JavaTestKit(akka.testkit.JavaTestKit) JobID(org.apache.flink.api.common.JobID) Test(org.junit.Test)

Example 2 with StackTraceSampleResponse

use of org.apache.flink.runtime.messages.StackTraceSampleResponse in project flink by apache.

the class StackTraceSampleCoordinator method triggerStackTraceSample.

/**
 * Requests a stack trace sample from each of the given tasks.
 *
 * <p>All tasks must currently be in state {@code RUNNING}; otherwise an exceptionally
 * completed future is returned and nothing is triggered. The same holds if this
 * coordinator has been shut down.
 *
 * @param tasksToSample       Tasks to sample.
 * @param numSamples          Number of stack trace samples to collect.
 * @param delayBetweenSamples Delay between consecutive samples.
 * @param maxStackTraceDepth  Maximum depth of the stack trace. 0 indicates
 *                            no maximum and keeps the complete stack trace.
 * @return A future of the completed stack trace sample
 */
@SuppressWarnings("unchecked")
public Future<StackTraceSample> triggerStackTraceSample(ExecutionVertex[] tasksToSample, int numSamples, Time delayBetweenSamples, int maxStackTraceDepth) {
    checkNotNull(tasksToSample, "Tasks to sample");
    checkArgument(tasksToSample.length >= 1, "No tasks to sample");
    checkArgument(numSamples >= 1, "No number of samples");
    checkArgument(maxStackTraceDepth >= 0, "Negative maximum stack trace depth");

    final int taskCount = tasksToSample.length;
    // Attempt IDs and executions of the tasks, index-aligned with tasksToSample
    final ExecutionAttemptID[] attemptIds = new ExecutionAttemptID[taskCount];
    final Execution[] runningExecutions = new Execution[taskCount];

    // Verify that every task is RUNNING before anything is triggered.
    // Triggering itself can still fail afterwards.
    for (int idx = 0; idx < taskCount; idx++) {
        final Execution attempt = tasksToSample[idx].getCurrentExecutionAttempt();
        if (attempt == null || attempt.getState() != ExecutionState.RUNNING) {
            return FlinkCompletableFuture.completedExceptionally(new IllegalStateException("Task " + tasksToSample[idx].getTaskNameWithSubtaskIndex() + " is not running."));
        }
        runningExecutions[idx] = attempt;
        attemptIds[idx] = attempt.getAttemptId();
    }

    synchronized (lock) {
        if (isShutDown) {
            return FlinkCompletableFuture.completedExceptionally(new IllegalStateException("Shut down"));
        }

        final int sampleId = sampleIdCounter++;
        LOG.debug("Triggering stack trace sample {}", sampleId);

        final PendingStackTraceSample pendingSample = new PendingStackTraceSample(sampleId, attemptIds);

        // The request timeout covers the expected sampling duration plus the configured
        // sample timeout. No cancel messages are sent to the task managers.
        final long samplingMillis = numSamples * delayBetweenSamples.toMilliseconds();
        final Time requestTimeout = Time.milliseconds(samplingMillis + sampleTimeout);

        // Register the pending sample before any request is sent so that
        // responses always find it.
        pendingSamples.put(sampleId, pendingSample);

        // Collects a successful response, or cancels the whole sample on failure
        final BiFunction<StackTraceSampleResponse, Throwable, Void> responseHandler = new BiFunction<StackTraceSampleResponse, Throwable, Void>() {

            @Override
            public Void apply(StackTraceSampleResponse stackTraceSampleResponse, Throwable throwable) {
                if (stackTraceSampleResponse == null) {
                    cancelStackTraceSample(sampleId, throwable);
                } else {
                    collectStackTraces(stackTraceSampleResponse.getSampleId(), stackTraceSampleResponse.getExecutionAttemptID(), stackTraceSampleResponse.getSamples());
                }
                return null;
            }
        };

        // Trigger all samples
        for (Execution running : runningExecutions) {
            final Future<StackTraceSampleResponse> responseFuture = running.requestStackTraceSample(sampleId, numSamples, delayBetweenSamples, maxStackTraceDepth, requestTimeout);
            responseFuture.handleAsync(responseHandler, executor);
        }

        return pendingSample.getStackTraceSampleFuture();
    }
}
Also used : ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) Time(org.apache.flink.api.common.time.Time) Execution(org.apache.flink.runtime.executiongraph.Execution) StackTraceSampleResponse(org.apache.flink.runtime.messages.StackTraceSampleResponse)

Example 3 with StackTraceSampleResponse

use of org.apache.flink.runtime.messages.StackTraceSampleResponse in project flink by apache.

the class StackTraceSampleCoordinatorTest method mockExecutionVertex.

// ------------------------------------------------------------------------
/**
 * Builds a mocked {@link ExecutionVertex} whose current execution attempt reports the given
 * attempt ID and state.
 *
 * @param executionId attempt ID returned by the mocked execution
 * @param state       execution state returned by the mocked execution
 * @param sendSuccess if {@code true}, stack trace sample requests complete with a mocked
 *                    response; otherwise they complete exceptionally with "Send failed"
 * @return the mocked execution vertex
 */
private ExecutionVertex mockExecutionVertex(ExecutionAttemptID executionId, ExecutionState state, boolean sendSuccess) {
    // Execution attempt with fixed state, ID and an already completed sample future
    final Execution attempt = mock(Execution.class);
    when(attempt.getState()).thenReturn(state);
    when(attempt.getAttemptId()).thenReturn(executionId);
    when(attempt.requestStackTraceSample(anyInt(), anyInt(), any(Time.class), anyInt(), any(Time.class)))
        .thenReturn(sendSuccess
            ? FlinkCompletableFuture.completed(mock(StackTraceSampleResponse.class))
            : FlinkCompletableFuture.<StackTraceSampleResponse>completedExceptionally(new Exception("Send failed")));

    // Vertex exposing the attempt above and a fresh job vertex ID
    final ExecutionVertex mockedVertex = mock(ExecutionVertex.class);
    when(mockedVertex.getCurrentExecutionAttempt()).thenReturn(attempt);
    when(mockedVertex.getJobvertexId()).thenReturn(new JobVertexID());
    return mockedVertex;
}
Also used : Execution(org.apache.flink.runtime.executiongraph.Execution) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) Time(org.apache.flink.api.common.time.Time) StackTraceSampleResponse(org.apache.flink.runtime.messages.StackTraceSampleResponse) TimeoutException(java.util.concurrent.TimeoutException) ExecutionException(java.util.concurrent.ExecutionException) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex)

Example 4 with StackTraceSampleResponse

use of org.apache.flink.runtime.messages.StackTraceSampleResponse in project flink by apache.

the class StackTraceSampleCoordinatorTest method mockExecutionVertexWithTimeout.

/**
 * Builds a mocked {@link ExecutionVertex} whose stack trace sample requests return a future
 * that fails with a {@link TimeoutException} after the given delay.
 *
 * @param executionId              attempt ID returned by the mocked execution
 * @param state                    execution state returned by the mocked execution
 * @param scheduledExecutorService executor used to schedule the delayed failure
 * @param timeout                  delay in milliseconds before the future is failed
 * @return the mocked execution vertex
 */
private ExecutionVertex mockExecutionVertexWithTimeout(ExecutionAttemptID executionId, ExecutionState state, ScheduledExecutorService scheduledExecutorService, int timeout) {
    // Pending future handed out for every sample request
    final CompletableFuture<StackTraceSampleResponse> pendingResponse = new FlinkCompletableFuture<>();

    final Execution attempt = mock(Execution.class);
    when(attempt.getState()).thenReturn(state);
    when(attempt.getAttemptId()).thenReturn(executionId);
    when(attempt.requestStackTraceSample(anyInt(), anyInt(), any(Time.class), anyInt(), any(Time.class))).thenReturn(pendingResponse);

    // Fail the pending future once the delay has elapsed
    final Runnable failWithTimeout = new Runnable() {

        @Override
        public void run() {
            pendingResponse.completeExceptionally(new TimeoutException("Timeout"));
        }
    };
    scheduledExecutorService.schedule(failWithTimeout, timeout, TimeUnit.MILLISECONDS);

    final ExecutionVertex mockedVertex = mock(ExecutionVertex.class);
    when(mockedVertex.getCurrentExecutionAttempt()).thenReturn(attempt);
    when(mockedVertex.getJobvertexId()).thenReturn(new JobVertexID());
    return mockedVertex;
}
Also used : Execution(org.apache.flink.runtime.executiongraph.Execution) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) Time(org.apache.flink.api.common.time.Time) StackTraceSampleResponse(org.apache.flink.runtime.messages.StackTraceSampleResponse) FlinkCompletableFuture(org.apache.flink.runtime.concurrent.impl.FlinkCompletableFuture) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) TimeoutException(java.util.concurrent.TimeoutException)

Aggregations

StackTraceSampleResponse (org.apache.flink.runtime.messages.StackTraceSampleResponse): 4
Time (org.apache.flink.api.common.time.Time): 3
Execution (org.apache.flink.runtime.executiongraph.Execution): 3
JobVertexID (org.apache.flink.runtime.jobgraph.JobVertexID): 3
TimeoutException (java.util.concurrent.TimeoutException): 2
FlinkCompletableFuture (org.apache.flink.runtime.concurrent.impl.FlinkCompletableFuture): 2
ExecutionAttemptID (org.apache.flink.runtime.executiongraph.ExecutionAttemptID): 2
ExecutionVertex (org.apache.flink.runtime.executiongraph.ExecutionVertex): 2
ActorRef (akka.actor.ActorRef): 1
Status (akka.actor.Status): 1
JavaTestKit (akka.testkit.JavaTestKit): 1
IOException (java.io.IOException): 1
ExecutionException (java.util.concurrent.ExecutionException): 1
ExecutionConfig (org.apache.flink.api.common.ExecutionConfig): 1
JobID (org.apache.flink.api.common.JobID): 1
Configuration (org.apache.flink.configuration.Configuration): 1
CompletableFuture (org.apache.flink.runtime.concurrent.CompletableFuture): 1
TaskDeploymentDescriptor (org.apache.flink.runtime.deployment.TaskDeploymentDescriptor): 1
ActorGateway (org.apache.flink.runtime.instance.ActorGateway): 1
AkkaActorGateway (org.apache.flink.runtime.instance.AkkaActorGateway): 1