
Example 46 with ExecutionVertex

Use of org.apache.flink.runtime.executiongraph.ExecutionVertex in project flink by apache.

In the class StackTraceSampleCoordinatorTest, method testCollectForDiscardedPendingSample.

/** Tests that collecting for a cancelled sample throws no Exception. */
@Test
public void testCollectForDiscardedPendingSample() throws Exception {
    ExecutionVertex[] vertices = new ExecutionVertex[] { mockExecutionVertex(new ExecutionAttemptID(), ExecutionState.RUNNING, true) };
    Future<StackTraceSample> sampleFuture = coord.triggerStackTraceSample(vertices, 1, Time.milliseconds(100L), 0);
    assertFalse(sampleFuture.isDone());
    coord.cancelStackTraceSample(0, null);
    assertTrue(sampleFuture.isDone());
    // Verify no error on late collect
    ExecutionAttemptID executionId = vertices[0].getCurrentExecutionAttempt().getAttemptId();
    coord.collectStackTraces(0, executionId, new ArrayList<StackTraceElement[]>());
}
Also used : ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) TriggerStackTraceSample(org.apache.flink.runtime.messages.StackTraceSampleMessages.TriggerStackTraceSample) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) Test(org.junit.Test)
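
The test above builds its vertex with a mockExecutionVertex helper that the snippet does not include. Below is a minimal sketch of what such a helper could look like, assuming Mockito and the same Flink types already imported by these examples; the real helper in StackTraceSampleCoordinatorTest may handle the sendSuccess flag differently, so treat this as illustrative only.

private ExecutionVertex mockExecutionVertex(ExecutionAttemptID executionId, ExecutionState state, boolean sendSuccess) {
    Execution exec = mock(Execution.class);
    when(exec.getAttemptId()).thenReturn(executionId);
    when(exec.getState()).thenReturn(state);
    // Leave the sample-request future pending on success; fail it up front otherwise.
    // (Assumption: this mirrors how the flag is probably used, not verified against Flink.)
    CompletableFuture<StackTraceSampleResponse> future = new FlinkCompletableFuture<>();
    if (!sendSuccess) {
        future.completeExceptionally(new Exception("Send failed"));
    }
    when(exec.requestStackTraceSample(anyInt(), anyInt(), any(Time.class), anyInt(), any(Time.class))).thenReturn(future);
    ExecutionVertex vertex = mock(ExecutionVertex.class);
    when(vertex.getJobvertexId()).thenReturn(new JobVertexID());
    when(vertex.getCurrentExecutionAttempt()).thenReturn(exec);
    return vertex;
}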

Example 47 with ExecutionVertex

Use of org.apache.flink.runtime.executiongraph.ExecutionVertex in project flink by apache.

In the class StackTraceSampleCoordinatorTest, method mockExecutionVertexWithTimeout.

private ExecutionVertex mockExecutionVertexWithTimeout(ExecutionAttemptID executionId, ExecutionState state, ScheduledExecutorService scheduledExecutorService, int timeout) {
    final CompletableFuture<StackTraceSampleResponse> future = new FlinkCompletableFuture<>();
    Execution exec = mock(Execution.class);
    when(exec.getAttemptId()).thenReturn(executionId);
    when(exec.getState()).thenReturn(state);
    when(exec.requestStackTraceSample(anyInt(), anyInt(), any(Time.class), anyInt(), any(Time.class))).thenReturn(future);
    scheduledExecutorService.schedule(new Runnable() {

        @Override
        public void run() {
            future.completeExceptionally(new TimeoutException("Timeout"));
        }
    }, timeout, TimeUnit.MILLISECONDS);
    ExecutionVertex vertex = mock(ExecutionVertex.class);
    when(vertex.getJobvertexId()).thenReturn(new JobVertexID());
    when(vertex.getCurrentExecutionAttempt()).thenReturn(exec);
    return vertex;
}
Also used : Execution(org.apache.flink.runtime.executiongraph.Execution) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) Time(org.apache.flink.api.common.time.Time) StackTraceSampleResponse(org.apache.flink.runtime.messages.StackTraceSampleResponse) FlinkCompletableFuture(org.apache.flink.runtime.concurrent.impl.FlinkCompletableFuture) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) TimeoutException(java.util.concurrent.TimeoutException)
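
As a rough usage sketch for the helper above (not taken from the Flink tests; the method name, the sleep, and the millisecond values are illustrative assumptions), the stubbed Execution can be exercised directly: once the scheduled delay fires, the sample-request future is already completed exceptionally.

@Test
public void timeoutHelperCompletesExceptionally() throws Exception {
    // Use the JDK's Executors explicitly to avoid confusion with Flink's Executors class.
    ScheduledExecutorService scheduler = java.util.concurrent.Executors.newSingleThreadScheduledExecutor();
    try {
        ExecutionVertex vertex = mockExecutionVertexWithTimeout(new ExecutionAttemptID(), ExecutionState.RUNNING, scheduler, 10);
        // Any concrete arguments match the anyInt()/any(Time.class) stubbing above.
        Future<StackTraceSampleResponse> response = vertex.getCurrentExecutionAttempt().requestStackTraceSample(0, 1, Time.milliseconds(100L), 0, Time.milliseconds(100L));
        // Crude wait for the 10 ms scheduled timeout to fire.
        Thread.sleep(50);
        assertTrue(response.isDone()); // completed exceptionally by the helper
    } finally {
        scheduler.shutdownNow();
    }
}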

Example 48 with ExecutionVertex

Use of org.apache.flink.runtime.executiongraph.ExecutionVertex in project flink by apache.

In the class CheckpointCoordinator, method completePendingCheckpoint.

/**
 * Try to complete the given pending checkpoint.
 *
 * Important: This method should only be called in the checkpoint lock scope.
 *
 * @param pendingCheckpoint to complete
 * @throws CheckpointException if the completion failed
 */
private void completePendingCheckpoint(PendingCheckpoint pendingCheckpoint) throws CheckpointException {
    final long checkpointId = pendingCheckpoint.getCheckpointId();
    CompletedCheckpoint completedCheckpoint = null;
    try {
        // externalize the checkpoint if required
        if (pendingCheckpoint.getProps().externalizeCheckpoint()) {
            completedCheckpoint = pendingCheckpoint.finalizeCheckpointExternalized();
        } else {
            completedCheckpoint = pendingCheckpoint.finalizeCheckpointNonExternalized();
        }
        completedCheckpointStore.addCheckpoint(completedCheckpoint);
        rememberRecentCheckpointId(checkpointId);
        dropSubsumedCheckpoints(checkpointId);
    } catch (Exception exception) {
        // abort the current pending checkpoint if it has not been discarded yet
        if (!pendingCheckpoint.isDiscarded()) {
            pendingCheckpoint.abortError(exception);
        }
        if (completedCheckpoint != null) {
            // we failed to store the completed checkpoint. Let's clean up
            final CompletedCheckpoint cc = completedCheckpoint;
            executor.execute(new Runnable() {

                @Override
                public void run() {
                    try {
                        cc.discard();
                    } catch (Throwable t) {
                        LOG.warn("Could not properly discard completed checkpoint {}.", cc.getCheckpointID(), t);
                    }
                }
            });
        }
        throw new CheckpointException("Could not complete the pending checkpoint " + checkpointId + '.', exception);
    } finally {
        pendingCheckpoints.remove(checkpointId);
        triggerQueuedRequests();
    }
    // record the time when this was completed, to calculate
    // the 'min delay between checkpoints'
    lastCheckpointCompletionNanos = System.nanoTime();
    LOG.info("Completed checkpoint {} ({} bytes in {} ms).", checkpointId, completedCheckpoint.getStateSize(), completedCheckpoint.getDuration());
    if (LOG.isDebugEnabled()) {
        StringBuilder builder = new StringBuilder();
        builder.append("Checkpoint state: ");
        for (TaskState state : completedCheckpoint.getTaskStates().values()) {
            builder.append(state);
            builder.append(", ");
        }
        // Remove last two chars ", "
        builder.setLength(builder.length() - 2);
        LOG.debug(builder.toString());
    }
    // send the "notify complete" call to all vertices
    final long timestamp = completedCheckpoint.getTimestamp();
    for (ExecutionVertex ev : tasksToCommitTo) {
        Execution ee = ev.getCurrentExecutionAttempt();
        if (ee != null) {
            ee.notifyCheckpointComplete(checkpointId, timestamp);
        }
    }
}
Also used : Execution(org.apache.flink.runtime.executiongraph.Execution) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex)
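
The Javadoc's note about the "checkpoint lock scope" refers to the caller. The following is a simplified, hypothetical sketch of such a call site; the method name receiveAcknowledgeMessage is taken from the tests below (Example 50 calls it on the coordinator), but the field name lock and the control flow here are assumptions, not code copied from Flink.

public boolean receiveAcknowledgeMessage(AcknowledgeCheckpoint message) throws CheckpointException {
    synchronized (lock) { // the "checkpoint lock scope" the Javadoc refers to
        PendingCheckpoint checkpoint = pendingCheckpoints.get(message.getCheckpointId());
        if (checkpoint != null && !checkpoint.isDiscarded()) {
            // ... record the acknowledgement and the task's state on the pending checkpoint ...
            if (checkpoint.isFullyAcknowledged()) {
                completePendingCheckpoint(checkpoint); // invoked while still holding the lock
            }
            return true;
        }
        return false;
    }
}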

Example 49 with ExecutionVertex

Use of org.apache.flink.runtime.executiongraph.ExecutionVertex in project flink by apache.

In the class CheckpointCoordinatorTest, method testMinDelayBetweenSavepoints.

/**
 * Tests that no minimum delay between savepoints is enforced.
 */
@Test
public void testMinDelayBetweenSavepoints() throws Exception {
    JobID jobId = new JobID();
    final ExecutionAttemptID attemptID1 = new ExecutionAttemptID();
    ExecutionVertex vertex1 = mockExecutionVertex(attemptID1);
    CheckpointCoordinator coord = new CheckpointCoordinator(
            jobId, 100000, 200000,
            100000000L, // very long min delay => should not affect savepoints
            1, ExternalizedCheckpointSettings.none(),
            new ExecutionVertex[] { vertex1 }, new ExecutionVertex[] { vertex1 }, new ExecutionVertex[] { vertex1 },
            new StandaloneCheckpointIDCounter(), new StandaloneCompletedCheckpointStore(2),
            null, Executors.directExecutor());
    String savepointDir = tmpFolder.newFolder().getAbsolutePath();
    Future<CompletedCheckpoint> savepoint0 = coord.triggerSavepoint(0, savepointDir);
    assertFalse("Did not trigger savepoint", savepoint0.isDone());
    Future<CompletedCheckpoint> savepoint1 = coord.triggerSavepoint(1, savepointDir);
    assertFalse("Did not trigger savepoint", savepoint1.isDone());
}
Also used : ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) JobID(org.apache.flink.api.common.JobID) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) Test(org.junit.Test)
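
Examples 49 and 50 both construct their vertices through a one-argument mockExecutionVertex helper that is not shown on this page. A minimal sketch of what it plausibly looks like, assuming Mockito (the real helper in CheckpointCoordinatorTest may stub additional methods such as the parallelism getters):

private static ExecutionVertex mockExecutionVertex(ExecutionAttemptID attemptID) {
    Execution exec = mock(Execution.class);
    when(exec.getAttemptId()).thenReturn(attemptID);
    when(exec.getState()).thenReturn(ExecutionState.RUNNING);
    ExecutionVertex vertex = mock(ExecutionVertex.class);
    when(vertex.getCurrentExecutionAttempt()).thenReturn(exec);
    when(vertex.getJobvertexId()).thenReturn(new JobVertexID());
    return vertex;
}

Because the current execution attempt is itself a Mockito mock, Example 50 can later verify the triggerCheckpoint call on vertex1.getCurrentExecutionAttempt() directly.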

Example 50 with ExecutionVertex

Use of org.apache.flink.runtime.executiongraph.ExecutionVertex in project flink by apache.

In the class CheckpointCoordinatorTest, method testTriggerAndDeclineCheckpointSimple.

/**
 * This test triggers a checkpoint and then sends a decline checkpoint message from
 * one of the tasks. The expected behaviour is that said checkpoint is discarded and a new
 * checkpoint is triggered.
 */
@Test
public void testTriggerAndDeclineCheckpointSimple() {
    try {
        final JobID jid = new JobID();
        final long timestamp = System.currentTimeMillis();
        // create some mock Execution vertices that receive the checkpoint trigger messages
        final ExecutionAttemptID attemptID1 = new ExecutionAttemptID();
        final ExecutionAttemptID attemptID2 = new ExecutionAttemptID();
        ExecutionVertex vertex1 = mockExecutionVertex(attemptID1);
        ExecutionVertex vertex2 = mockExecutionVertex(attemptID2);
        // set up the coordinator and validate the initial state
        CheckpointCoordinator coord = new CheckpointCoordinator(jid, 600000, 600000, 0, Integer.MAX_VALUE, ExternalizedCheckpointSettings.none(), new ExecutionVertex[] { vertex1, vertex2 }, new ExecutionVertex[] { vertex1, vertex2 }, new ExecutionVertex[] { vertex1, vertex2 }, new StandaloneCheckpointIDCounter(), new StandaloneCompletedCheckpointStore(1), null, Executors.directExecutor());
        assertEquals(0, coord.getNumberOfPendingCheckpoints());
        assertEquals(0, coord.getNumberOfRetainedSuccessfulCheckpoints());
        // trigger the first checkpoint. this should succeed
        assertTrue(coord.triggerCheckpoint(timestamp, false));
        // validate that we have a pending checkpoint
        assertEquals(1, coord.getNumberOfPendingCheckpoints());
        assertEquals(0, coord.getNumberOfRetainedSuccessfulCheckpoints());
        // we have one task scheduled that will cancel after timeout
        assertEquals(1, coord.getNumScheduledTasks());
        long checkpointId = coord.getPendingCheckpoints().entrySet().iterator().next().getKey();
        PendingCheckpoint checkpoint = coord.getPendingCheckpoints().get(checkpointId);
        assertNotNull(checkpoint);
        assertEquals(checkpointId, checkpoint.getCheckpointId());
        assertEquals(timestamp, checkpoint.getCheckpointTimestamp());
        assertEquals(jid, checkpoint.getJobId());
        assertEquals(2, checkpoint.getNumberOfNonAcknowledgedTasks());
        assertEquals(0, checkpoint.getNumberOfAcknowledgedTasks());
        assertEquals(0, checkpoint.getTaskStates().size());
        assertFalse(checkpoint.isDiscarded());
        assertFalse(checkpoint.isFullyAcknowledged());
        // check that the vertices received the trigger checkpoint message
        verify(vertex1.getCurrentExecutionAttempt()).triggerCheckpoint(checkpointId, timestamp, CheckpointOptions.forFullCheckpoint());
        verify(vertex2.getCurrentExecutionAttempt()).triggerCheckpoint(checkpointId, timestamp, CheckpointOptions.forFullCheckpoint());
        CheckpointMetaData checkpointMetaData = new CheckpointMetaData(checkpointId, 0L);
        // acknowledge from one of the tasks
        coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, attemptID2, checkpointId));
        assertEquals(1, checkpoint.getNumberOfAcknowledgedTasks());
        assertEquals(1, checkpoint.getNumberOfNonAcknowledgedTasks());
        assertFalse(checkpoint.isDiscarded());
        assertFalse(checkpoint.isFullyAcknowledged());
        // acknowledge the same task again (should not matter)
        coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, attemptID2, checkpointId));
        assertFalse(checkpoint.isDiscarded());
        assertFalse(checkpoint.isFullyAcknowledged());
        // decline checkpoint from the other task, this should cancel the checkpoint
        // and trigger a new one
        coord.receiveDeclineMessage(new DeclineCheckpoint(jid, attemptID1, checkpointId));
        assertTrue(checkpoint.isDiscarded());
        // the canceler is also removed
        assertEquals(0, coord.getNumScheduledTasks());
        // validate that we have no new pending checkpoint
        assertEquals(0, coord.getNumberOfPendingCheckpoints());
        assertEquals(0, coord.getNumberOfRetainedSuccessfulCheckpoints());
        // decline again, nothing should happen
        // decline from the other task, nothing should happen
        coord.receiveDeclineMessage(new DeclineCheckpoint(jid, attemptID1, checkpointId));
        coord.receiveDeclineMessage(new DeclineCheckpoint(jid, attemptID2, checkpointId));
        assertTrue(checkpoint.isDiscarded());
        coord.shutdown(JobStatus.FINISHED);
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
}
Also used : DeclineCheckpoint(org.apache.flink.runtime.messages.checkpoint.DeclineCheckpoint) AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) JobID(org.apache.flink.api.common.JobID) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) IOException(java.io.IOException) Test(org.junit.Test)
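
The Javadoc above notes that a new checkpoint can be triggered after the decline. As an illustrative continuation that is not part of the original test (it would have to run before the shutdown call), the freed pending slot can be exercised like this:

        // Hypothetical continuation; the timestamp and assertions are illustrative only.
        assertTrue(coord.triggerCheckpoint(System.currentTimeMillis(), false));
        assertEquals(1, coord.getNumberOfPendingCheckpoints());
        coord.shutdown(JobStatus.FINISHED); // then shut down as in the original test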

Aggregations

ExecutionVertex (org.apache.flink.runtime.executiongraph.ExecutionVertex): 65
Test (org.junit.Test): 47
JobID (org.apache.flink.api.common.JobID): 42
ExecutionAttemptID (org.apache.flink.runtime.executiongraph.ExecutionAttemptID): 41
AcknowledgeCheckpoint (org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint): 23
IOException (java.io.IOException): 15
Execution (org.apache.flink.runtime.executiongraph.Execution): 15
JobVertexID (org.apache.flink.runtime.jobgraph.JobVertexID): 15
ExecutionJobVertex (org.apache.flink.runtime.executiongraph.ExecutionJobVertex): 12
DeclineCheckpoint (org.apache.flink.runtime.messages.checkpoint.DeclineCheckpoint): 12
HashMap (java.util.HashMap): 10
ArrayList (java.util.ArrayList): 8
TriggerStackTraceSample (org.apache.flink.runtime.messages.StackTraceSampleMessages.TriggerStackTraceSample): 8
StreamStateHandle (org.apache.flink.runtime.state.StreamStateHandle): 7
ExecutionGraph (org.apache.flink.runtime.executiongraph.ExecutionGraph): 5
IntermediateResultPartition (org.apache.flink.runtime.executiongraph.IntermediateResultPartition): 5
SimpleSlot (org.apache.flink.runtime.instance.SimpleSlot): 5
ResultPartitionID (org.apache.flink.runtime.io.network.partition.ResultPartitionID): 5
KeyGroupRange (org.apache.flink.runtime.state.KeyGroupRange): 5
KeyGroupsStateHandle (org.apache.flink.runtime.state.KeyGroupsStateHandle): 5