Search in sources :

Example 1 with DeclineCheckpoint

use of org.apache.flink.runtime.messages.checkpoint.DeclineCheckpoint in project flink by apache.

the class CheckpointCoordinatorTest method testTriggerAndDeclineCheckpointComplex.

/**
 * This test triggers two checkpoints and then sends a decline message from one of the tasks
 * for the first checkpoint. This should discard the first checkpoint while not triggering
 * a new checkpoint because a later checkpoint is already in progress.
 *
 * @throws Exception if any coordinator call fails unexpectedly; the exception is propagated
 *                   unchanged so the test runner reports its type and full stack trace
 */
@Test
public void testTriggerAndDeclineCheckpointComplex() throws Exception {
    final JobID jid = new JobID();
    final long timestamp = System.currentTimeMillis();

    // create some mock Execution vertices that receive the checkpoint trigger messages
    final ExecutionAttemptID attemptID1 = new ExecutionAttemptID();
    final ExecutionAttemptID attemptID2 = new ExecutionAttemptID();
    ExecutionVertex vertex1 = mockExecutionVertex(attemptID1);
    ExecutionVertex vertex2 = mockExecutionVertex(attemptID2);

    // set up the coordinator and validate the initial state
    CheckpointCoordinator coord = new CheckpointCoordinator(
        jid,
        600000,
        600000,
        0,
        Integer.MAX_VALUE,
        ExternalizedCheckpointSettings.none(),
        new ExecutionVertex[] { vertex1, vertex2 },
        new ExecutionVertex[] { vertex1, vertex2 },
        new ExecutionVertex[] { vertex1, vertex2 },
        new StandaloneCheckpointIDCounter(),
        new StandaloneCompletedCheckpointStore(1),
        null,
        Executors.directExecutor());
    assertEquals(0, coord.getNumberOfPendingCheckpoints());
    assertEquals(0, coord.getNumberOfRetainedSuccessfulCheckpoints());
    assertEquals(0, coord.getNumScheduledTasks());

    // trigger the first checkpoint. this should succeed
    assertTrue(coord.triggerCheckpoint(timestamp, false));

    // trigger second checkpoint, should also succeed
    assertTrue(coord.triggerCheckpoint(timestamp + 2, false));

    // validate that we have two pending checkpoints
    assertEquals(2, coord.getNumberOfPendingCheckpoints());
    assertEquals(0, coord.getNumberOfRetainedSuccessfulCheckpoints());
    assertEquals(2, coord.getNumScheduledTasks());

    // NOTE(review): relies on the pending-checkpoint map iterating in trigger order;
    // the timestamp assertions and verify(...) calls below would fail otherwise
    Iterator<Map.Entry<Long, PendingCheckpoint>> it = coord.getPendingCheckpoints().entrySet().iterator();
    long checkpoint1Id = it.next().getKey();
    long checkpoint2Id = it.next().getKey();
    PendingCheckpoint checkpoint1 = coord.getPendingCheckpoints().get(checkpoint1Id);
    PendingCheckpoint checkpoint2 = coord.getPendingCheckpoints().get(checkpoint2Id);

    // the first checkpoint is pending and completely unacknowledged
    assertNotNull(checkpoint1);
    assertEquals(checkpoint1Id, checkpoint1.getCheckpointId());
    assertEquals(timestamp, checkpoint1.getCheckpointTimestamp());
    assertEquals(jid, checkpoint1.getJobId());
    assertEquals(2, checkpoint1.getNumberOfNonAcknowledgedTasks());
    assertEquals(0, checkpoint1.getNumberOfAcknowledgedTasks());
    assertEquals(0, checkpoint1.getTaskStates().size());
    assertFalse(checkpoint1.isDiscarded());
    assertFalse(checkpoint1.isFullyAcknowledged());

    // the second checkpoint is pending and completely unacknowledged
    assertNotNull(checkpoint2);
    assertEquals(checkpoint2Id, checkpoint2.getCheckpointId());
    assertEquals(timestamp + 2, checkpoint2.getCheckpointTimestamp());
    assertEquals(jid, checkpoint2.getJobId());
    assertEquals(2, checkpoint2.getNumberOfNonAcknowledgedTasks());
    assertEquals(0, checkpoint2.getNumberOfAcknowledgedTasks());
    assertEquals(0, checkpoint2.getTaskStates().size());
    assertFalse(checkpoint2.isDiscarded());
    assertFalse(checkpoint2.isFullyAcknowledged());

    // check that the vertices received the trigger checkpoint message
    verify(vertex1.getCurrentExecutionAttempt(), times(1)).triggerCheckpoint(eq(checkpoint1Id), eq(timestamp), any(CheckpointOptions.class));
    verify(vertex2.getCurrentExecutionAttempt(), times(1)).triggerCheckpoint(eq(checkpoint1Id), eq(timestamp), any(CheckpointOptions.class));

    // check that the vertices received the trigger checkpoint message for the second checkpoint
    verify(vertex1.getCurrentExecutionAttempt(), times(1)).triggerCheckpoint(eq(checkpoint2Id), eq(timestamp + 2), any(CheckpointOptions.class));
    verify(vertex2.getCurrentExecutionAttempt(), times(1)).triggerCheckpoint(eq(checkpoint2Id), eq(timestamp + 2), any(CheckpointOptions.class));

    // decline checkpoint from one of the tasks, this should cancel the checkpoint
    coord.receiveDeclineMessage(new DeclineCheckpoint(jid, attemptID1, checkpoint1Id));
    assertTrue(checkpoint1.isDiscarded());

    // validate that we have only one pending checkpoint left
    assertEquals(1, coord.getNumberOfPendingCheckpoints());
    assertEquals(0, coord.getNumberOfRetainedSuccessfulCheckpoints());
    assertEquals(1, coord.getNumScheduledTasks());

    // validate that it is the same second checkpoint from earlier
    long checkpointIdNew = coord.getPendingCheckpoints().entrySet().iterator().next().getKey();
    PendingCheckpoint checkpointNew = coord.getPendingCheckpoints().get(checkpointIdNew);
    assertEquals(checkpoint2Id, checkpointIdNew);
    assertNotNull(checkpointNew);
    assertEquals(checkpointIdNew, checkpointNew.getCheckpointId());
    assertEquals(jid, checkpointNew.getJobId());
    assertEquals(2, checkpointNew.getNumberOfNonAcknowledgedTasks());
    assertEquals(0, checkpointNew.getNumberOfAcknowledgedTasks());
    assertEquals(0, checkpointNew.getTaskStates().size());
    assertFalse(checkpointNew.isDiscarded());
    assertFalse(checkpointNew.isFullyAcknowledged());
    assertNotEquals(checkpoint1.getCheckpointId(), checkpointNew.getCheckpointId());

    // decline again, nothing should happen
    coord.receiveDeclineMessage(new DeclineCheckpoint(jid, attemptID1, checkpoint1Id));
    // decline from the other task, nothing should happen
    coord.receiveDeclineMessage(new DeclineCheckpoint(jid, attemptID2, checkpoint1Id));
    assertTrue(checkpoint1.isDiscarded());

    coord.shutdown(JobStatus.FINISHED);
}
Also used : DeclineCheckpoint(org.apache.flink.runtime.messages.checkpoint.DeclineCheckpoint) ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) JobID(org.apache.flink.api.common.JobID) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) IOException(java.io.IOException) Test(org.junit.Test)

Example 2 with DeclineCheckpoint

use of org.apache.flink.runtime.messages.checkpoint.DeclineCheckpoint in project flink by apache.

the class ActorGatewayCheckpointResponder method declineCheckpoint.

/**
 * Wraps the given arguments in a {@link DeclineCheckpoint} message and hands that message
 * to {@code actorGateway.tell(...)}.
 *
 * @param jobID              job the declined checkpoint belongs to
 * @param executionAttemptID execution attempt that declines the checkpoint
 * @param checkpointId       ID of the declined checkpoint
 * @param reason             cause passed along in the decline message
 */
@Override
public void declineCheckpoint(JobID jobID, ExecutionAttemptID executionAttemptID, long checkpointId, Throwable reason) {
    actorGateway.tell(new DeclineCheckpoint(jobID, executionAttemptID, checkpointId, reason));
}
Also used : DeclineCheckpoint(org.apache.flink.runtime.messages.checkpoint.DeclineCheckpoint)

Example 3 with DeclineCheckpoint

use of org.apache.flink.runtime.messages.checkpoint.DeclineCheckpoint in project flink by apache.

the class CheckpointCoordinatorTest method testTriggerAndDeclineCheckpointSimple.

/**
 * This test triggers a checkpoint and then sends a decline checkpoint message from
 * one of the tasks. The expected behaviour is that said checkpoint is discarded, its
 * scheduled canceller is removed, and no pending checkpoint remains afterwards.
 *
 * @throws Exception if any coordinator call fails unexpectedly; the exception is propagated
 *                   unchanged so the test runner reports its type and full stack trace
 */
@Test
public void testTriggerAndDeclineCheckpointSimple() throws Exception {
    final JobID jid = new JobID();
    final long timestamp = System.currentTimeMillis();

    // create some mock Execution vertices that receive the checkpoint trigger messages
    final ExecutionAttemptID attemptID1 = new ExecutionAttemptID();
    final ExecutionAttemptID attemptID2 = new ExecutionAttemptID();
    ExecutionVertex vertex1 = mockExecutionVertex(attemptID1);
    ExecutionVertex vertex2 = mockExecutionVertex(attemptID2);

    // set up the coordinator and validate the initial state
    CheckpointCoordinator coord = new CheckpointCoordinator(
        jid,
        600000,
        600000,
        0,
        Integer.MAX_VALUE,
        ExternalizedCheckpointSettings.none(),
        new ExecutionVertex[] { vertex1, vertex2 },
        new ExecutionVertex[] { vertex1, vertex2 },
        new ExecutionVertex[] { vertex1, vertex2 },
        new StandaloneCheckpointIDCounter(),
        new StandaloneCompletedCheckpointStore(1),
        null,
        Executors.directExecutor());
    assertEquals(0, coord.getNumberOfPendingCheckpoints());
    assertEquals(0, coord.getNumberOfRetainedSuccessfulCheckpoints());

    // trigger the first checkpoint. this should succeed
    assertTrue(coord.triggerCheckpoint(timestamp, false));

    // validate that we have a pending checkpoint
    assertEquals(1, coord.getNumberOfPendingCheckpoints());
    assertEquals(0, coord.getNumberOfRetainedSuccessfulCheckpoints());

    // we have one task scheduled that will cancel after timeout
    assertEquals(1, coord.getNumScheduledTasks());

    long checkpointId = coord.getPendingCheckpoints().entrySet().iterator().next().getKey();
    PendingCheckpoint checkpoint = coord.getPendingCheckpoints().get(checkpointId);
    assertNotNull(checkpoint);
    assertEquals(checkpointId, checkpoint.getCheckpointId());
    assertEquals(timestamp, checkpoint.getCheckpointTimestamp());
    assertEquals(jid, checkpoint.getJobId());
    assertEquals(2, checkpoint.getNumberOfNonAcknowledgedTasks());
    assertEquals(0, checkpoint.getNumberOfAcknowledgedTasks());
    assertEquals(0, checkpoint.getTaskStates().size());
    assertFalse(checkpoint.isDiscarded());
    assertFalse(checkpoint.isFullyAcknowledged());

    // check that the vertices received the trigger checkpoint message
    verify(vertex1.getCurrentExecutionAttempt()).triggerCheckpoint(checkpointId, timestamp, CheckpointOptions.forFullCheckpoint());
    verify(vertex2.getCurrentExecutionAttempt()).triggerCheckpoint(checkpointId, timestamp, CheckpointOptions.forFullCheckpoint());

    // acknowledge from one of the tasks
    coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, attemptID2, checkpointId));
    assertEquals(1, checkpoint.getNumberOfAcknowledgedTasks());
    assertEquals(1, checkpoint.getNumberOfNonAcknowledgedTasks());
    assertFalse(checkpoint.isDiscarded());
    assertFalse(checkpoint.isFullyAcknowledged());

    // acknowledge the same task again (should not matter)
    coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, attemptID2, checkpointId));
    assertFalse(checkpoint.isDiscarded());
    assertFalse(checkpoint.isFullyAcknowledged());

    // decline checkpoint from the other task, this should cancel the checkpoint
    coord.receiveDeclineMessage(new DeclineCheckpoint(jid, attemptID1, checkpointId));
    assertTrue(checkpoint.isDiscarded());

    // the canceler is also removed
    assertEquals(0, coord.getNumScheduledTasks());

    // validate that we have no new pending checkpoint
    assertEquals(0, coord.getNumberOfPendingCheckpoints());
    assertEquals(0, coord.getNumberOfRetainedSuccessfulCheckpoints());

    // decline again, nothing should happen
    coord.receiveDeclineMessage(new DeclineCheckpoint(jid, attemptID1, checkpointId));
    // decline from the other task, nothing should happen
    coord.receiveDeclineMessage(new DeclineCheckpoint(jid, attemptID2, checkpointId));
    assertTrue(checkpoint.isDiscarded());

    coord.shutdown(JobStatus.FINISHED);
}
Also used : DeclineCheckpoint(org.apache.flink.runtime.messages.checkpoint.DeclineCheckpoint) AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) JobID(org.apache.flink.api.common.JobID) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) IOException(java.io.IOException) Test(org.junit.Test)

Example 4 with DeclineCheckpoint

use of org.apache.flink.runtime.messages.checkpoint.DeclineCheckpoint in project flink by apache.

the class CheckpointCoordinatorTest method testStateCleanupForLateOrUnknownMessages.

/**
 * Tests that late acknowledge checkpoint messages are properly cleaned up. Furthermore it tests
 * that unknown checkpoint messages for the same job are cleaned up as well. In contrast,
 * checkpointing messages from other jobs should not be touched. A late acknowledge
 * message is an acknowledge message which arrives after the checkpoint has been declined.
 *
 * @throws Exception if any coordinator call fails unexpectedly; propagated so the test fails
 */
@Test
public void testStateCleanupForLateOrUnknownMessages() throws Exception {
    final JobID jobId = new JobID();

    // one vertex that is triggered, and two further vertices that only have to acknowledge
    final ExecutionAttemptID triggerAttemptId = new ExecutionAttemptID();
    final ExecutionVertex triggerVertex = mockExecutionVertex(triggerAttemptId);
    final ExecutionAttemptID ackAttemptId1 = new ExecutionAttemptID();
    final ExecutionVertex ackVertex1 = mockExecutionVertex(ackAttemptId1);
    final ExecutionAttemptID ackAttemptId2 = new ExecutionAttemptID();
    final ExecutionVertex ackVertex2 = mockExecutionVertex(ackAttemptId2);

    final long timestamp = 1L;

    CheckpointCoordinator coord = new CheckpointCoordinator(
        jobId,
        20000L,
        20000L,
        0L,
        1,
        ExternalizedCheckpointSettings.none(),
        new ExecutionVertex[] { triggerVertex },
        new ExecutionVertex[] { triggerVertex, ackVertex1, ackVertex2 },
        new ExecutionVertex[0],
        new StandaloneCheckpointIDCounter(),
        new StandaloneCompletedCheckpointStore(1),
        null,
        Executors.directExecutor());

    assertTrue(coord.triggerCheckpoint(timestamp, false));
    assertEquals(1, coord.getNumberOfPendingCheckpoints());

    PendingCheckpoint pendingCheckpoint = coord.getPendingCheckpoints().values().iterator().next();
    long checkpointId = pendingCheckpoint.getCheckpointId();

    SubtaskState triggerSubtaskState = mock(SubtaskState.class);

    // acknowledge the first trigger vertex
    coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jobId, triggerAttemptId, checkpointId, new CheckpointMetrics(), triggerSubtaskState));

    SubtaskState unknownSubtaskState = mock(SubtaskState.class);

    // receive an acknowledge message for an unknown vertex
    coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jobId, new ExecutionAttemptID(), checkpointId, new CheckpointMetrics(), unknownSubtaskState));

    // we should discard acknowledge messages from an unknown vertex belonging to our job
    verify(unknownSubtaskState, times(1)).discardState();

    SubtaskState differentJobSubtaskState = mock(SubtaskState.class);

    // receive an acknowledge message from an unknown job
    coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(new JobID(), new ExecutionAttemptID(), checkpointId, new CheckpointMetrics(), differentJobSubtaskState));

    // we should not interfere with different jobs
    verify(differentJobSubtaskState, never()).discardState();

    // duplicate acknowledge message for the trigger vertex
    coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jobId, triggerAttemptId, checkpointId, new CheckpointMetrics(), triggerSubtaskState));

    // duplicate acknowledge messages for a known vertex should not trigger discarding the state
    verify(triggerSubtaskState, never()).discardState();

    // let the checkpoint fail at the first ack vertex
    coord.receiveDeclineMessage(new DeclineCheckpoint(jobId, ackAttemptId1, checkpointId));
    assertTrue(pendingCheckpoint.isDiscarded());

    // check that we've cleaned up the already acknowledged state
    verify(triggerSubtaskState, times(1)).discardState();

    SubtaskState ackSubtaskState = mock(SubtaskState.class);

    // late acknowledge message from the second ack vertex
    coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jobId, ackAttemptId2, checkpointId, new CheckpointMetrics(), ackSubtaskState));

    // check that we also cleaned up this state
    verify(ackSubtaskState, times(1)).discardState();

    // receive an acknowledge message from an unknown job
    coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(new JobID(), new ExecutionAttemptID(), checkpointId, new CheckpointMetrics(), differentJobSubtaskState));

    // we should not interfere with different jobs
    verify(differentJobSubtaskState, never()).discardState();

    SubtaskState unknownSubtaskState2 = mock(SubtaskState.class);

    // receive an acknowledge message for an unknown vertex
    coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jobId, new ExecutionAttemptID(), checkpointId, new CheckpointMetrics(), unknownSubtaskState2));

    // we should discard acknowledge messages from an unknown vertex belonging to our job
    verify(unknownSubtaskState2, times(1)).discardState();
}
Also used : DeclineCheckpoint(org.apache.flink.runtime.messages.checkpoint.DeclineCheckpoint) ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) JobID(org.apache.flink.api.common.JobID) Test(org.junit.Test)

Example 5 with DeclineCheckpoint

use of org.apache.flink.runtime.messages.checkpoint.DeclineCheckpoint in project flink by apache.

the class JobMaster method declineCheckpoint.

// TODO: This method needs a leader session ID
/**
 * Builds a {@link DeclineCheckpoint} message from the arguments and, if the execution graph
 * has a checkpoint coordinator, submits a task to the RPC service that passes the message to
 * {@code receiveDeclineMessage}. Exceptions thrown by the coordinator are logged, not rethrown.
 * When there is no coordinator, only an error is logged.
 *
 * @param jobID              job the declined checkpoint belongs to
 * @param executionAttemptID execution attempt that declines the checkpoint
 * @param checkpointID       ID of the declined checkpoint
 * @param reason             cause passed along in the decline message
 */
@RpcMethod
public void declineCheckpoint(final JobID jobID, final ExecutionAttemptID executionAttemptID, final long checkpointID, final Throwable reason) {
    final DeclineCheckpoint declineMessage = new DeclineCheckpoint(jobID, executionAttemptID, checkpointID, reason);
    final CheckpointCoordinator coordinator = executionGraph.getCheckpointCoordinator();

    // nothing to forward to when checkpointing is not set up for this job
    if (coordinator == null) {
        log.error("Received DeclineCheckpoint message for job {} with no CheckpointCoordinator", jobGraph.getJobID());
        return;
    }

    getRpcService().execute(new Runnable() {

        @Override
        public void run() {
            try {
                coordinator.receiveDeclineMessage(declineMessage);
            } catch (Exception e) {
                // keep the failure inside the submitted task; just record it
                log.error("Error in CheckpointCoordinator while processing {}", declineMessage, e);
            }
        }
    });
}
Also used : DeclineCheckpoint(org.apache.flink.runtime.messages.checkpoint.DeclineCheckpoint) CheckpointCoordinator(org.apache.flink.runtime.checkpoint.CheckpointCoordinator) TimeoutException(java.util.concurrent.TimeoutException) CheckpointException(org.apache.flink.runtime.checkpoint.CheckpointException) LeaderIdMismatchException(org.apache.flink.runtime.highavailability.LeaderIdMismatchException) PartitionProducerDisposedException(org.apache.flink.runtime.jobmanager.PartitionProducerDisposedException) JobExecutionException(org.apache.flink.runtime.client.JobExecutionException) IOException(java.io.IOException) RpcMethod(org.apache.flink.runtime.rpc.RpcMethod)

Aggregations

DeclineCheckpoint (org.apache.flink.runtime.messages.checkpoint.DeclineCheckpoint)5 IOException (java.io.IOException)3 JobID (org.apache.flink.api.common.JobID)3 ExecutionAttemptID (org.apache.flink.runtime.executiongraph.ExecutionAttemptID)3 ExecutionVertex (org.apache.flink.runtime.executiongraph.ExecutionVertex)3 Test (org.junit.Test)3 AcknowledgeCheckpoint (org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint)2 TimeoutException (java.util.concurrent.TimeoutException)1 CheckpointCoordinator (org.apache.flink.runtime.checkpoint.CheckpointCoordinator)1 CheckpointException (org.apache.flink.runtime.checkpoint.CheckpointException)1 JobExecutionException (org.apache.flink.runtime.client.JobExecutionException)1 LeaderIdMismatchException (org.apache.flink.runtime.highavailability.LeaderIdMismatchException)1 PartitionProducerDisposedException (org.apache.flink.runtime.jobmanager.PartitionProducerDisposedException)1 RpcMethod (org.apache.flink.runtime.rpc.RpcMethod)1