use of org.apache.flink.runtime.messages.checkpoint.DeclineCheckpoint in project flink by apache.
the class CheckpointCoordinatorTest method testTriggerAndDeclineCheckpointComplex.
/**
 * This test triggers two checkpoints and then sends a decline message from one of the tasks
 * for the first checkpoint. This should discard the first checkpoint while not triggering
 * a new checkpoint because a later checkpoint is already in progress.
 *
 * @throws Exception if any call into the coordinator fails; the exception is propagated
 *                   unchanged so the test report keeps its type and stack trace
 */
@Test
public void testTriggerAndDeclineCheckpointComplex() throws Exception {
    final JobID jid = new JobID();
    final long timestamp = System.currentTimeMillis();

    // create some mock Execution vertices that receive the checkpoint trigger messages
    final ExecutionAttemptID attemptID1 = new ExecutionAttemptID();
    final ExecutionAttemptID attemptID2 = new ExecutionAttemptID();
    ExecutionVertex vertex1 = mockExecutionVertex(attemptID1);
    ExecutionVertex vertex2 = mockExecutionVertex(attemptID2);

    // set up the coordinator and validate the initial state
    CheckpointCoordinator coord = new CheckpointCoordinator(
        jid,
        600000,
        600000,
        0,
        Integer.MAX_VALUE,
        ExternalizedCheckpointSettings.none(),
        new ExecutionVertex[] { vertex1, vertex2 },
        new ExecutionVertex[] { vertex1, vertex2 },
        new ExecutionVertex[] { vertex1, vertex2 },
        new StandaloneCheckpointIDCounter(),
        new StandaloneCompletedCheckpointStore(1),
        null,
        Executors.directExecutor());

    assertEquals(0, coord.getNumberOfPendingCheckpoints());
    assertEquals(0, coord.getNumberOfRetainedSuccessfulCheckpoints());
    assertEquals(0, coord.getNumScheduledTasks());

    // trigger the first checkpoint. this should succeed
    assertTrue(coord.triggerCheckpoint(timestamp, false));

    // trigger second checkpoint, should also succeed
    assertTrue(coord.triggerCheckpoint(timestamp + 2, false));

    // validate that both checkpoints are pending and that the test's expectation of
    // one scheduled task per pending checkpoint holds
    assertEquals(2, coord.getNumberOfPendingCheckpoints());
    assertEquals(0, coord.getNumberOfRetainedSuccessfulCheckpoints());
    assertEquals(2, coord.getNumScheduledTasks());

    // the first entry returned by the iterator is treated as the first checkpoint
    Iterator<Map.Entry<Long, PendingCheckpoint>> it = coord.getPendingCheckpoints().entrySet().iterator();
    long checkpoint1Id = it.next().getKey();
    long checkpoint2Id = it.next().getKey();
    PendingCheckpoint checkpoint1 = coord.getPendingCheckpoints().get(checkpoint1Id);
    PendingCheckpoint checkpoint2 = coord.getPendingCheckpoints().get(checkpoint2Id);

    assertNotNull(checkpoint1);
    assertEquals(checkpoint1Id, checkpoint1.getCheckpointId());
    assertEquals(timestamp, checkpoint1.getCheckpointTimestamp());
    assertEquals(jid, checkpoint1.getJobId());
    assertEquals(2, checkpoint1.getNumberOfNonAcknowledgedTasks());
    assertEquals(0, checkpoint1.getNumberOfAcknowledgedTasks());
    assertEquals(0, checkpoint1.getTaskStates().size());
    assertFalse(checkpoint1.isDiscarded());
    assertFalse(checkpoint1.isFullyAcknowledged());

    assertNotNull(checkpoint2);
    assertEquals(checkpoint2Id, checkpoint2.getCheckpointId());
    assertEquals(timestamp + 2, checkpoint2.getCheckpointTimestamp());
    assertEquals(jid, checkpoint2.getJobId());
    assertEquals(2, checkpoint2.getNumberOfNonAcknowledgedTasks());
    assertEquals(0, checkpoint2.getNumberOfAcknowledgedTasks());
    assertEquals(0, checkpoint2.getTaskStates().size());
    assertFalse(checkpoint2.isDiscarded());
    assertFalse(checkpoint2.isFullyAcknowledged());

    // check that the vertices received the trigger checkpoint message
    {
        verify(vertex1.getCurrentExecutionAttempt(), times(1)).triggerCheckpoint(eq(checkpoint1Id), eq(timestamp), any(CheckpointOptions.class));
        verify(vertex2.getCurrentExecutionAttempt(), times(1)).triggerCheckpoint(eq(checkpoint1Id), eq(timestamp), any(CheckpointOptions.class));
    }

    // check that the vertices received the trigger checkpoint message for the second checkpoint
    {
        verify(vertex1.getCurrentExecutionAttempt(), times(1)).triggerCheckpoint(eq(checkpoint2Id), eq(timestamp + 2), any(CheckpointOptions.class));
        verify(vertex2.getCurrentExecutionAttempt(), times(1)).triggerCheckpoint(eq(checkpoint2Id), eq(timestamp + 2), any(CheckpointOptions.class));
    }

    // decline checkpoint from one of the tasks, this should cancel the checkpoint
    coord.receiveDeclineMessage(new DeclineCheckpoint(jid, attemptID1, checkpoint1Id));
    assertTrue(checkpoint1.isDiscarded());

    // validate that we have only one pending checkpoint left
    assertEquals(1, coord.getNumberOfPendingCheckpoints());
    assertEquals(0, coord.getNumberOfRetainedSuccessfulCheckpoints());
    assertEquals(1, coord.getNumScheduledTasks());

    // validate that it is the same second checkpoint from earlier
    long checkpointIdNew = coord.getPendingCheckpoints().entrySet().iterator().next().getKey();
    PendingCheckpoint checkpointNew = coord.getPendingCheckpoints().get(checkpointIdNew);
    assertEquals(checkpoint2Id, checkpointIdNew);

    assertNotNull(checkpointNew);
    assertEquals(checkpointIdNew, checkpointNew.getCheckpointId());
    assertEquals(jid, checkpointNew.getJobId());
    assertEquals(2, checkpointNew.getNumberOfNonAcknowledgedTasks());
    assertEquals(0, checkpointNew.getNumberOfAcknowledgedTasks());
    assertEquals(0, checkpointNew.getTaskStates().size());
    assertFalse(checkpointNew.isDiscarded());
    assertFalse(checkpointNew.isFullyAcknowledged());
    assertNotEquals(checkpoint1.getCheckpointId(), checkpointNew.getCheckpointId());

    // decline again, nothing should happen
    coord.receiveDeclineMessage(new DeclineCheckpoint(jid, attemptID1, checkpoint1Id));

    // decline from the other task, nothing should happen
    coord.receiveDeclineMessage(new DeclineCheckpoint(jid, attemptID2, checkpoint1Id));
    assertTrue(checkpoint1.isDiscarded());

    coord.shutdown(JobStatus.FINISHED);
}
use of org.apache.flink.runtime.messages.checkpoint.DeclineCheckpoint in project flink by apache.
the class ActorGatewayCheckpointResponder method declineCheckpoint.
/**
 * Builds a {@link DeclineCheckpoint} message from the given arguments and hands it to
 * the actor gateway via {@code tell}.
 *
 * @param jobID              ID of the job the declined checkpoint belongs to
 * @param executionAttemptID ID of the execution attempt declining the checkpoint
 * @param checkpointId       ID of the checkpoint being declined
 * @param reason             cause passed along with the decline message
 */
@Override
public void declineCheckpoint(
        JobID jobID,
        ExecutionAttemptID executionAttemptID,
        long checkpointId,
        Throwable reason) {

    actorGateway.tell(new DeclineCheckpoint(jobID, executionAttemptID, checkpointId, reason));
}
use of org.apache.flink.runtime.messages.checkpoint.DeclineCheckpoint in project flink by apache.
the class CheckpointCoordinatorTest method testTriggerAndDeclineCheckpointSimple.
/**
 * This test triggers a checkpoint and then sends a decline checkpoint message from
 * one of the tasks. The test verifies that said checkpoint is discarded, that the
 * scheduled task belonging to it is removed, and that no pending checkpoint remains
 * afterwards. Repeated decline messages for the discarded checkpoint must have no effect.
 *
 * @throws Exception if any call into the coordinator fails; the exception is propagated
 *                   unchanged so the test report keeps its type and stack trace
 */
@Test
public void testTriggerAndDeclineCheckpointSimple() throws Exception {
    final JobID jid = new JobID();
    final long timestamp = System.currentTimeMillis();

    // create some mock Execution vertices that receive the checkpoint trigger messages
    final ExecutionAttemptID attemptID1 = new ExecutionAttemptID();
    final ExecutionAttemptID attemptID2 = new ExecutionAttemptID();
    ExecutionVertex vertex1 = mockExecutionVertex(attemptID1);
    ExecutionVertex vertex2 = mockExecutionVertex(attemptID2);

    // set up the coordinator and validate the initial state
    CheckpointCoordinator coord = new CheckpointCoordinator(
        jid,
        600000,
        600000,
        0,
        Integer.MAX_VALUE,
        ExternalizedCheckpointSettings.none(),
        new ExecutionVertex[] { vertex1, vertex2 },
        new ExecutionVertex[] { vertex1, vertex2 },
        new ExecutionVertex[] { vertex1, vertex2 },
        new StandaloneCheckpointIDCounter(),
        new StandaloneCompletedCheckpointStore(1),
        null,
        Executors.directExecutor());

    assertEquals(0, coord.getNumberOfPendingCheckpoints());
    assertEquals(0, coord.getNumberOfRetainedSuccessfulCheckpoints());

    // trigger the first checkpoint. this should succeed
    assertTrue(coord.triggerCheckpoint(timestamp, false));

    // validate that we have a pending checkpoint
    assertEquals(1, coord.getNumberOfPendingCheckpoints());
    assertEquals(0, coord.getNumberOfRetainedSuccessfulCheckpoints());

    // we have one task scheduled that will cancel after timeout
    assertEquals(1, coord.getNumScheduledTasks());

    long checkpointId = coord.getPendingCheckpoints().entrySet().iterator().next().getKey();
    PendingCheckpoint checkpoint = coord.getPendingCheckpoints().get(checkpointId);

    assertNotNull(checkpoint);
    assertEquals(checkpointId, checkpoint.getCheckpointId());
    assertEquals(timestamp, checkpoint.getCheckpointTimestamp());
    assertEquals(jid, checkpoint.getJobId());
    assertEquals(2, checkpoint.getNumberOfNonAcknowledgedTasks());
    assertEquals(0, checkpoint.getNumberOfAcknowledgedTasks());
    assertEquals(0, checkpoint.getTaskStates().size());
    assertFalse(checkpoint.isDiscarded());
    assertFalse(checkpoint.isFullyAcknowledged());

    // check that the vertices received the trigger checkpoint message
    verify(vertex1.getCurrentExecutionAttempt()).triggerCheckpoint(checkpointId, timestamp, CheckpointOptions.forFullCheckpoint());
    verify(vertex2.getCurrentExecutionAttempt()).triggerCheckpoint(checkpointId, timestamp, CheckpointOptions.forFullCheckpoint());

    // acknowledge from one of the tasks
    coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, attemptID2, checkpointId));
    assertEquals(1, checkpoint.getNumberOfAcknowledgedTasks());
    assertEquals(1, checkpoint.getNumberOfNonAcknowledgedTasks());
    assertFalse(checkpoint.isDiscarded());
    assertFalse(checkpoint.isFullyAcknowledged());

    // acknowledge the same task again (should not matter)
    coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, attemptID2, checkpointId));
    assertFalse(checkpoint.isDiscarded());
    assertFalse(checkpoint.isFullyAcknowledged());

    // decline checkpoint from the other task, this should cancel the checkpoint
    coord.receiveDeclineMessage(new DeclineCheckpoint(jid, attemptID1, checkpointId));
    assertTrue(checkpoint.isDiscarded());

    // the canceler is also removed
    assertEquals(0, coord.getNumScheduledTasks());

    // validate that we have no new pending checkpoint
    assertEquals(0, coord.getNumberOfPendingCheckpoints());
    assertEquals(0, coord.getNumberOfRetainedSuccessfulCheckpoints());

    // decline again, nothing should happen
    coord.receiveDeclineMessage(new DeclineCheckpoint(jid, attemptID1, checkpointId));

    // decline from the other task, nothing should happen
    coord.receiveDeclineMessage(new DeclineCheckpoint(jid, attemptID2, checkpointId));
    assertTrue(checkpoint.isDiscarded());

    coord.shutdown(JobStatus.FINISHED);
}
use of org.apache.flink.runtime.messages.checkpoint.DeclineCheckpoint in project flink by apache.
the class CheckpointCoordinatorTest method testStateCleanupForLateOrUnknownMessages.
/**
 * Tests that late acknowledge checkpoint messages are properly cleaned up. Furthermore it tests
 * that unknown checkpoint messages for the same job are cleaned up as well. In contrast,
 * checkpointing messages from other jobs should not be touched. A late acknowledge
 * message is an acknowledge message which arrives after the checkpoint has been declined.
 *
 * @throws Exception if any call into the coordinator fails; propagated so that the test
 *                   fails with the original exception
 */
@Test
public void testStateCleanupForLateOrUnknownMessages() throws Exception {
    final JobID jobId = new JobID();

    final ExecutionAttemptID triggerAttemptId = new ExecutionAttemptID();
    final ExecutionVertex triggerVertex = mockExecutionVertex(triggerAttemptId);

    final ExecutionAttemptID ackAttemptId1 = new ExecutionAttemptID();
    final ExecutionVertex ackVertex1 = mockExecutionVertex(ackAttemptId1);

    final ExecutionAttemptID ackAttemptId2 = new ExecutionAttemptID();
    final ExecutionVertex ackVertex2 = mockExecutionVertex(ackAttemptId2);

    final long timestamp = 1L;

    // one vertex is passed in the first array; it plus two more in the second
    CheckpointCoordinator coord = new CheckpointCoordinator(
        jobId,
        20000L,
        20000L,
        0L,
        1,
        ExternalizedCheckpointSettings.none(),
        new ExecutionVertex[] { triggerVertex },
        new ExecutionVertex[] { triggerVertex, ackVertex1, ackVertex2 },
        new ExecutionVertex[0],
        new StandaloneCheckpointIDCounter(),
        new StandaloneCompletedCheckpointStore(1),
        null,
        Executors.directExecutor());

    assertTrue(coord.triggerCheckpoint(timestamp, false));
    assertEquals(1, coord.getNumberOfPendingCheckpoints());

    PendingCheckpoint pendingCheckpoint = coord.getPendingCheckpoints().values().iterator().next();
    long checkpointId = pendingCheckpoint.getCheckpointId();

    SubtaskState triggerSubtaskState = mock(SubtaskState.class);

    // acknowledge the first trigger vertex
    coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jobId, triggerAttemptId, checkpointId, new CheckpointMetrics(), triggerSubtaskState));

    SubtaskState unknownSubtaskState = mock(SubtaskState.class);

    // receive an acknowledge message for an unknown vertex
    coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jobId, new ExecutionAttemptID(), checkpointId, new CheckpointMetrics(), unknownSubtaskState));

    // we should discard acknowledge messages from an unknown vertex belonging to our job
    verify(unknownSubtaskState, times(1)).discardState();

    SubtaskState differentJobSubtaskState = mock(SubtaskState.class);

    // receive an acknowledge message from an unknown job
    coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(new JobID(), new ExecutionAttemptID(), checkpointId, new CheckpointMetrics(), differentJobSubtaskState));

    // we should not interfere with different jobs
    verify(differentJobSubtaskState, never()).discardState();

    // duplicate acknowledge message for the trigger vertex
    coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jobId, triggerAttemptId, checkpointId, new CheckpointMetrics(), triggerSubtaskState));

    // duplicate acknowledge messages for a known vertex should not trigger discarding the state
    verify(triggerSubtaskState, never()).discardState();

    // let the checkpoint fail at the first ack vertex
    coord.receiveDeclineMessage(new DeclineCheckpoint(jobId, ackAttemptId1, checkpointId));
    assertTrue(pendingCheckpoint.isDiscarded());

    // check that we've cleaned up the already acknowledged state
    verify(triggerSubtaskState, times(1)).discardState();

    SubtaskState ackSubtaskState = mock(SubtaskState.class);

    // late acknowledge message from the second ack vertex
    coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jobId, ackAttemptId2, checkpointId, new CheckpointMetrics(), ackSubtaskState));

    // check that we also cleaned up this state
    verify(ackSubtaskState, times(1)).discardState();

    // receive an acknowledge message from an unknown job
    coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(new JobID(), new ExecutionAttemptID(), checkpointId, new CheckpointMetrics(), differentJobSubtaskState));

    // we should not interfere with different jobs
    verify(differentJobSubtaskState, never()).discardState();

    SubtaskState unknownSubtaskState2 = mock(SubtaskState.class);

    // receive an acknowledge message for an unknown vertex
    coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jobId, new ExecutionAttemptID(), checkpointId, new CheckpointMetrics(), unknownSubtaskState2));

    // we should discard acknowledge messages from an unknown vertex belonging to our job
    verify(unknownSubtaskState2, times(1)).discardState();
}
use of org.apache.flink.runtime.messages.checkpoint.DeclineCheckpoint in project flink by apache.
the class JobMaster method declineCheckpoint.
// TODO: This method needs a leader session ID
/**
 * Wraps the arguments in a {@link DeclineCheckpoint} message and, if the execution graph
 * has a checkpoint coordinator, submits a task to the RPC service's executor that passes
 * the message to the coordinator. Exceptions thrown by the coordinator are logged inside
 * that task. When there is no coordinator, an error is logged and nothing is submitted.
 *
 * @param jobID              ID of the job the declined checkpoint belongs to
 * @param executionAttemptID ID of the execution attempt declining the checkpoint
 * @param checkpointID       ID of the checkpoint being declined
 * @param reason             cause passed along with the decline message
 */
@RpcMethod
public void declineCheckpoint(
        final JobID jobID,
        final ExecutionAttemptID executionAttemptID,
        final long checkpointID,
        final Throwable reason) {

    // the message is built before the coordinator lookup, matching the established order
    final DeclineCheckpoint declineMessage =
        new DeclineCheckpoint(jobID, executionAttemptID, checkpointID, reason);
    final CheckpointCoordinator coordinator = executionGraph.getCheckpointCoordinator();

    if (coordinator == null) {
        log.error("Received DeclineCheckpoint message for job {} with no CheckpointCoordinator", jobGraph.getJobID());
        return;
    }

    final Runnable deliverDecline = new Runnable() {
        @Override
        public void run() {
            try {
                coordinator.receiveDeclineMessage(declineMessage);
            } catch (Exception e) {
                log.error("Error in CheckpointCoordinator while processing {}", declineMessage, e);
            }
        }
    };
    getRpcService().execute(deliverDecline);
}
Aggregations